diff --git a/.github/workflows/run-checks-all.yml b/.github/workflows/run-checks-all.yml index 3f899903dce8..1bc0a0b564bb 100644 --- a/.github/workflows/run-checks-all.yml +++ b/.github/workflows/run-checks-all.yml @@ -10,7 +10,7 @@ on: push: branches: - 'main' - - 'branch_9x' + - 'branch_10x' env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} diff --git a/.github/workflows/run-checks-gradle-upgrade.yml b/.github/workflows/run-checks-gradle-upgrade.yml index 751c3471b519..07b7210cf4e2 100644 --- a/.github/workflows/run-checks-gradle-upgrade.yml +++ b/.github/workflows/run-checks-gradle-upgrade.yml @@ -6,7 +6,7 @@ on: pull_request: branches: - 'main' - - 'branch_9x' + - 'branch_10x' paths: - '.github/workflows/run-checks-gradle-upgrade.yml' - 'gradle/wrapper/**' @@ -14,7 +14,7 @@ on: push: branches: - 'main' - - 'branch_9x' + - 'branch_10x' paths: - '.github/workflows/run-checks-gradle-upgrade.yml' - 'gradle/wrapper/**' @@ -30,7 +30,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - java-version: [ '22' ] + java-version: [ '23-ea' ] uses-alt-java: [ true, false ] runs-on: ${{ matrix.os }} @@ -61,7 +61,16 @@ jobs: # https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-environment-variable echo "RUNTIME_JAVA_HOME=${{ env.ALT_JAVA_DIR }}" >> "$GITHUB_ENV" - - run: ./gradlew -p lucene/core check -x test + - name: ./gradlew tidy + run: | + ./gradlew tidy + if [ ! -z "$(git status --porcelain)" ]; then + echo ":warning: **tidy left local checkout in modified state**" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + git status --porcelain >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + git reset --hard && git clean -xfd . + fi - name: ./gradlew regenerate run: | @@ -69,7 +78,7 @@ jobs: sudo apt-get install libwww-perl ./gradlew regenerate -x generateUAX29URLEmailTokenizerInternal --rerun-tasks if [ ! -z "$(git status --porcelain)" ]; then - echo ":warning: **regenerateleft local checkout in modified state**" >> $GITHUB_STEP_SUMMARY + echo ":warning: **regenerate left local checkout in modified state**" >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY git status --porcelain >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY @@ -79,8 +88,7 @@ jobs: - run: ./gradlew testOpts - run: ./gradlew helpWorkflow - run: ./gradlew licenses updateLicenses - - run: ./gradlew tidy - - run: ./gradlew check -x test + - run: ./gradlew check -x test -Pvalidation.git.failOnModified=false - run: ./gradlew assembleRelease mavenToLocal # Conserve resources: only run these in non-alt-java mode. 
diff --git a/.github/workflows/run-checks-mod-analysis-common.yml b/.github/workflows/run-checks-mod-analysis-common.yml index df83212757ce..a208039a99fa 100644 --- a/.github/workflows/run-checks-mod-analysis-common.yml +++ b/.github/workflows/run-checks-mod-analysis-common.yml @@ -6,7 +6,7 @@ on: pull_request: branches: - 'main' - - 'branch_9x' + - 'branch_10x' paths: - '.github/workflows/run-checks-mod-analysis-common.yml' - 'lucene/analysis/common/**' @@ -14,7 +14,7 @@ on: push: branches: - 'main' - - 'branch_9x' + - 'branch_10x' paths: - '.github/workflows/run-checks-mod-analysis-common.yml' - 'lucene/analysis/common/**' diff --git a/.github/workflows/run-checks-mod-distribution.tests.yml b/.github/workflows/run-checks-mod-distribution.tests.yml index 497382d75790..e3af5812c80c 100644 --- a/.github/workflows/run-checks-mod-distribution.tests.yml +++ b/.github/workflows/run-checks-mod-distribution.tests.yml @@ -6,12 +6,12 @@ on: pull_request: branches: - 'main' - - 'branch_9x' + - 'branch_10x' push: branches: - 'main' - - 'branch_9x' + - 'branch_10x' env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} diff --git a/.github/workflows/run-nightly-smoketester.yml b/.github/workflows/run-nightly-smoketester.yml index c0987afc8aa5..3749641b5f8f 100644 --- a/.github/workflows/run-nightly-smoketester.yml +++ b/.github/workflows/run-nightly-smoketester.yml @@ -18,7 +18,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - java-version: [ '21', '22' ] + java-version: [ '21', '22', '23-ea' ] runs-on: ${{ matrix.os }} @@ -72,3 +72,4 @@ jobs: name: smoke-tester-logs-jdk-${{ matrix.java-version }} path: | ${{ env.TMP_DIR }}/**/*.log + /tmp/release.log diff --git a/NOTICE.txt b/NOTICE.txt index ea6903484c0c..4b758e824d10 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1,5 +1,5 @@ Apache Lucene -Copyright 2001-2022 The Apache Software Foundation +Copyright 2001-2024 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). diff --git a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ProfileResults.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ProfileResults.java index 5f0e93316646..b5ab5abde64d 100644 --- a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ProfileResults.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ProfileResults.java @@ -41,7 +41,7 @@ */ public class ProfileResults { /** Formats a frame to a formatted line. This is deduplicated on! 
*/ - static String frameToString(RecordedFrame frame, boolean lineNumbers) { + static String frameToString(RecordedFrame frame, boolean lineNumbers, boolean frameTypes) { StringBuilder builder = new StringBuilder(); RecordedMethod method = frame.getMethod(); RecordedClass clazz = method.getType(); @@ -55,13 +55,14 @@ static String frameToString(RecordedFrame frame, boolean lineNumbers) { builder.append("#"); builder.append(method.getName()); builder.append("()"); - if (lineNumbers) { + if (lineNumbers && frame.getLineNumber() != -1) { builder.append(":"); - if (frame.getLineNumber() == -1) { - builder.append("(" + frame.getType() + " code)"); - } else { - builder.append(frame.getLineNumber()); - } + builder.append(frame.getLineNumber()); + } + if (clazz != null && frameTypes) { + builder.append(" ["); + builder.append(frame.getType()); + builder.append(" code]"); } return builder.toString(); } @@ -77,6 +78,8 @@ static String frameToString(RecordedFrame frame, boolean lineNumbers) { public static final String COUNT_DEFAULT = "10"; public static final String LINENUMBERS_KEY = "tests.profile.linenumbers"; public static final String LINENUMBERS_DEFAULT = "false"; + public static final String FRAMETYPES_KEY = "tests.profile.frametypes"; + public static final String FRAMETYPES_DEFAULT = "true"; /** * Driver method, for testing standalone. @@ -92,7 +95,8 @@ public static void main(String[] args) throws IOException { System.getProperty(MODE_KEY, MODE_DEFAULT), Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)), Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)), - Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT))); + Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)), + Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT))); } /** true if we care about this event */ @@ -152,7 +156,12 @@ private static String pad(String input) { /** Process all the JFR files passed in args and print a merged summary. 
*/ public static void printReport( - List files, String mode, int stacksize, int count, boolean lineNumbers) + List files, + String mode, + int stacksize, + int count, + boolean lineNumbers, + boolean frameTypes) throws IOException { if (!"cpu".equals(mode) && !"heap".equals(mode)) { throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)"); @@ -181,7 +190,7 @@ public static void printReport( if (stack.length() > 0) { stack.append("\n").append(framePadding).append(" at "); } - stack.append(frameToString(trace.getFrames().get(i), lineNumbers)); + stack.append(frameToString(trace.getFrames().get(i), lineNumbers, frameTypes)); } String line = stack.toString(); SimpleEntry entry = diff --git a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java index aaf7059bb5fb..d084243b2a5d 100644 --- a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java @@ -60,8 +60,8 @@ public static void main(String[] args) { public static void checkVersion() { int major = Runtime.version().feature(); - if (major != 21 && major != 22) { - throw new IllegalStateException("java version must be 21 or 22, your version: " + major); + if (major != 21 && major != 22 && major != 23) { + throw new IllegalStateException("java version must be 21, 22 or 23, your version: " + major); } } diff --git a/build-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java b/build-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java index d37c2bb1ea95..c5d776457b62 100644 --- a/build-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java +++ b/build-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java @@ -231,8 +231,8 @@ private void check(Element element) { case PACKAGE: checkComment(element); break; - // class-like elements, check them, then recursively check their children (fields and - // methods) + // class-like elements, check them, then recursively check their children (fields and + // methods) case CLASS: case INTERFACE: case ENUM: @@ -257,7 +257,7 @@ && level(element) >= METHOD) { } } break; - // method-like elements, check them if we are configured to do so + // method-like elements, check them if we are configured to do so case METHOD: case CONSTRUCTOR: case FIELD: diff --git a/build.gradle b/build.gradle index 6705923d79d0..81e61a35f13e 100644 --- a/build.gradle +++ b/build.gradle @@ -41,7 +41,7 @@ apply from: file('gradle/globals.gradle') // Calculate project version: version = { // Release manager: update base version here after release: - String baseVersion = '10.0.0' + String baseVersion = '11.0.0' // On a release explicitly set release version in one go: // -Dversion.release=x.y.z @@ -80,6 +80,9 @@ ext { // Minimum Java version required to compile and run Lucene. minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get()) + // also change this in extractor tool: ExtractForeignAPI + vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22, JavaVersion.VERSION_23 ] as Set + // snapshot build marker used in scripts. snapshotBuild = version.contains("SNAPSHOT") @@ -117,10 +120,6 @@ apply from: file('gradle/generation/local-settings.gradle') // Make sure the build environment is consistent. 
apply from: file('gradle/validation/check-environment.gradle') -// IDE support, settings and specials. -apply from: file('gradle/ide/intellij-idea.gradle') -apply from: file('gradle/ide/eclipse.gradle') - // Set up defaults and configure aspects for certain modules or functionality // (java, tests) apply from: file('gradle/java/folder-layout.gradle') @@ -133,6 +132,10 @@ apply from: file('gradle/testing/alternative-jdk-support.gradle') apply from: file('gradle/java/jar-manifest.gradle') apply from: file('gradle/java/modules.gradle') +// IDE support, settings and specials. +apply from: file('gradle/ide/intellij-idea.gradle') +apply from: file('gradle/ide/eclipse.gradle') + // Maven artifact publishing. apply from: file('gradle/maven/publications.gradle') diff --git a/dev-docs/working-between-major-versions.adoc b/dev-docs/working-between-major-versions.adoc index 418247406834..0a42299f252f 100644 --- a/dev-docs/working-between-major-versions.adoc +++ b/dev-docs/working-between-major-versions.adoc @@ -51,7 +51,7 @@ cd lucene git clone git@github.com:apache/lucene.git main cd main # For each branch that you want a separate directory created for, add a worktree -git worktree add ../9x branch_9x +git worktree add ../10x branch_10x ---- === Using the Worktrees diff --git a/dev-tools/doap/lucene.rdf b/dev-tools/doap/lucene.rdf index 7c400eb545bb..afc39dea5a36 100644 --- a/dev-tools/doap/lucene.rdf +++ b/dev-tools/doap/lucene.rdf @@ -67,13 +67,27 @@ + + + lucene-10.0.0 + 2024-10-14 + 10.0.0 + + + + + lucene-9.12.0 + 2024-09-28 + 9.12.0 + + lucene-9.11.1 2024-06-27 9.11.1 - . + lucene-9.11.0 @@ -186,6 +200,13 @@ 9.0.0 + + + lucene-8.11.4 + 2024-09-24 + 8.11.4 + + lucene-8.11.3 diff --git a/dev-tools/scripts/addBackcompatIndexes.py b/dev-tools/scripts/addBackcompatIndexes.py index 80272ec0f0c1..3056c8268d4a 100755 --- a/dev-tools/scripts/addBackcompatIndexes.py +++ b/dev-tools/scripts/addBackcompatIndexes.py @@ -40,7 +40,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp 'cfs': 'index', 'nocfs': 'index', 'sorted': 'sorted', - 'int8_hnsw': 'int8_hnsw', + 'int7_hnsw': 'int7_hnsw', 'moreterms': 'moreterms', 'dvupdates': 'dvupdates', 'emptyIndex': 'empty' @@ -61,7 +61,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp 'cfs': 'testCreateCFS', 'nocfs': 'testCreateNoCFS', 'sorted': 'testCreateSortedIndex', - 'int8_hnsw': 'testCreateInt8HNSWIndices', + 'int7_hnsw': 'testCreateInt7HNSWIndices', 'moreterms': 'testCreateMoreTermsIndex', 'dvupdates': 'testCreateIndexWithDocValuesUpdates', 'emptyIndex': 'testCreateEmptyIndex' @@ -206,7 +206,7 @@ def main(): current_version = scriptutil.Version.parse(scriptutil.find_current_version()) create_and_add_index(source, 'cfs', c.version, current_version, c.temp_dir) create_and_add_index(source, 'nocfs', c.version, current_version, c.temp_dir) - create_and_add_index(source, 'int8_hnsw', c.version, current_version, c.temp_dir) + create_and_add_index(source, 'int7_hnsw', c.version, current_version, c.temp_dir) should_make_sorted = current_version.is_back_compat_with(c.version) \ and (c.version.major > 6 or (c.version.major == 6 and c.version.minor >= 2)) if should_make_sorted: diff --git a/dev-tools/scripts/buildAndPushRelease.py b/dev-tools/scripts/buildAndPushRelease.py index ccf7e0bd8160..8985d77cec21 100755 --- a/dev-tools/scripts/buildAndPushRelease.py +++ b/dev-tools/scripts/buildAndPushRelease.py @@ -112,8 +112,10 @@ def prepare(root, version, pause_before_sign, gpg_key_id, gpg_password, gpg_home 
checkDOAPfiles(version) if not dev_mode: - print(' ./gradlew --no-daemon clean check') - run('./gradlew --no-daemon clean check') + print(' ./gradlew --stacktrace --no-daemon clean') + run('./gradlew --stacktrace --no-daemon clean') + print(' ./gradlew --stacktrace --no-daemon check') + run('./gradlew --stacktrace --no-daemon check') else: print(' skipping precommit check due to dev-mode') @@ -121,7 +123,7 @@ def prepare(root, version, pause_before_sign, gpg_key_id, gpg_password, gpg_home input("Tests complete! Please press ENTER to proceed to assembleRelease: ") print(' prepare-release') - cmd = './gradlew --no-daemon assembleRelease' \ + cmd = './gradlew --stacktrace --no-daemon assembleRelease' \ ' -Dversion.release=%s' % version if dev_mode: cmd += ' -Pvalidation.git.failOnModified=false' diff --git a/dev-tools/scripts/releaseWizard.py b/dev-tools/scripts/releaseWizard.py index 562abf8f6e76..d599095619d4 100755 --- a/dev-tools/scripts/releaseWizard.py +++ b/dev-tools/scripts/releaseWizard.py @@ -239,7 +239,7 @@ def maybe_remove_rc_from_svn(): logfile="svn_rm.log", tee=True, vars={ - 'dist_folder': """lucene-{{ release_version }}-RC{{ rc_number }}-rev{{ build_rc.git_rev | default("", True) }}""", + 'dist_folder': """lucene-{{ release_version }}-RC{{ rc_number }}-rev-{{ build_rc.git_rev | default("", True) }}""", 'dist_url': "{{ dist_url_base }}/{{ dist_folder }}" } )], diff --git a/gradle/documentation/render-javadoc.gradle b/gradle/documentation/render-javadoc.gradle index ddfb68a1b1cf..a161fbc24dd1 100644 --- a/gradle/documentation/render-javadoc.gradle +++ b/gradle/documentation/render-javadoc.gradle @@ -32,7 +32,7 @@ allprojects { missingdoclet "org.apache.lucene.tools:missing-doclet" } - ext { + project.ext { relativeDocPath = project.path.replaceFirst(/:\w+:/, "").replace(':', '/') } diff --git a/gradle/generation/extract-jdk-apis.gradle b/gradle/generation/extract-jdk-apis.gradle index 3c8e1efa4473..3adde87da838 100644 --- a/gradle/generation/extract-jdk-apis.gradle +++ b/gradle/generation/extract-jdk-apis.gradle @@ -17,13 +17,6 @@ def resources = scriptResources(buildscript) -configure(rootProject) { - ext { - // also change this in extractor tool: ExtractForeignAPI - vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22 ] as Set - } -} - configure(project(":lucene:core")) { ext { apijars = layout.projectDirectory.dir("src/generated/jdk") diff --git a/gradle/generation/forUtil.gradle b/gradle/generation/forUtil.gradle index 5de1d850a023..b55fd0204fd3 100644 --- a/gradle/generation/forUtil.gradle +++ b/gradle/generation/forUtil.gradle @@ -23,7 +23,7 @@ configure(project(":lucene:core")) { description "Regenerate gen_ForUtil.py" group "generation" - def genDir = file("src/java/org/apache/lucene/codecs/lucene912") + def genDir = file("src/java/org/apache/lucene/codecs/lucene101") def genScript = file("${genDir}/gen_ForUtil.py") def genOutput = file("${genDir}/ForUtil.java") @@ -43,6 +43,32 @@ configure(project(":lucene:core")) { andThenTasks: ["spotlessJava", "spotlessJavaApply"], mustRunBefore: [ "compileJava" ] ]) + + task generateForDeltaUtilInternal() { + description "Regenerate gen_ForDeltaUtil.py" + group "generation" + + def genDir = file("src/java/org/apache/lucene/codecs/lucene101") + def genScript = file("${genDir}/gen_ForDeltaUtil.py") + def genOutput = file("${genDir}/ForDeltaUtil.java") + + inputs.file genScript + outputs.file genOutput + + doLast { + quietExec { + workingDir genDir + executable project.externalTool("python3") + args = [ '-B', 
genScript ] + } + } + } + + regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtilInternal, [ + andThenTasks: ["spotlessJava", "spotlessJavaApply"], + mustRunBefore: [ "compileJava" ] + ]) + } configure(project(":lucene:backward-codecs")) { @@ -121,5 +147,55 @@ configure(project(":lucene:backward-codecs")) { andThenTasks: ["spotlessJava", "spotlessJavaApply"], mustRunBefore: [ "compileJava" ] ]) + + task generateForUtil912Internal() { + description "Regenerate gen_ForUtil.py" + group "generation" + + def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene912") + def genScript = file("${genDir}/gen_ForUtil.py") + def genOutput = file("${genDir}/ForUtil.java") + + inputs.file genScript + outputs.file genOutput + + doLast { + quietExec { + workingDir genDir + executable project.externalTool("python3") + args = [ '-B', genScript ] + } + } + } + + regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil912Internal, [ + andThenTasks: ["spotlessJava", "spotlessJavaApply"], + mustRunBefore: [ "compileJava" ] + ]) + + task generateForDeltaUtil912Internal() { + description "Regenerate gen_ForDeltaUtil.py" + group "generation" + + def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene912") + def genScript = file("${genDir}/gen_ForDeltaUtil.py") + def genOutput = file("${genDir}/ForDeltaUtil.java") + + inputs.file genScript + outputs.file genOutput + + doLast { + quietExec { + workingDir genDir + executable project.externalTool("python3") + args = [ '-B', genScript ] + } + } + } + + regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtil912Internal, [ + andThenTasks: ["spotlessJava", "spotlessJavaApply"], + mustRunBefore: [ "compileJava" ] + ]) } diff --git a/gradle/generation/icu.gradle b/gradle/generation/icu.gradle index 6e3d5f9062f6..fa254d4a1bb7 100644 --- a/gradle/generation/icu.gradle +++ b/gradle/generation/icu.gradle @@ -65,10 +65,8 @@ configure(project(":lucene:analysis:icu")) { icupkg = file("${icuBinDir}/icupkg") } - // Resolve version lazily (can't resolve at configuration time). - def icu4jVersionProvider = project.provider { getVersion('com.ibm.icu', 'icu4j') } // lazy gstring with ICU version. - def icu4jVersion = "${-> icu4jVersionProvider.get()}" + def icu4jVersion = deps.icu4j.get().version def icuCompileTask = Os.isFamily(Os.FAMILY_WINDOWS) ? 
"compileIcuWindows" : "compileIcuLinux" diff --git a/gradle/generation/kuromoji.gradle b/gradle/generation/kuromoji.gradle index cfe2cd559ce3..a5aa5fbd6046 100644 --- a/gradle/generation/kuromoji.gradle +++ b/gradle/generation/kuromoji.gradle @@ -33,7 +33,7 @@ configure(project(":lucene:analysis:kuromoji")) { apply plugin: deps.plugins.undercouch.download.get().pluginId plugins.withType(JavaPlugin) { - ext { + project.ext { targetDir = file("src/resources") } diff --git a/gradle/generation/nori.gradle b/gradle/generation/nori.gradle index db05babdf038..3a558325964b 100644 --- a/gradle/generation/nori.gradle +++ b/gradle/generation/nori.gradle @@ -33,7 +33,7 @@ configure(project(":lucene:analysis:nori")) { apply plugin: deps.plugins.undercouch.download.get().pluginId plugins.withType(JavaPlugin) { - ext { + project.ext { targetDir = file("src/resources") } diff --git a/gradle/hacks/gradle-archives.gradle b/gradle/hacks/gradle-archives.gradle index cc8561c47a0a..363b67651409 100644 --- a/gradle/hacks/gradle-archives.gradle +++ b/gradle/hacks/gradle-archives.gradle @@ -19,6 +19,7 @@ allprojects { tasks.withType(AbstractArchiveTask).configureEach { task -> duplicatesStrategy = DuplicatesStrategy.FAIL + preserveFileTimestamps = false reproducibleFileOrder = true dirPermissions { it.unix(0755) diff --git a/gradle/ide/eclipse.gradle b/gradle/ide/eclipse.gradle index 8e5c44cff9d8..7e115e4baafd 100644 --- a/gradle/ide/eclipse.gradle +++ b/gradle/ide/eclipse.gradle @@ -22,10 +22,11 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry def resources = scriptResources(buildscript) configure(rootProject) { - plugins.withType(JavaPlugin) { - apply plugin: "eclipse" + if (gradle.startParameter.taskNames.contains("eclipse")) { + project.pluginManager.apply("java-base") + project.pluginManager.apply("eclipse") - def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion) + def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", deps.versions.minJava.get()) def relativize = { other -> rootProject.rootDir.relativePath(other).toString() } eclipse { @@ -105,9 +106,9 @@ configure(rootProject) { } } - eclipseJdt { + eclipseJdt { enabled = false - dependsOn 'luceneEclipse' + dependsOn 'luceneEclipseJdt' } eclipseClasspath { diff --git a/gradle/testing/beasting.gradle b/gradle/testing/beasting.gradle index 8934100ec10b..67c20140ba8b 100644 --- a/gradle/testing/beasting.gradle +++ b/gradle/testing/beasting.gradle @@ -27,7 +27,7 @@ def beastingMode = gradle.startParameter.taskNames.any{ name -> name == 'beast' allprojects { plugins.withType(JavaPlugin) { - ext { + project.ext { testOptions += [ [propName: 'tests.dups', value: 0, description: "Reiterate runs of entire test suites ('beast' task)."] ] diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index 1f3a7d8b1a07..14e64647d667 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -128,8 +128,14 @@ allprojects { jvmArgs '--add-modules', 'jdk.management' // Enable the vector incubator module on supported Java versions: - if (rootProject.vectorIncubatorJavaVersions.contains(rootProject.runtimeJavaVersion)) { + def prop = propertyOrDefault("org.apache.lucene.vectorization.upperJavaFeatureVersion", "1") as String + def v = JavaVersion.toVersion(Integer.parseInt(prop)).majorVersion + if (rootProject.vectorIncubatorJavaVersions.contains(rootProject.runtimeJavaVersion) || + rootProject.runtimeJavaVersion.majorVersion <= v) { jvmArgs 
'--add-modules', 'jdk.incubator.vector' + if (rootProject.runtimeJavaVersion.majorVersion <= v) { + systemProperty 'org.apache.lucene.vectorization.upperJavaFeatureVersion', v + } } jvmArgs '--enable-native-access=' + (project.path in [ diff --git a/gradle/testing/profiling.gradle b/gradle/testing/profiling.gradle index 6c71b3f827a4..88284fb54547 100644 --- a/gradle/testing/profiling.gradle +++ b/gradle/testing/profiling.gradle @@ -19,7 +19,7 @@ def recordings = files() allprojects { plugins.withType(JavaPlugin) { - ext { + project.ext { testOptions += [ [propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."] ] diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index 9ca8625b0eec..670f8ef2689e 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -62,7 +62,7 @@ allprojects { // Configure test property defaults and their descriptions. allprojects { plugins.withType(JavaPlugin) { - ext { + project.ext { String randomVectorSize = RandomPicks.randomFrom(new Random(projectSeedLong), ["default", "128", "256", "512"]) testOptions += [ // seed, repetition and amplification. @@ -135,14 +135,14 @@ allprojects { } afterEvaluate { - ext.testOptionsResolved = testOptions.findAll { opt -> + project.ext.testOptionsResolved = testOptions.findAll { opt -> propertyOrDefault(opt.propName, opt.value) != null }.collectEntries { opt -> [(opt.propName): Objects.toString(resolvedTestOption(opt.propName))] } // Compute the "reproduce with" string. - ext.testOptionsForReproduceLine = testOptions.findAll { opt -> + project.ext.testOptionsForReproduceLine = testOptions.findAll { opt -> if (opt["includeInReproLine"] == false) { return false } diff --git a/gradle/testing/slowest-tests-at-end.gradle b/gradle/testing/slowest-tests-at-end.gradle index eaf9cd1a2f12..d24e523394dc 100644 --- a/gradle/testing/slowest-tests-at-end.gradle +++ b/gradle/testing/slowest-tests-at-end.gradle @@ -22,7 +22,7 @@ def allSuites = [] allprojects { plugins.withType(JavaPlugin) { - ext { + project.ext { testOptions += [ [propName: 'tests.slowestTests', value: true, description: "Print the summary of the slowest tests."], [propName: 'tests.slowestSuites', value: true, description: "Print the summary of the slowest suites."] diff --git a/gradle/validation/dependencies.gradle b/gradle/validation/dependencies.gradle index 43dcf7583b80..5d9cc2e8d4a8 100644 --- a/gradle/validation/dependencies.gradle +++ b/gradle/validation/dependencies.gradle @@ -75,6 +75,18 @@ configure(rootProject) { it.dependsOn(":versionCatalogFormatDeps") } + // correct crlf/ default encoding after version catalog formatting finishes. + tasks.matching { + it.path in [ + ":versionCatalogFormatDeps" + ] + }.configureEach { + it.doLast { + ant.fixcrlf(file: it.catalogFile.get().asFile, + eol: "lf", fixlast: "true", encoding: "UTF-8") + } + } + tasks.matching { it.path in [ ":versionCatalogUpdateDeps" diff --git a/gradle/validation/forbidden-apis/defaults.all.txt b/gradle/validation/forbidden-apis/defaults.all.txt index 8fa8de5a4690..0d59f24d6c23 100644 --- a/gradle/validation/forbidden-apis/defaults.all.txt +++ b/gradle/validation/forbidden-apis/defaults.all.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory, Solr's SolrNamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with +@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with java.util.concurrent.Executors#newFixedThreadPool(int) java.util.concurrent.Executors#newSingleThreadExecutor() java.util.concurrent.Executors#newCachedThreadPool() diff --git a/gradle/validation/git-status.gradle b/gradle/validation/git-status.gradle index 31b806416834..fa976589836c 100644 --- a/gradle/validation/git-status.gradle +++ b/gradle/validation/git-status.gradle @@ -74,21 +74,6 @@ configure(rootProject) { logger.warn("WARNING: Directory is not a valid git checkout (won't check dirty files): ${rootProject.projectDir}") } } else { - // git ignores any folders which are empty (this includes folders with recursively empty sub-folders). - def untrackedNonEmptyFolders = status.untrackedFolders.findAll { path -> - File location = file("${rootProject.projectDir}/${path}") - boolean hasFiles = false - Files.walkFileTree(location.toPath(), new SimpleFileVisitor() { - @Override - FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - hasFiles = true - // Terminate early. - return FileVisitResult.TERMINATE - } - }) - return hasFiles - } - def offenders = [ // Exclude staged changes. These are fine in precommit. // "(added)": status.added, @@ -97,8 +82,7 @@ configure(rootProject) { "(conflicting)": status.conflicting, "(missing)": status.missing, "(modified)": status.modified, - "(untracked)": status.untracked, - "(untracked non-empty dir)": untrackedNonEmptyFolders + "(untracked)": status.untracked ].collectMany { fileStatus, files -> files.collect {file -> " - ${file} ${fileStatus}" } }.sort() diff --git a/gradle/validation/jar-checks.gradle b/gradle/validation/jar-checks.gradle index 5fe1bcbb3a67..aacb18c3eff4 100644 --- a/gradle/validation/jar-checks.gradle +++ b/gradle/validation/jar-checks.gradle @@ -20,6 +20,10 @@ // 2) notice file // 3) checksum validation/ generation. +// WARNING: The tasks in this file share internal state between tasks without using files. +// Because of this all tasks here must always execute together, so they cannot define task outputs. +// TODO: Rewrite the internal state to use state files containing the ext.jarInfos and its referencedFiles + // This should be false only for debugging. 
def failOnError = true @@ -194,13 +198,6 @@ subprojects { description = "Validate license and notice files of dependencies" dependsOn collectJarInfos - def outputFileName = 'validateJarLicenses' - inputs.dir(file(project.rootDir.path + '/lucene/licenses')) - .withPropertyName('licenses') - .withPathSensitivity(PathSensitivity.RELATIVE) - outputs.file(layout.buildDirectory.file(outputFileName)) - .withPropertyName('validateJarLicensesResult') - doLast { def errors = [] jarInfos.each { dep -> @@ -246,9 +243,7 @@ subprojects { } } } - // Required to take advantage of incremental building and the build cache - def f = new File(project.buildDir.path + "/" + outputFileName) - f.write(errors.toString(), "UTF-8") + if (errors) { def msg = "Certain license/ notice files are missing:\n - " + errors.join("\n - ") if (failOnError) { diff --git a/gradle/wrapper/gradle-wrapper.jar.sha256 b/gradle/wrapper/gradle-wrapper.jar.sha256 index 2a6b8668ac7d..6d5fdcd3f30c 100644 --- a/gradle/wrapper/gradle-wrapper.jar.sha256 +++ b/gradle/wrapper/gradle-wrapper.jar.sha256 @@ -1 +1 @@ -cb0da6751c2b753a16ac168bb354870ebb1e162e9083f116729cec9c781156b8 +2db75c40782f5e8ba1fc278a5574bab070adccb2d21ca5a6e5ed840888448046 \ No newline at end of file diff --git a/gradle/wrapper/gradle-wrapper.jar.version b/gradle/wrapper/gradle-wrapper.jar.version index 3b6825376add..7f6758ef97bc 100644 --- a/gradle/wrapper/gradle-wrapper.jar.version +++ b/gradle/wrapper/gradle-wrapper.jar.version @@ -1 +1 @@ -8.8.0 +8.10.0 diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index a4413138c96c..9355b4155759 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip networkTimeout=10000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 265c339145e1..8b7afae0ec6b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -3,6 +3,127 @@ Lucene Change Log For more information on past and future Lucene versions, please see: http://s.apache.org/luceneversions +======================= Lucene 11.0.0 ======================= + +API Changes +--------------------- +* GITHUB#11023: Removing deprecated parameters from CheckIndex. (Jakub Slowinski) + +New Features +--------------------- +(No changes) + +Improvements +--------------------- +(No changes) + +Optimizations +--------------------- +(No changes) + +Bug Fixes +--------------------- +(No changes) + +Other +--------------------- +(No changes) + +======================= Lucene 10.1.0 ======================= + +API Changes +--------------------- + +* GITHUB#13859: Allow open-ended ranges in Intervals range queries. (Mayya Sharipova) + +* GITHUB#13950: Make BooleanQuery#getClauses public and add #add(Collection) to BQ builder. (Shubham Chaudhary) + +* GITHUB#13957: Removed LeafSimScorer class, to save its overhead. Scorers now + compute scores directly from a SimScorer, postings and norms. (Adrien Grand) + +New Features +--------------------- +(No changes) + +Improvements +--------------------- + +* GITHUB#13986: Allow easier configuration of the Panama Vectorization provider with + newer Java versions. 
Set the `org.apache.lucene.vectorization.upperJavaFeatureVersion` + system property to increase the set of Java versions that Panama Vectorization will + provide optimized implementations for. (Chris Hegarty) + +Optimizations +--------------------- + +* GITHUB#13828: Reduce long[] array allocation for bitset in readBitSetIterator. (Zhang Chao) + +* GITHUB#13800: MaxScoreBulkScorer now recomputes scorer partitions when the + minimum competitive allows for a more favorable partitioning. (Adrien Grand) + +* GITHUB#13930: Use growNoCopy when copying bytes in BytesRefBuilder. (Ignacio Vera) + +* GITHUB#13931: Refactored `BooleanScorer` to evaluate matches of sub clauses + using the `Scorer` abstraction rather than the `BulkScorer` abstraction. This + speeds up exhaustive evaluation of disjunctions of term queries. + (Adrien Grand) + +* GITHUB#13941: Optimized computation of top-hits on disjunctive queries with + many clauses. (Adrien Grand) + +* GITHUB#13954: Disabled exchanging scores across slices for exhaustive + top-hits evaluation. (Adrien Grand) + +* GITHUB#13899: Check ahead if we can get the count. (Lu Xugang) + +* GITHUB#13943: Removed shared `HitsThresholdChecker`, which reduces overhead + but may delay a bit when dynamic pruning kicks in. (Adrien Grand) + +* GITHUB#13961: Replace Map with IntObjectHashMap for DV producer. (Pan Guixin) + +* GITHUB#13963: Speed up nextDoc() implementations in Lucene912PostingsReader. + (Adrien Grand) + +* GITHUB#13958: Speed up advancing within a block. (Adrien Grand) + +* GITHUB#13763: Replace Map with IntObjectHashMap for KnnVectorsReader (Pan Guixin) + +* GITHUB#13968: Switch postings from storing doc IDs in a long[] to an int[]. + Lucene 8.4 had moved to a long[] to help speed up block decoding by using + longs that would pack two integers. We are now moving back to integers to be + able to take advantage of 2x more lanes with the vector API. (Adrien Grand) + +* GITHUB#13994: Speed up top-k retrieval of filtered conjunctions. + (Adrien Grand) + +Bug Fixes +--------------------- +* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended + when they were not sorted by startOffset. (Seunghan Jung) +* GITHUB#13884: Remove broken .toArray from Long/CharObjectHashMap entirely. (Pan Guixin) +* GITHUB#12686: Added support for highlighting IndexOrDocValuesQuery. (Prudhvi Godithi) +* GITHUB#13927: Fix StoredFieldsConsumer finish. (linfn) +* GITHUB#13944: Ensure deterministic order of clauses for `DisjunctionMaxQuery#toString`. (Laurent Jakubina) +* GITHUB#13841: Improve Tessellator logic when two holes share the same vertex with the polygon which was failing + in valid polygons. (Ignacio Vera) +* GITHUB#13990: Added filter to the toString() method of Knn[Float|Byte]VectorQuery + and DiversifyingChildren[Float|Byte]KnnVectorQuery. (Viswanath Kuchibhotla) + +Build +--------------------- + +* Upgrade forbiddenapis to version 3.8. (Uwe Schindler) + +Other +--------------------- +* GITHUB#13982: Remove duplicate test code. (Lu Xugang) + +======================== Lucene 10.0.1 ======================= + +Bug Fixes +--------------------- + + ======================= Lucene 10.0.0 ======================= API Changes @@ -48,9 +169,9 @@ API Changes * GITHUB#12296: Make IndexReader and IndexReaderContext classes explicitly sealed. They have already been runtime-checked to only be implemented by the specific classes - so this is effectively a non-breaking change.
(Petr Portnov) -* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton +* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton. (Michael McCandless) * GITHUB#12321: Reduced visibility of StringsToAutomaton. Please use Automata#makeStringUnion instead. (Greg Miller) @@ -110,6 +231,27 @@ API Changes * GITHUB#13499: Remove deprecated TopScoreDocCollector + TopFieldCollector methods (#create, #createSharedManager) (Jakub Slowinski) +* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz) + +* GITHUB#13708: Move Operations.sameLanguage/subsetOf to test-framework. (Robert Muir) + +* GITHUB#13733: Move FacetsCollector#search utility methods to `FacetsCollectorManager`, replace the `Collector` + argument with a `FacetsCollectorManager` and update the return type to include both `TopDocs` results as well as + facets results. (Luca Cavanna) + +* GITHUB#13328: Convert many basic Lucene classes to record classes, including CollectionStatistics, TermStatistics and LeafMetadata. (Shubham Chaudhary) + +* GITHUB#13780: Remove IndexSearcher#search(List, Weight, Collector) in favour of the newly + introduced IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector). (Luca Cavanna) + +* GITHUB#13779: First-class random access API for KnnVectorValues + unifies Byte/FloatVectorValues incorporating RandomAccess* API and introduces + DocIndexIterator for iterative access in place of direct inheritance from DISI. (Michael Sokolov) + +* GITHUB#13845: Add missing with-discountOverlaps Similarity constructor variants. (Pierre Salagnac, Christine Poerschke, Robert Muir) + +* GITHUB#13820, GITHUB#13825, GITHUB#13830: Corrects DataInput.readGroupVInts to be public and not-final, removes the protected + DataInput.readGroupVInt method. (Zhang Chao, Robert Muir, Uwe Schindler, Dawid Weiss) New Features --------------------- @@ -139,6 +281,19 @@ New Features * GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey) +* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery + and SortedSetDocValuesRangeQuery. (Ignacio Vera) + +* GITHUB#13542: Add initial support for intra-segment concurrency. IndexSearcher now supports searching across leaf + reader partitions concurrently. This is useful to max out available resource usage especially with force merged + indices or big segments. There is still a performance penalty for queries that require segment-level computation + ahead of time, such as points/range queries. This is an implementation limitation that we expect to improve in + future releases, and that's why intra-segment slicing is not enabled by default, but leveraged in tests when the + searcher is created via LuceneTestCase#newSearcher. Users may override IndexSearcher#slices(List) to optionally + create slices that target segment partitions. (Luca Cavanna) + +* GITHUB#13741: Implement Accountable for NFARunAutomaton, fix hashCode implementation of CompiledAutomaton. (Patrick Zhai) + Improvements --------------------- @@ -162,6 +317,8 @@ Improvements * GITHUB#12172: Update Romanian stopwords list to include the modern unicode forms. (Trey Jones) +* GITHUB#13707: Improve Operations.isTotal() to work with non-minimal automata. (Dawid Weiss, Robert Muir) + Optimizations --------------------- @@ -174,17 +331,21 @@ Optimizations * GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap.
(Tony X) +* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller) + Bug Fixes --------------------- * LUCENE-10599: LogMergePolicy is more likely to keep merging segments until they reach the maximum merge size. (Adrien Grand) -* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end +* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end. (Peter Gromov) * GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those of DoubleValues#doubleValue(). (Uwe Schindler) +* GITHUB#13498: Avoid performance regression by constructing lazily the PointTree in NumericComparator, (Ignacio Vera) + Changes in Runtime Behavior --------------------- @@ -211,6 +372,9 @@ Changes in Backwards Compatibility Policy * GITHUB#13230: Remove the Kp and Lovins snowball algorithms which are not supported or intended for general use. (Robert Muir) +* GITHUB#13602: SearchWithCollectorTask no longer supports the `collector.class` config parameter to load a custom + collector implementation. `collector.manager.class` allows users to load a collector manager instead. (Luca Cavanna) + Other --------------------- @@ -251,11 +415,26 @@ Other * GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski) +Build +--------------------- + +* GITHUB#13649: Fix eclipse ide settings generation #13649 (Uwe Schindler, Dawid Weiss) + +* GITHUB#13698: Upgrade to gradle 8.10 (Dawid Weiss) + ======================== Lucene 9.12.0 ======================= +Security Fixes +--------------------- + +* Deserialization of Untrusted Data vulnerability in Apache Lucene Replicator - CVE-2024-45772 + (Summ3r from Vidar-Team, Robert Muir, Paul Irwin) + API Changes --------------------- +* GITHUB#13806: Add TermInSetQuery#getBytesRefIterator to be able to iterate over query terms. (Christoph Büscher) + * GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov) * GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov) @@ -266,6 +445,15 @@ API Changes * GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin) +* GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and + MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar) + +* GITHUB#13568, GITHUB#13750: Add DrillSideways#search method that supports any CollectorManagers for drill-sideways dimensions + or drill-down. (Egor Potemkin) + +* GITHUB#13757: For similarities, provide default computeNorm implementation and remove remaining discountOverlaps setters. + (Christine Poerschke, Adrien Grand, Robert Muir) + New Features --------------------- @@ -278,8 +466,21 @@ New Features and LogByteSizeMergePolicy via a new #setTargetConcurrency setter. (Adrien Grand) +* GITHUB#13568: Add sandbox facets module to compute facets while collecting. (Egor Potemkin, Shradha Shankar) + +* GITHUB#13678: Add support JDK 23 to the Panama Vectorization Provider. (Chris Hegarty) + +* GITHUB#13689: Add a new faceting feature, dynamic range facets, which automatically picks a balanced set of numeric + ranges based on the distribution of values that occur across all hits. 
For use cases that have a highly variable + numeric doc values field, such as "price" in an e-commerce application, this facet method is powerful as it allows the + presented ranges to adapt depending on what hits the query actually matches. This is in contrast to existing range + faceting that requires the application to provide the specific fixed ranges up front. (Yuting Gan, Greg Miller, + Stefan Vodita) + Improvements --------------------- +* GITHUB#13475: Re-enable intra-merge parallelism except for terms, norms, and doc values. + Related to GITHUB#13478. (Ben Trent) * GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai) @@ -290,6 +491,17 @@ Improvements * GITHUB#13285: Early terminate graph searches of AbstractVectorSimilarityQuery to follow timeout set from IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh) + +* GITHUB#13633: Add ability to read/write knn vector values to a MemoryIndex. (Ben Trent) + +* GITHUB#12627: patch HNSW graphs to improve reachability of all nodes from entry points + +* GITHUB#13201: Better cost estimation on MultiTermQuery over few terms. (Michael Froh) + +* GITHUB#13735: Migrate monitor package usage of deprecated IndexSearcher#search(Query, Collector) + to IndexSearcher#search(Query, CollectorManager). (Greg Miller) + +* GITHUB#13746: Introduce ProfilerCollectorManager to parallelize search when using ProfilerCollector. (Luca Cavanna) Optimizations --------------------- @@ -322,8 +534,14 @@ Optimizations Closing many individual index files can potentially lead to a degradation in execution performance. Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation - by 1) using a confined Arena where appropriate, and 2) grouping files from the same segment to a - single shared Arena. (Chris Hegarty, Michael Gibney, Uwe Schindler) + when running with JDK 21 and greater, by 1) using a confined Arena where appropriate, and 2) grouping + files from the same segment to a single shared Arena. + A system property has been added that allows to control the total maximum number of mmapped files + that may be associated with a single shared Arena. For example, to set the max number of permits to + 256, pass the following on the command line + -Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256. Setting a value of 1 associates + a single file to a single shared arena. + (Chris Hegarty, Michael Gibney, Uwe Schindler) * GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now only has 2 levels of skip data, which are inlined into postings instead of @@ -333,6 +551,20 @@ Optimizations * GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov) +* GITHUB#13636, GITHUB#13658: Optimizations to the decoding logic of blocks of + postings. (Adrien Grand, Uwe Schindler, Greg Miller) + +* GITHUB##13644: Improve NumericComparator competitive iterator logic by comparing the missing value with the top + value even after the hit queue is full (Pan Guixin) + +* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini) + +* GITHUB#13742: Reorder checks in LRUQueryCache#count (Shubham Chaudhary) + +* GITHUB#13697: Add a bulk scorer to ToParentBlockJoinQuery, which delegates to the bulk scorer of the child query. 
+ This should speed up query evaluation when the child query has a specialized bulk scorer, such as disjunctive queries. + (Mike Pellegrini) + Changes in runtime behavior --------------------- @@ -356,9 +588,44 @@ Bug Fixes * GITHUB#13615: Correct scalar quantization when used in conjunction with COSINE similarity. Vectors are normalized before quantization to ensure the cosine similarity is correctly calculated. (Ben Trent) +* GITHUB#13627: Fix race condition on flush for DWPT seqNo generation. (Ben Trent, Ao Li) + +* GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi) + +* GITHUB#13703: Fix bug in LatLonPoint queries where narrow polygons close to latitude 90 don't + match any points due to an Integer overflow. (Ignacio Vera) + +* GITHUB#13641: Unify how KnnFormats handle missing fields and correctly handle missing vector fields when + merging segments. (Ben Trent) + +* GITHUB#13519: 8 bit scalar vector quantization is no longer + supported: it was buggy starting in 9.11 (GITHUB#13197). 4 and 7 + bit quantization are still supported. Existing (9.x) Lucene indices + that previously used 8 bit quantization can still be read/searched + but the results from `KNN*VectorQuery` are silently buggy. Further + 8 bit quantized vector indexing into such (9.11) indices is not + permitted, so your path forward if you wish to continue using the + same 9.11 index is to index additional vectors into the same field + with either 4 or 7 bit quantization (or no quantization), and ensure + all older (9.11 written) segments are rewritten either via + `IndexWriter.forceMerge` or + `IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely. + +* GITHUB#13799: Disable intra-merge parallelism for all structures but kNN vectors. (Ben Trent) + +Build +--------------------- + +* GITHUB#13695, GITHUB#13696: Fix Gradle build sometimes gives spurious "unreferenced license file" warnings. + (Uwe Schindler) + Other -------------------- -(No changes) + +* GITHUB#13720: Add float comparison based on unit of least precision and use it to stop test failures caused by float + summation not being associative in IEEE 754. (Alex Herbert, Stefan Vodita) + +* Remove code triggering forbidden-apis regarding Java serialization. (Uwe Schindler, Robert Muir) ======================== Lucene 9.11.1 ======================= diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 64de353b1538..1db50b7fdd89 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -19,6 +19,19 @@ ## Migration from Lucene 9.x to Lucene 10.0 +### DataInput#readVLong() may now read negative vlongs + +LUCENE-10376 started allowing `DataInput#readVLong()` to read negative vlongs. +In particular, this feature is used by the `DataInput#readZLong()` method. A +practical implication is that `DataInput#readVLong()` may now read up to 10 +bytes, while it would never read more than 9 bytes in Lucene 9.x. + +### Changes to DataInput.readGroupVInt and readGroupVInts methods + +As part of GITHUB#13820, GITHUB#13825, GITHUB#13830, this issue corrects DataInput.readGroupVInts +to be public and not-final, allowing subclasses to override it. This change also removes the protected +DataInput.readGroupVInt method: subclasses should delegate or reimplement it entirely. + ### OpenNLP dependency upgrade [Apache OpenNLP](https://opennlp.apache.org) 2.x opens the door to accessing various models via the ONNX runtime. To migrate you will need to update any deprecated OpenNLP methods that you may be using. 
@@ -80,9 +93,22 @@ behaviour as 9.x, clone `PersianAnalyzer` in 9.x or create custom analyzer by us ### AutomatonQuery/CompiledAutomaton/RunAutomaton/RegExp no longer determinize (LUCENE-10010) These classes no longer take a `determinizeWorkLimit` and no longer determinize -behind the scenes. It is the responsibility of the caller to to call +behind the scenes. It is the responsibility of the caller to call `Operations.determinize()` for DFA execution. +### RegExp optional complement syntax has been deprecated + +Support for the optional complement syntax (`~`) has been deprecated. +The `COMPLEMENT` syntax flag has been removed and replaced by the +`DEPRECATED_COMPLEMENT` flag. Users wanting to enable the deprecated +complement support can do so by explicitly passing syntax flags that +include `DEPRECATED_COMPLEMENT` when creating a `RegExp`. For example: +`new RegExp("~(foo)", RegExp.DEPRECATED_COMPLEMENT)`. + +Alternatively, and quite commonly, a simpler _complement bracket expression_, +`[^...]`, may be a suitable replacement. For example, `[^fo]` matches any +character that is not an `f` or `o`. + ### DocValuesFieldExistsQuery, NormsFieldExistsQuery and KnnVectorFieldExistsQuery removed in favor of FieldExistsQuery (LUCENE-10436) These classes have been removed and consolidated into `FieldExistsQuery`. To migrate, caller simply replace those classes @@ -180,6 +206,9 @@ access the members using method calls instead of field accesses. Affected classe - `IOContext`, `MergeInfo`, and `FlushInfo` (GITHUB#13205) - `BooleanClause` (GITHUB#13261) +- `TotalHits` (GITHUB#13762) +- `TermAndVector` (GITHUB#13772) +- Many basic Lucene classes, including `CollectionStatistics`, `TermStatistics` and `LeafMetadata` (GITHUB#13328) ### Boolean flags on IOContext replaced with a new ReadAdvice enum. @@ -248,6 +277,11 @@ ConcurrentMergeScheduler now disables auto I/O throttling by default. There is s happening at the CPU level, since ConcurrentMergeScheduler has a maximum number of threads it can use, which is only a fraction of the total number of threads of the host by default. +### FieldInfos#hasVectors and FieldInfo#hasVectors renamed to hasTermVectors + +To reduce confusion between term vectors and numeric vectors, `hasVectors` has been renamed to +`hasTermVectors`. + ## Migration from Lucene 9.0 to Lucene 9.1 ### Test framework package migration and module (LUCENE-10301) @@ -793,3 +827,81 @@ Specifically, the method `FunctionValues#getScorer(Weight weight, LeafReaderCont Callers must now keep track of the Weight instance that created the Scorer if they need it, instead of relying on Scorer. +### `FacetsCollector#search` utility methods moved and updated + +The static `search` methods exposed by `FacetsCollector` have been moved to `FacetsCollectorManager`. +Furthermore, they take a `FacetsCollectorManager` last argument in place of a `Collector` so that they support +intra query concurrency. The return type has also been updated to `FacetsCollectorManager.FacetsResult` which includes +both `TopDocs` and facets results included in a reduced `FacetsCollector` instance. + +### `SearchWithCollectorTask` no longer supports the `collector.class` config parameter + +`collector.class` used to allow users to load a custom collector implementation. `collector.manager.class` +replaces it by allowing users to load a custom collector manager instead.
+ +### BulkScorer#score(LeafCollector collector, Bits acceptDocs) removed + +Use `BulkScorer#score(LeafCollector collector, Bits acceptDocs, int min, int max)` instead. In order to score the +entire leaf, provide `0` as min and `DocIdSetIterator.NO_MORE_DOCS` as max. `BulkScorer` subclasses that override +such method need to instead override the method variant that takes the range of doc ids as well as arguments. + +### CollectorManager#newCollector and Collector#getLeafCollector contract + +With the introduction of intra-segment query concurrency support, multiple `LeafCollector`s may be requested for the +same `LeafReaderContext` via `Collector#getLeafCollector(LeafReaderContext)` across the different `Collector` instances +returned by multiple `CollectorManager#newCollector` calls. Any logic or computation that needs to happen +once per segment requires specific handling in the collector manager implementation. See `TotalHitCountCollectorManager` +as an example. Individual collectors don't need to be adapted as a specific `Collector` instance will still see a given +`LeafReaderContext` once, given that it is not possible to add more than one partition of the same segment to the same +leaf slice. + +### Weight#scorer, Weight#bulkScorer and Weight#scorerSupplier contract + +With the introduction of intra-segment query concurrency support, multiple `Scorer`s, `ScorerSupplier`s or `BulkScorer`s +may be requested for the same `LeafReaderContext` instance as part of a single search call. That may happen concurrently +from separate threads each searching a specific doc id range of the segment. `Weight` implementations that rely on the +assumption that a scorer, bulk scorer or scorer supplier for a given `LeafReaderContext` is requested once per search +need updating. + +### Signature of IndexSearcher#searchLeaf changed + +With the introduction of intra-segment query concurrency support, the `IndexSearcher#searchLeaf(LeafReaderContext ctx, Weight weight, Collector collector)` +method now accepts two additional int arguments to identify the min/max range of doc ids that will be searched in this +leaf partition`: IndexSearcher#searchLeaf(LeafReaderContext ctx, int minDocId, int maxDocId, Weight weight, Collector collector)`. +Subclasses of `IndexSearcher` that call or override the `searchLeaf` method need to be updated accordingly. + +### Signature of static IndexSearch#slices method changed + +The static `IndexSearcher#slices(List leaves, int maxDocsPerSlice, int maxSegmentsPerSlice)` +method now supports an additional 4th and last argument to optionally enable creating segment partitions: +`IndexSearcher#slices(List leaves, int maxDocsPerSlice, int maxSegmentsPerSlice, boolean allowSegmentPartitions)` + +### TotalHitCountCollectorManager constructor + +`TotalHitCountCollectorManager` now requires that an array of `LeafSlice`s, retrieved via `IndexSearcher#getSlices`, +is provided to its constructor. Depending on whether segment partitions are present among slices, the manager can +optimize the type of collectors it creates and exposes via `newCollector`. + +### `IndexSearcher#search(List, Weight, Collector)` removed + +The protected `IndexSearcher#search(List leaves, Weight weight, Collector collector)` method has been +removed in favour of the newly introduced `search(LeafReaderContextPartition[] partitions, Weight weight, Collector collector)`. +`IndexSearcher` subclasses that override this method need to instead override the new method. 
+ +### Indexing vectors with 8 bit scalar quantization is no longer supported but 7 and 4 bit quantization still work (GITHUB#13519) + +8 bit scalar vector quantization is no longer supported: it was buggy +starting in 9.11 (GITHUB#13197). 4 and 7 bit quantization are still +supported. Existing (9.11) Lucene indices that previously used 8 bit +quantization can still be read/searched but the results from +`KNN*VectorQuery` are silently buggy. Further 8 bit quantized vector +indexing into such (9.11) indices is not permitted, so your path +forward if you wish to continue using the same 9.11 index is to index +additional vectors into the same field with either 4 or 7 bit +quantization (or no quantization), and ensure all older (9.x written) +segments are rewritten either via `IndexWriter.forceMerge` or +`IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely. + +### Vector values APIs switched to primarily random-access + +`{Byte/Float}VectorValues` no longer inherit from `DocIdSetIterator`. Rather they extend a common class, `KnnVectorValues`, that provides a random access API (previously provided by `RandomAccessVectorValues`, now removed), and an `iterator()` method for retrieving `DocIndexIterator`: an iterator which is a DISI that also provides an `index()` method. Therefore, any iteration over vector values must now be performed using the values' `iterator()`. Random access works as before, but does not require casting to `RandomAccessVectorValues`. diff --git a/lucene/analysis/common/src/generated/checksums/generateClassicTokenizer.json b/lucene/analysis/common/src/generated/checksums/generateClassicTokenizer.json index 1e94a248e009..a66c64ffde5d 100644 --- a/lucene/analysis/common/src/generated/checksums/generateClassicTokenizer.json +++ b/lucene/analysis/common/src/generated/checksums/generateClassicTokenizer.json @@ -1,5 +1,5 @@ { "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390", - "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "1f7a446f3483326385eef257cea8366c27da0850", + "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "e62dcd8c25219d8f5d783823b228ffe38d2bacde", "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex": "f52109bb7d5701979fde90aeeeda726246a8d5fd" } \ No newline at end of file diff --git a/lucene/analysis/common/src/generated/checksums/generateWikipediaTokenizer.json b/lucene/analysis/common/src/generated/checksums/generateWikipediaTokenizer.json index 0103898b52cf..d37efd61e1cb 100644 --- a/lucene/analysis/common/src/generated/checksums/generateWikipediaTokenizer.json +++ b/lucene/analysis/common/src/generated/checksums/generateWikipediaTokenizer.json @@ -1,5 +1,5 @@ { "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390", - "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "ac298e08bc5b96202efca0c01f9f0376fda976bd", + "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "2b5df5ff35543a6380c82f298225eb5fa06e4453", "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex": "0b8c7774b98e8237702013e82c352d4711509bd0" } \ No newline at end of file diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java index de7c56c5433d..949d1fbf0307 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java @@ -37,23 +37,23 @@ int normalize(char[] s, int len) { for (int i = 0; i < len; i++) { switch (s[i]) { - // delete Chandrabindu + // delete Chandrabindu case '\u0981': len = delete(s, i, len); i--; break; - // DirghoI kar -> RosshoI kar + // DirghoI kar -> RosshoI kar case '\u09C0': s[i] = '\u09BF'; break; - // DirghoU kar -> RosshoU kar + // DirghoU kar -> RosshoU kar case '\u09C2': s[i] = '\u09C1'; break; - // Khio (Ka + Hoshonto + Murdorno Sh) + // Khio (Ka + Hoshonto + Murdorno Sh) case '\u0995': if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') { if (i == 0) { @@ -67,12 +67,12 @@ int normalize(char[] s, int len) { } break; - // Nga to Anusvara + // Nga to Anusvara case '\u0999': s[i] = '\u0982'; break; - // Ja Phala + // Ja Phala case '\u09AF': if (i - 2 == 0 && s[i - 1] == '\u09CD') { s[i - 1] = '\u09C7'; @@ -89,7 +89,7 @@ int normalize(char[] s, int len) { } break; - // Ba Phalaa + // Ba Phalaa case '\u09AC': if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) { break; @@ -109,7 +109,7 @@ int normalize(char[] s, int len) { } break; - // Visarga + // Visarga case '\u0983': if (i == len - 1) { if (len <= 3) { @@ -122,18 +122,18 @@ int normalize(char[] s, int len) { } break; - // All sh + // All sh case '\u09B6': case '\u09B7': s[i] = '\u09B8'; break; - // check na + // check na case '\u09A3': s[i] = '\u09A8'; break; - // check ra + // check ra case '\u09DC': case '\u09DD': s[i] = '\u09B0'; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java index 12098b10697a..0d2f05189bf9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java @@ -747,70 +747,70 @@ public int getNextToken() throws java.io.IOException { /* Break so we don't hit fall-through warning: */ break; /* ignore */ } - // fall through + // fall through case 11: break; case 2: { return ALPHANUM; } - // fall through + // fall through case 12: break; case 3: { return CJ; } - // fall through + // fall through case 13: break; case 4: { return NUM; } - // fall through + // fall through case 14: break; case 5: { return HOST; } - // fall through + // fall through case 15: break; case 6: { return COMPANY; } - // fall through + // fall through case 16: break; case 7: { return APOSTROPHE; } - // fall through + // fall through case 17: break; case 8: { return ACRONYM_DEP; } - // fall through + // fall through case 18: break; case 9: { return ACRONYM; } - // fall through + // fall through case 19: break; case 10: { return EMAIL; } - // fall through + // fall through case 20: break; default: diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java index 3f712558e4be..9e774708183e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java @@ -53,18 +53,18 @@ public 
boolean incrementToken() throws IOException { private int lowerCase(int codepoint) { switch (codepoint) { - /* There are two lowercase forms of sigma: - * U+03C2: small final sigma (end of word) - * U+03C3: small sigma (otherwise) - * - * Standardize both to U+03C3 - */ + /* There are two lowercase forms of sigma: + * U+03C2: small final sigma (end of word) + * U+03C3: small sigma (otherwise) + * + * Standardize both to U+03C3 + */ case '\u03C2': /* small final sigma */ return '\u03C3'; /* small sigma */ - /* Some greek characters contain diacritics. - * This filter removes these, converting to the lowercase base form. - */ + /* Some greek characters contain diacritics. + * This filter removes these, converting to the lowercase base form. + */ case '\u0386': /* capital alpha with tonos */ case '\u03AC': /* small alpha with tonos */ @@ -100,9 +100,9 @@ private int lowerCase(int codepoint) { case '\u03CE': /* small omega with tonos */ return '\u03C9'; /* small omega */ - /* The previous implementation did the conversion below. - * Only implemented for backwards compatibility with old indexes. - */ + /* The previous implementation did the conversion below. + * Only implemented for backwards compatibility with old indexes. + */ case '\u03A2': /* reserved */ return '\u03C2'; /* small final sigma */ diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java index 3804c8a18bb9..79943ca5a896 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java @@ -456,7 +456,7 @@ private final void step5() { /* j >= 0 fixes Bug 2 */ if (ends("ou")) break; return; - /* takes care of -ous */ + /* takes care of -ous */ case 's': if (ends("ism")) break; return; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java index bc1c42c8af64..eb415beead66 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.java @@ -67,7 +67,7 @@ private boolean isUpperVowel(int v) { case 'I': case 'O': case 'U': - // vowels with acute accent (fada) + // vowels with acute accent (fada) case '\u00c1': case '\u00c9': case '\u00cd': diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java index 5a8376dcb7ca..f3d9c222715c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java @@ -47,18 +47,18 @@ int normalize(char[] s, int len) { for (int i = 0; i < len; i++) { switch (s[i]) { - // dead n -> bindu + // dead n -> bindu case '\u0928': if (i + 1 < len && s[i + 1] == '\u094D') { s[i] = '\u0902'; len = delete(s, i + 1, len); } break; - // candrabindu -> bindu + // candrabindu -> bindu case '\u0901': s[i] = '\u0902'; break; - // nukta deletions + // nukta deletions case '\u093C': len = delete(s, i, len); i--; @@ -96,18 +96,18 @@ int normalize(char[] s, int len) { case '\u095F': s[i] = '\u092F'; break; - // zwj/zwnj -> delete + // zwj/zwnj -> delete case '\u200D': case '\u200C': len = delete(s, 
i, len); i--; break; - // virama -> delete + // virama -> delete case '\u094D': len = delete(s, i, len); i--; break; - // chandra/short -> replace + // chandra/short -> replace case '\u0945': case '\u0946': s[i] = '\u0947'; @@ -127,7 +127,7 @@ int normalize(char[] s, int len) { case '\u0972': s[i] = '\u0905'; break; - // long -> short ind. vowels + // long -> short ind. vowels case '\u0906': s[i] = '\u0905'; break; @@ -149,7 +149,7 @@ int normalize(char[] s, int len) { case '\u0914': s[i] = '\u0913'; break; - // long -> short dep. vowels + // long -> short dep. vowels case '\u0940': s[i] = '\u093F'; break; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java index 9a779138a009..b5796bd1e579 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java @@ -52,7 +52,7 @@ public String toString() { boolean prohibitsCompounding(CharsRef word, int breakPos, Root rootBefore, Root rootAfter) { if (isNonAffixedPattern(endChars)) { - if (!charsMatch(word, breakPos - rootBefore.word.length(), rootBefore.word)) { + if (!charsMatch(word, breakPos - rootBefore.word().length(), rootBefore.word())) { return false; } } else if (!charsMatch(word, breakPos - endChars.length(), endChars)) { @@ -60,7 +60,7 @@ boolean prohibitsCompounding(CharsRef word, int breakPos, Root rootBefore, Ro } if (isNonAffixedPattern(beginChars)) { - if (!charsMatch(word, breakPos, rootAfter.word)) { + if (!charsMatch(word, breakPos, rootAfter.word())) { return false; } } else if (!charsMatch(word, breakPos, beginChars)) { @@ -84,7 +84,7 @@ private static boolean isNonAffixedPattern(String pattern) { private boolean hasAllFlags(Root root, char[] flags) { for (char flag : flags) { - if (!dictionary.hasFlag(root.entryId, flag)) { + if (!dictionary.hasFlag(root.entryId(), flag)) { return false; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index 49ebc7847668..4df7610bde8b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -24,7 +24,6 @@ import java.util.Comparator; import java.util.LinkedHashSet; import java.util.List; -import java.util.Objects; import java.util.PriorityQueue; import java.util.Set; import java.util.TreeSet; @@ -62,8 +61,7 @@ List suggest(String word, WordCase originalCase, Set prevSug private List>> findSimilarDictionaryEntries( String word, WordCase originalCase) { - Comparator>> natural = Comparator.naturalOrder(); - PriorityQueue>> roots = new PriorityQueue<>(natural.reversed()); + PriorityQueue>> roots = new PriorityQueue<>(Comparator.reverseOrder()); char[] excludeFlags = dictionary.allNonSuggestibleFlags(); FlagEnumerator.Lookup flagLookup = dictionary.flagLookup; @@ -111,7 +109,7 @@ private List>> findSimilarDictionaryEntries( private static boolean isWorseThan(int score, CharsRef candidate, Weighted> root) { return score < root.score - || score == root.score && CharSequence.compare(candidate, root.word.word) > 0; + || score == root.score && CharSequence.compare(candidate, root.word.word()) > 0; } private void 
processSuggestibleWords( @@ -162,11 +160,11 @@ private List expandRoot(Root root, String misspelled) { List crossProducts = new ArrayList<>(); Set result = new LinkedHashSet<>(); - if (!dictionary.hasFlag(root.entryId, dictionary.needaffix)) { - result.add(root.word); + if (!dictionary.hasFlag(root.entryId(), dictionary.needaffix)) { + result.add(root.word()); } - char[] wordChars = root.word.toCharArray(); + char[] wordChars = root.word().toCharArray(); // suffixes processAffixes( @@ -180,7 +178,7 @@ private List expandRoot(Root root, String misspelled) { } String suffix = misspelled.substring(misspelled.length() - suffixLength); - String withSuffix = root.word.substring(0, root.word.length() - stripLength) + suffix; + String withSuffix = root.word().substring(0, root.word().length() - stripLength) + suffix; result.add(withSuffix); if (dictionary.isCrossProduct(suffixId)) { crossProducts.add(withSuffix.toCharArray()); @@ -192,7 +190,7 @@ private List expandRoot(Root root, String misspelled) { true, misspelled, (prefixLength, prefixId) -> { - if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG)) + if (!dictionary.hasFlag(root.entryId(), dictionary.affixData(prefixId, AFFIX_FLAG)) || !dictionary.isCrossProduct(prefixId)) { return; } @@ -217,7 +215,7 @@ private List expandRoot(Root root, String misspelled) { if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, wordChars, stripLength, stemLength)) { String prefix = misspelled.substring(0, prefixLength); - result.add(prefix + root.word.substring(stripLength)); + result.add(prefix + root.word().substring(stripLength)); } }); @@ -263,7 +261,7 @@ private interface AffixProcessor { } private boolean hasCompatibleFlags(Root root, int affixId) { - if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) { + if (!dictionary.hasFlag(root.entryId(), dictionary.affixData(affixId, AFFIX_FLAG))) { return false; } @@ -447,28 +445,8 @@ private static int commonCharacterPositionScore(String s1, String s2) { return commonScore; } - private static class Weighted> implements Comparable> { - final T word; - final int score; - - Weighted(T word, int score) { - this.word = word; - this.score = score; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof Weighted)) return false; - @SuppressWarnings("unchecked") - Weighted that = (Weighted) o; - return score == that.score && word.equals(that.word); - } - - @Override - public int hashCode() { - return Objects.hash(word, score); - } + private record Weighted>(T word, int score) + implements Comparable> { @Override public String toString() { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java index f288ef90070b..8adb82f79b29 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java @@ -132,7 +132,7 @@ boolean checkWord(String word) { Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) { Root entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD); if (entry != null) { - return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword); + return !dictionary.hasFlag(entry.entryId(), dictionary.forbiddenword); } return null; @@ -229,7 +229,7 @@ private boolean checkCompounds(CharsRef word, WordCase originalCase, 
CompoundPar stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context); } if (stem != null - && !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword) + && !dictionary.hasFlag(stem.entryId(), dictionary.forbiddenword) && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) { CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null); if (checkCompoundsAfter(originalCase, part)) { @@ -274,7 +274,7 @@ private boolean checkCompoundsAfter(WordCase originalCase, CompoundPart prev) { Root lastRoot = findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); if (lastRoot != null - && !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword) + && !dictionary.hasFlag(lastRoot.entryId(), dictionary.forbiddenword) && !(dictionary.checkCompoundDup && prev.root.equals(lastRoot)) && !hasForceUCaseProblem(lastRoot, originalCase, word.chars) && prev.mayCompound(lastRoot, remainingLength, originalCase)) { @@ -288,7 +288,7 @@ private boolean checkCompoundsAfter(WordCase originalCase, CompoundPart prev) { private boolean hasForceUCaseProblem(Root root, WordCase originalCase, char[] wordChars) { if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false; if (originalCase == null && Character.isUpperCase(wordChars[0])) return false; - return dictionary.hasFlag(root.entryId, dictionary.forceUCase); + return dictionary.hasFlag(root.entryId(), dictionary.forceUCase); } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java index 90def4a40fe3..dd4ec68cf6c2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java @@ -17,8 +17,6 @@ package org.apache.lucene.analysis.hunspell; import java.io.IOException; -import java.util.Collections; -import java.util.Comparator; import java.util.List; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -117,7 +115,16 @@ public boolean incrementToken() throws IOException { } if (longestOnly && buffer.size() > 1) { - Collections.sort(buffer, lengthComparator); + buffer.sort( + (o1, o2) -> { + int cmp = Integer.compare(o2.length, o1.length); + if (cmp == 0) { + // tie break on text + return o2.compareTo(o1); + } else { + return cmp; + } + }); } CharsRef stem = buffer.remove(0); @@ -139,18 +146,4 @@ public void reset() throws IOException { super.reset(); buffer = null; } - - static final Comparator lengthComparator = - new Comparator() { - @Override - public int compare(CharsRef o1, CharsRef o2) { - int cmp = Integer.compare(o2.length, o1.length); - if (cmp == 0) { - // tie break on text - return o2.compareTo(o1); - } else { - return cmp; - } - } - }; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java index e65992e6f675..6c5ea7877c7f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java @@ -16,36 +16,13 @@ */ package org.apache.lucene.analysis.hunspell; -import java.util.Objects; - -class Root implements Comparable> { - final T word; - final int entryId; - - Root(T word, int entryId) { - this.word = word; - 
this.entryId = entryId; - } +record Root(T word, int entryId) implements Comparable> { @Override public String toString() { return word.toString(); } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof Root)) return false; - @SuppressWarnings("unchecked") - Root root = (Root) o; - return entryId == root.entryId && word.equals(root.word); - } - - @Override - public int hashCode() { - return Objects.hash(word, entryId); - } - @Override public int compareTo(Root o) { return CharSequence.compare(word, o.word); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java index af4dcc1e3f4f..0773c1706488 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.Comparator; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -147,26 +146,23 @@ private final boolean buildSingleOutputToken() throws IOException { Arrays.sort( items, - new Comparator() { - @Override - public int compare(Object o1, Object o2) { - char[] v1 = (char[]) o1; - char[] v2 = (char[]) o2; - int len1 = v1.length; - int len2 = v2.length; - int lim = Math.min(len1, len2); - - int k = 0; - while (k < lim) { - char c1 = v1[k]; - char c2 = v2[k]; - if (c1 != c2) { - return c1 - c2; - } - k++; + (o1, o2) -> { + char[] v1 = (char[]) o1; + char[] v2 = (char[]) o2; + int len1 = v1.length; + int len2 = v2.length; + int lim = Math.min(len1, len2); + + int k = 0; + while (k < lim) { + char c1 = v1[k]; + char c2 = v2[k]; + if (c1 != c2) { + return c1 - c2; } - return len1 - len2; + k++; } + return len1 - len2; }); // TODO lets append directly to termAttribute? 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java index 50275d681268..4a77405b9743 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java @@ -194,7 +194,7 @@ int type() { int type = charType(text[current]); switch (type) { - // return ALPHA word type for both lower and upper + // return ALPHA word type for both lower and upper case LOWER: case UPPER: return ALPHA; @@ -332,27 +332,27 @@ public static byte getType(int ch) { case Character.OTHER_NUMBER: return DIGIT; - // case Character.SPACE_SEPARATOR: - // case Character.LINE_SEPARATOR: - // case Character.PARAGRAPH_SEPARATOR: - // case Character.CONTROL: - // case Character.FORMAT: - // case Character.PRIVATE_USE: + // case Character.SPACE_SEPARATOR: + // case Character.LINE_SEPARATOR: + // case Character.PARAGRAPH_SEPARATOR: + // case Character.CONTROL: + // case Character.FORMAT: + // case Character.PRIVATE_USE: case Character.SURROGATE: // prevent splitting return ALPHA | DIGIT; - // case Character.DASH_PUNCTUATION: - // case Character.START_PUNCTUATION: - // case Character.END_PUNCTUATION: - // case Character.CONNECTOR_PUNCTUATION: - // case Character.OTHER_PUNCTUATION: - // case Character.MATH_SYMBOL: - // case Character.CURRENCY_SYMBOL: - // case Character.MODIFIER_SYMBOL: - // case Character.OTHER_SYMBOL: - // case Character.INITIAL_QUOTE_PUNCTUATION: - // case Character.FINAL_QUOTE_PUNCTUATION: + // case Character.DASH_PUNCTUATION: + // case Character.START_PUNCTUATION: + // case Character.END_PUNCTUATION: + // case Character.CONNECTOR_PUNCTUATION: + // case Character.OTHER_PUNCTUATION: + // case Character.MATH_SYMBOL: + // case Character.CURRENCY_SYMBOL: + // case Character.MODIFIER_SYMBOL: + // case Character.OTHER_SYMBOL: + // case Character.INITIAL_QUOTE_PUNCTUATION: + // case Character.FINAL_QUOTE_PUNCTUATION: default: return SUBWORD_DELIM; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilter.java index c0288ef7c892..565f605199ce 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilter.java @@ -59,12 +59,12 @@ public PatternTypingFilter(TokenStream input, PatternTypingRule... replacementAn public final boolean incrementToken() throws IOException { if (input.incrementToken()) { for (PatternTypingRule rule : replacementAndFlagByPattern) { - Matcher matcher = rule.getPattern().matcher(termAtt); + Matcher matcher = rule.pattern().matcher(termAtt); if (matcher.find()) { // allow 2nd reset() and find() that occurs inside replaceFirst to avoid excess string // creation - typeAtt.setType(matcher.replaceFirst(rule.getTypeTemplate())); - flagAtt.setFlags(rule.getFlags()); + typeAtt.setType(matcher.replaceFirst(rule.typeTemplate())); + flagAtt.setFlags(rule.flags()); return true; } } @@ -74,27 +74,5 @@ public final boolean incrementToken() throws IOException { } /** Value holding class for pattern typing rules. 
*/ - public static class PatternTypingRule { - private final Pattern pattern; - private final int flags; - private final String typeTemplate; - - public PatternTypingRule(Pattern pattern, int flags, String typeTemplate) { - this.pattern = pattern; - this.flags = flags; - this.typeTemplate = typeTemplate; - } - - public Pattern getPattern() { - return pattern; - } - - public int getFlags() { - return flags; - } - - public String getTypeTemplate() { - return typeTemplate; - } - } + public record PatternTypingRule(Pattern pattern, int flags, String typeTemplate) {} } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java index e291d8de3577..73a8aa9022e7 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java @@ -142,22 +142,10 @@ public void reset() { } } - static class BufferedOutputToken { - final String term; - - // Non-null if this was an incoming token: - final State state; - - final int startNode; - final int endNode; - - public BufferedOutputToken(State state, String term, int startNode, int endNode) { - this.state = state; - this.term = term; - this.startNode = startNode; - this.endNode = endNode; - } - } + /** + * @param state Non-null if this was an incoming token: + */ + record BufferedOutputToken(State state, String term, int startNode, int endNode) {} /** * Apply previously built synonyms to incoming tokens. diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/TermAndBoost.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/TermAndBoost.java index 349fa57c34a7..4688e57a3d07 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/TermAndBoost.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/TermAndBoost.java @@ -18,17 +18,15 @@ import org.apache.lucene.util.BytesRef; -/** Wraps a term and boost */ -public class TermAndBoost { - /** the term */ - public final BytesRef term; - - /** the boost */ - public final float boost; - +/** + * Wraps a term and boost + * + * @param term the term + * @param boost the boost + */ +public record TermAndBoost(BytesRef term, float boost) { /** Creates a new TermAndBoost */ - public TermAndBoost(BytesRef term, float boost) { - this.term = BytesRef.deepCopyOf(term); - this.boost = boost; + public TermAndBoost { + term = BytesRef.deepCopyOf(term); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java index f43d33ca2747..68fd3b5884b0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java @@ -18,10 +18,10 @@ package org.apache.lucene.analysis.synonym.word2vec; import java.io.IOException; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.TermAndVector; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * Word2VecModel is a class representing the parsed Word2Vec model containing the vectors for each @@ -29,7 
+29,7 @@ * * @lucene.experimental */ -public class Word2VecModel implements RandomAccessVectorValues.Floats { +public class Word2VecModel extends FloatVectorValues { private final int dictionarySize; private final int vectorDimension; @@ -56,25 +56,25 @@ private Word2VecModel( } public void addTermAndVector(TermAndVector modelEntry) { - modelEntry.normalizeVector(); + modelEntry = modelEntry.normalizeVector(); this.termsAndVectors[loadedCount++] = modelEntry; - this.word2Vec.add(modelEntry.getTerm()); + this.word2Vec.add(modelEntry.term()); } @Override public float[] vectorValue(int targetOrd) { - return termsAndVectors[targetOrd].getVector(); + return termsAndVectors[targetOrd].vector(); } public float[] vectorValue(BytesRef term) { int termOrd = this.word2Vec.find(term); if (termOrd < 0) return null; TermAndVector entry = this.termsAndVectors[termOrd]; - return (entry == null) ? null : entry.getVector(); + return (entry == null) ? null : entry.vector(); } public BytesRef termValue(int targetOrd) { - return termsAndVectors[targetOrd].getTerm(); + return termsAndVectors[targetOrd].term(); } @Override diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecSynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecSynonymFilter.java index a8db4c4c764a..357e97480cdd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecSynonymFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecSynonymFilter.java @@ -80,7 +80,7 @@ public boolean incrementToken() throws IOException { clearAttributes(); restoreState(this.lastState); termAtt.setEmpty(); - termAtt.append(synonym.term.utf8ToString()); + termAtt.append(synonym.term().utf8ToString()); typeAtt.setType(SynonymGraphFilter.TYPE_SYNONYM); posLenAtt.setPositionLength(1); posIncrementAtt.setPositionIncrement(0); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/te/TeluguNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/te/TeluguNormalizer.java index 66f30f79fc2a..0bbab021c113 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/te/TeluguNormalizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/te/TeluguNormalizer.java @@ -38,25 +38,25 @@ int normalize(char s[], int len) { for (int i = 0; i < len; i++) { switch (s[i]) { - // candrabindu (ఀ and ఁ) -> bindu (ం) + // candrabindu (ఀ and ఁ) -> bindu (ం) case '\u0C00': // ఀ case '\u0C01': // ఁ s[i] = '\u0C02'; // ం break; - // delete visarga (ః) + // delete visarga (ః) case '\u0C03': len = delete(s, i, len); i--; break; - // zwj/zwnj -> delete + // zwj/zwnj -> delete case '\u200D': case '\u200C': len = delete(s, i, len); i--; break; - // long -> short vowels + // long -> short vowels case '\u0C14': // ఔ s[i] = '\u0C13'; // ఓ break; @@ -73,7 +73,7 @@ int normalize(char s[], int len) { s[i] = '\u0C09'; // ఉ break; - // long -> short vowels matras + // long -> short vowels matras case '\u0C40': // ీ s[i] = '\u0C3F'; // ి break; @@ -86,14 +86,14 @@ int normalize(char s[], int len) { case '\u0C4B': // ో s[i] = '\u0C4A'; // ొ break; - // decomposed dipthong (ె + ౖ) -> precomposed diphthong vowel sign (ై) + // decomposed dipthong (ె + ౖ) -> precomposed diphthong vowel sign (ై) case '\u0C46': if (i + 1 < len && s[i + 1] == '\u0C56') { s[i] = '\u0C48'; len = delete(s, i + 1, len); } break; - // composed oo or au -> oo or au + // composed oo or au -> oo or 
au case '\u0C12': if (i + 1 < len && s[i + 1] == '\u0C55') { // (ఒ + ౕ) -> oo (ఓ) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java index f1c73df4e687..4b900f172d99 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java @@ -61,12 +61,12 @@ public final boolean incrementToken() throws IOException { if (iOrAfter) { // all the special I turkish handling happens here. switch (ch) { - // remove COMBINING_DOT_ABOVE to mimic composed lowercase + // remove COMBINING_DOT_ABOVE to mimic composed lowercase case COMBINING_DOT_ABOVE: length = delete(buffer, i, length); continue; - // i itself, it depends if it is followed by COMBINING_DOT_ABOVE - // if it is, we will make it small i and later remove the dot + // i itself, it depends if it is followed by COMBINING_DOT_ABOVE + // if it is, we will make it small i and later remove the dot case LATIN_CAPITAL_LETTER_I: if (isBeforeDot(buffer, i + 1, length)) { buffer[i] = LATIN_SMALL_LETTER_I; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java index f2be4f59973c..553a0c24292b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java @@ -901,7 +901,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 47: break; case 2: @@ -909,7 +909,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; return ALPHANUM; } - // fall through + // fall through case 48: break; case 3: @@ -920,7 +920,7 @@ public int getNextToken() throws java.io.IOException { yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 49: break; case 4: @@ -928,7 +928,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; return CJ; } - // fall through + // fall through case 50: break; case 5: @@ -936,7 +936,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 51: break; case 6: @@ -945,7 +945,7 @@ public int getNextToken() throws java.io.IOException { numWikiTokensSeen++; return currentTokType; } - // fall through + // fall through case 52: break; case 7: @@ -954,7 +954,7 @@ public int getNextToken() throws java.io.IOException { numWikiTokensSeen++; return currentTokType; } - // fall through + // fall through case 53: break; case 8: @@ -962,7 +962,7 @@ public int getNextToken() throws java.io.IOException { /* Break so we don't hit fall-through warning: */ break; /* ignore */ } - // fall through + // fall through case 54: break; case 9: @@ -978,7 +978,7 @@ public int getNextToken() throws java.io.IOException { numLinkToks++; return currentTokType; } - // fall through + // fall through case 55: break; case 10: @@ -988,7 +988,7 @@ public int getNextToken() throws java.io.IOException { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: 
*/ break; } - // fall through + // fall through case 56: break; case 11: @@ -997,7 +997,7 @@ public int getNextToken() throws java.io.IOException { yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 57: break; case 12: @@ -1007,7 +1007,7 @@ public int getNextToken() throws java.io.IOException { yybegin(STRING); return currentTokType; /*italics*/ } - // fall through + // fall through case 58: break; case 13: @@ -1017,7 +1017,7 @@ public int getNextToken() throws java.io.IOException { yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 59: break; case 14: @@ -1026,7 +1026,7 @@ public int getNextToken() throws java.io.IOException { numWikiTokensSeen++; return currentTokType; } - // fall through + // fall through case 60: break; case 15: @@ -1036,7 +1036,7 @@ public int getNextToken() throws java.io.IOException { numWikiTokensSeen++; return currentTokType; } - // fall through + // fall through case 61: break; case 16: @@ -1046,7 +1046,7 @@ public int getNextToken() throws java.io.IOException { yybegin(STRING); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 62: break; case 17: @@ -1055,7 +1055,7 @@ public int getNextToken() throws java.io.IOException { numWikiTokensSeen = 0; return currentTokType; } - // fall through + // fall through case 63: break; case 18: @@ -1063,7 +1063,7 @@ public int getNextToken() throws java.io.IOException { /* Break so we don't hit fall-through warning: */ break; /* ignore STRING */ } - // fall through + // fall through case 64: break; case 19: @@ -1072,7 +1072,7 @@ public int getNextToken() throws java.io.IOException { numWikiTokensSeen++; return currentTokType; /* STRING ALPHANUM*/ } - // fall through + // fall through case 65: break; case 20: @@ -1083,7 +1083,7 @@ public int getNextToken() throws java.io.IOException { yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 66: break; case 21: @@ -1091,7 +1091,7 @@ public int getNextToken() throws java.io.IOException { yybegin(STRING); return currentTokType; /*pipe*/ } - // fall through + // fall through case 67: break; case 22: @@ -1106,7 +1106,7 @@ public int getNextToken() throws java.io.IOException { } /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 68: break; case 23: @@ -1116,7 +1116,7 @@ public int getNextToken() throws java.io.IOException { yybegin(DOUBLE_EQUALS_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 69: break; case 24: @@ -1127,7 +1127,7 @@ public int getNextToken() throws java.io.IOException { yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 70: break; case 25: @@ -1138,7 +1138,7 @@ public int getNextToken() throws java.io.IOException { yybegin(DOUBLE_BRACE_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 71: break; case 26: @@ -1146,7 +1146,7 @@ public int getNextToken() throws java.io.IOException { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 72: break; case 27: @@ -1155,7 +1155,7 @@ public int getNextToken() throws java.io.IOException { yybegin(YYINITIAL); /* Break so we don't hit 
fall-through warning: */ break; } - // fall through + // fall through case 73: break; case 28: @@ -1165,7 +1165,7 @@ public int getNextToken() throws java.io.IOException { yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 74: break; case 29: @@ -1175,7 +1175,7 @@ public int getNextToken() throws java.io.IOException { yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 75: break; case 30: @@ -1183,7 +1183,7 @@ public int getNextToken() throws java.io.IOException { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 76: break; case 31: @@ -1193,7 +1193,7 @@ public int getNextToken() throws java.io.IOException { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break; /*end italics*/ } - // fall through + // fall through case 77: break; case 32: @@ -1204,7 +1204,7 @@ public int getNextToken() throws java.io.IOException { yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 78: break; case 33: @@ -1212,7 +1212,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; return NUM; } - // fall through + // fall through case 79: break; case 34: @@ -1220,7 +1220,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; return COMPANY; } - // fall through + // fall through case 80: break; case 35: @@ -1228,7 +1228,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; return APOSTROPHE; } - // fall through + // fall through case 81: break; case 36: @@ -1236,7 +1236,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; return HOST; } - // fall through + // fall through case 82: break; case 37: @@ -1245,7 +1245,7 @@ public int getNextToken() throws java.io.IOException { yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 83: break; case 38: @@ -1255,7 +1255,7 @@ public int getNextToken() throws java.io.IOException { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break; /*end bold*/ } - // fall through + // fall through case 84: break; case 39: @@ -1265,7 +1265,7 @@ public int getNextToken() throws java.io.IOException { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break; /*end sub header*/ } - // fall through + // fall through case 85: break; case 40: @@ -1273,7 +1273,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; return ACRONYM; } - // fall through + // fall through case 86: break; case 41: @@ -1281,7 +1281,7 @@ public int getNextToken() throws java.io.IOException { positionInc = 1; return EMAIL; } - // fall through + // fall through case 87: break; case 42: @@ -1291,7 +1291,7 @@ public int getNextToken() throws java.io.IOException { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break; /*end bold italics*/ } - // fall through + // fall through case 88: break; case 43: @@ -1301,7 +1301,7 @@ public int getNextToken() throws java.io.IOException { yybegin(EXTERNAL_LINK_STATE); return currentTokType; } - // fall through + // fall through case 89: break; case 44: @@ -1312,7 +1312,7 @@ public int getNextToken() throws java.io.IOException { yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break; } - // 
fall through + // fall through case 90: break; case 45: @@ -1322,7 +1322,7 @@ public int getNextToken() throws java.io.IOException { yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 91: break; case 46: @@ -1333,7 +1333,7 @@ public int getNextToken() throws java.io.IOException { yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break; } - // fall through + // fall through case 92: break; default: diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java index ae2bcfdfd480..72aa96ed2a9d 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java @@ -49,7 +49,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { @SuppressWarnings("deprecation") - private static final Version LUCENE_9_0_0 = Version.LUCENE_9_0_0; + private static final Version LUCENE_10_0_0 = Version.LUCENE_10_0_0; // Test some examples (TODO: we only check behavior, we may need something like // TestRandomChains...) @@ -111,7 +111,7 @@ public void testWhitespaceWithFolding() throws Exception { public void testVersionAwareFilter() throws Exception { CustomAnalyzer a = CustomAnalyzer.builder() - .withDefaultMatchVersion(LUCENE_9_0_0) + .withDefaultMatchVersion(LUCENE_10_0_0) .withTokenizer(StandardTokenizerFactory.class) .addTokenFilter(DummyVersionAwareTokenFilterFactory.class) .build(); @@ -128,7 +128,7 @@ public void testVersionAwareFilter() throws Exception { public void testFactoryHtmlStripClassicFolding() throws Exception { CustomAnalyzer a = CustomAnalyzer.builder() - .withDefaultMatchVersion(LUCENE_9_0_0) + .withDefaultMatchVersion(LUCENE_10_0_0) .addCharFilter(HTMLStripCharFilterFactory.class) .withTokenizer(ClassicTokenizerFactory.class) .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true") @@ -164,7 +164,7 @@ public void testFactoryHtmlStripClassicFolding() throws Exception { public void testHtmlStripClassicFolding() throws Exception { CustomAnalyzer a = CustomAnalyzer.builder() - .withDefaultMatchVersion(LUCENE_9_0_0) + .withDefaultMatchVersion(LUCENE_10_0_0) .addCharFilter("htmlstrip") .withTokenizer("classic") .addTokenFilter("asciifolding", "preserveOriginal", "true") @@ -513,7 +513,7 @@ public DummyVersionAwareTokenFilterFactory(Map args) { @Override public TokenStream create(TokenStream input) { - if (luceneMatchVersion.equals(LUCENE_9_0_0)) { + if (luceneMatchVersion.equals(LUCENE_10_0_0)) { return input; } return new LowerCaseFilter(input); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java index 1eb80ea5081e..974edc8e9f6f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java @@ -1490,7 +1490,7 @@ public void testRandomSyns() throws Exception { } assertTrue(approxEquals(actual, expected)); - assertTrue(Operations.sameLanguage(actual, expected)); + assertTrue(AutomatonTestUtil.sameLanguage(actual, expected)); } a.close(); diff --git 
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/word2vec/TestWord2VecSynonymProvider.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/word2vec/TestWord2VecSynonymProvider.java index 3e7e6bce07a3..1d36d82a3022 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/word2vec/TestWord2VecSynonymProvider.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/word2vec/TestWord2VecSynonymProvider.java @@ -64,7 +64,7 @@ public void getSynonyms_shouldReturnSynonymsBasedOnMinAcceptedSimilarity() throw assertEquals(4, actualSynonymsResults.size()); for (int i = 0; i < expectedSynonyms.length; i++) { - assertEquals(new BytesRef(expectedSynonyms[i]), actualSynonymsResults.get(i).term); + assertEquals(new BytesRef(expectedSynonyms[i]), actualSynonymsResults.get(i).term()); } } @@ -83,8 +83,8 @@ public void getSynonyms_shouldReturnSynonymsBoost() throws Exception { BytesRef expectedFirstSynonymTerm = new BytesRef("b"); double expectedFirstSynonymBoost = 1.0; - assertEquals(expectedFirstSynonymTerm, actualSynonymsResults.get(0).term); - assertEquals(expectedFirstSynonymBoost, actualSynonymsResults.get(0).boost, 0.001f); + assertEquals(expectedFirstSynonymTerm, actualSynonymsResults.get(0).term()); + assertEquals(expectedFirstSynonymBoost, actualSynonymsResults.get(0).boost(), 0.001f); } @Test @@ -120,8 +120,8 @@ public void testModel_shouldReturnNormalizedVectors() { @Test public void normalizedVector_shouldReturnModule1() { TermAndVector synonymTerm = new TermAndVector(new BytesRef("a"), new float[] {10, 10}); - synonymTerm.normalizeVector(); - float[] vector = synonymTerm.getVector(); + synonymTerm = synonymTerm.normalizeVector(); + float[] vector = synonymTerm.vector(); float len = 0; for (int i = 0; i < vector.length; i++) { len += vector[i] * vector[i]; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java index 4a22cce3f12c..310a143a52e4 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java @@ -139,19 +139,7 @@ private void mayIncrementToken() throws IOException { } } - private static class CompletionToken { - final String term; - final boolean isFirst; - final int startOffset; - final int endOffset; - - CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) { - this.term = term; - this.isFirst = isFirst; - this.startOffset = startOffset; - this.endOffset = endOffset; - } - } + private record CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {} private static class CompletionTokenGenerator implements Iterator { diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/completion/KatakanaRomanizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/completion/KatakanaRomanizer.java index 8e137a8af3f4..6e2e4f0268da 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/completion/KatakanaRomanizer.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/completion/KatakanaRomanizer.java @@ -180,13 +180,5 @@ private MatchedKeystroke longestKeystrokeMatch(CharsRef input, int inputOffset) return null; } - private static class MatchedKeystroke { - final int keystrokeLen; - final int 
keystrokeIndex; - - MatchedKeystroke(int keystrokeLen, int keystrokeIndex) { - this.keystrokeLen = keystrokeLen; - this.keystrokeIndex = keystrokeIndex; - } - } + private record MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {} } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index 391ed2ba44b8..4214b72f57f9 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -20,8 +20,6 @@ import java.io.IOException; import java.io.Reader; import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; import java.util.List; import java.util.regex.Pattern; import org.apache.lucene.analysis.morph.Dictionary; @@ -83,14 +81,7 @@ private UserDictionary(List featureEntries) throws IOException { // TODO: should we allow multiple segmentations per input 'phrase'? // the old treemap didn't support this either, and i'm not sure if it's needed/useful? - Collections.sort( - featureEntries, - new Comparator() { - @Override - public int compare(String[] left, String[] right) { - return left[0].compareTo(right[0]); - } - }); + featureEntries.sort((left, right) -> left[0].compareTo(right[0])); List data = new ArrayList<>(featureEntries.size()); List segmentations = new ArrayList<>(featureEntries.size()); diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java index 9f7765eaadf7..8ba9cb36979f 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java @@ -268,19 +268,19 @@ protected void backtrace(Position endPosData, int fromIDX) { final KoMorphData.Morpheme morpheme = morphemes[i]; final Token compoundToken; if (token.getPOSType() == POS.Type.COMPOUND) { - assert endOffset - morpheme.surfaceForm.length() >= 0; + assert endOffset - morpheme.surfaceForm().length() >= 0; compoundToken = new DecompoundToken( - morpheme.posTag, - morpheme.surfaceForm, - endOffset - morpheme.surfaceForm.length(), + morpheme.posTag(), + morpheme.surfaceForm(), + endOffset - morpheme.surfaceForm().length(), endOffset, backType); } else { compoundToken = new DecompoundToken( - morpheme.posTag, - morpheme.surfaceForm, + morpheme.posTag(), + morpheme.surfaceForm(), token.getStartOffset(), token.getEndOffset(), backType); @@ -289,7 +289,7 @@ protected void backtrace(Position endPosData, int fromIDX) { compoundToken.setPositionIncrement(0); } ++posLen; - endOffset -= morpheme.surfaceForm.length(); + endOffset -= morpheme.surfaceForm().length(); pending.add(compoundToken); if (VERBOSE) { System.out.println(" add token=" + pending.get(pending.size() - 1)); diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java index 0887a7f0c428..2b2c53a4d6db 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java @@ -22,15 +22,7 @@ /** Represents Korean morphological information. */ public interface KoMorphData extends MorphData { /** A morpheme extracted from a compound token. 
*/ - class Morpheme { - public final POS.Tag posTag; - public final String surfaceForm; - - public Morpheme(POS.Tag posTag, String surfaceForm) { - this.posTag = posTag; - this.surfaceForm = surfaceForm; - } - } + record Morpheme(POS.Tag posTag, String surfaceForm) {} /** * Get the {@link org.apache.lucene.analysis.ko.POS.Type} of specified word (morpheme, compound, diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java index 95ce0277a9d5..8b1e91b132ac 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java @@ -150,13 +150,13 @@ protected int putEntry(String[] entry) { int compoundOffset = 0; for (KoMorphData.Morpheme morpheme : morphemes) { if (hasSinglePOS == false) { - buffer.put((byte) morpheme.posTag.ordinal()); + buffer.put((byte) morpheme.posTag().ordinal()); } if (posType != POS.Type.INFLECT) { - buffer.put((byte) morpheme.surfaceForm.length()); - compoundOffset += morpheme.surfaceForm.length(); + buffer.put((byte) morpheme.surfaceForm().length()); + compoundOffset += morpheme.surfaceForm().length(); } else { - writeString(morpheme.surfaceForm); + writeString(morpheme.surfaceForm()); } assert compoundOffset <= entry[0].length() : Arrays.toString(entry); } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttributeImpl.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttributeImpl.java index a59999db557f..8ca26387e0b6 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttributeImpl.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/PartOfSpeechAttributeImpl.java @@ -86,11 +86,11 @@ private String displayMorphemes(KoMorphData.Morpheme[] morphemes) { builder.append("+"); } builder - .append(morpheme.surfaceForm) + .append(morpheme.surfaceForm()) .append('/') - .append(morpheme.posTag.name()) + .append(morpheme.posTag().name()) .append('(') - .append(morpheme.posTag.description()) + .append(morpheme.posTag().description()) .append(')'); } return builder.toString(); diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java index 7c6b282eecf2..0c574a70fde5 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java @@ -170,14 +170,14 @@ public void testEnumerateAll() throws Exception { if (decompound != null) { int offset = 0; for (KoMorphData.Morpheme morph : decompound) { - assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm)); - assertFalse(morph.surfaceForm.isEmpty()); - assertEquals(morph.surfaceForm.trim(), morph.surfaceForm); + assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm())); + assertFalse(morph.surfaceForm().isEmpty()); + assertEquals(morph.surfaceForm().trim(), morph.surfaceForm()); if (type != POS.Type.INFLECT) { assertEquals( - morph.surfaceForm, - surfaceForm.substring(offset, offset + morph.surfaceForm.length())); - offset += morph.surfaceForm.length(); + 
morph.surfaceForm(), + surfaceForm.substring(offset, offset + morph.surfaceForm().length())); + offset += morph.surfaceForm().length(); } } assertTrue(offset <= surfaceForm.length()); diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUserDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUserDictionary.java index 8fb2827a336b..7ce5dd805638 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUserDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUserDictionary.java @@ -43,10 +43,10 @@ public void testLookup() throws IOException { dictionary.getMorphAttributes().getMorphemes(wordIds.get(1), sArray, 0, s.length()); assertNotNull(decompound); assertEquals(2, decompound.length); - assertEquals(decompound[0].posTag, POS.Tag.NNG); - assertEquals(decompound[0].surfaceForm, "세종"); - assertEquals(decompound[1].posTag, POS.Tag.NNG); - assertEquals(decompound[1].surfaceForm, "시"); + assertEquals(decompound[0].posTag(), POS.Tag.NNG); + assertEquals(decompound[0].surfaceForm(), "세종"); + assertEquals(decompound[1].posTag(), POS.Tag.NNG); + assertEquals(decompound[1].surfaceForm(), "시"); s = "c++"; sArray = s.toCharArray(); diff --git a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Diff.java b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Diff.java index 9f12b76c6dbf..8a92e6b7d368 100644 --- a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Diff.java +++ b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Diff.java @@ -245,7 +245,7 @@ public synchronized String exec(String a, String b) { deletes++; x--; break; - // delete + // delete case Y: if (deletes != base) { result.append('D').append(deletes); @@ -258,7 +258,7 @@ public synchronized String exec(String a, String b) { result.append('I'); result.append(b.charAt(--y)); break; - // insert + // insert case R: if (deletes != base) { result.append('D').append(deletes); @@ -272,7 +272,7 @@ public synchronized String exec(String a, String b) { result.append(b.charAt(--y)); x--; break; - // replace + // replace case D: if (deletes != base) { result.append('D').append(deletes); diff --git a/lucene/backward-codecs/src/generated/checksums/generateForDeltaUtil912.json b/lucene/backward-codecs/src/generated/checksums/generateForDeltaUtil912.json new file mode 100644 index 000000000000..c8c19f1c3b50 --- /dev/null +++ b/lucene/backward-codecs/src/generated/checksums/generateForDeltaUtil912.json @@ -0,0 +1,4 @@ +{ + "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForDeltaUtil.java": "b81961f0b277b1458ca259e0d23ccc4eeeb47fe7", + "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForDeltaUtil.py": "3191d7591309b7876c5c709fb9375af5b87c2ef8" +} \ No newline at end of file diff --git a/lucene/backward-codecs/src/generated/checksums/generateForUtil912.json b/lucene/backward-codecs/src/generated/checksums/generateForUtil912.json new file mode 100644 index 000000000000..e8c1881a925d --- /dev/null +++ b/lucene/backward-codecs/src/generated/checksums/generateForUtil912.json @@ -0,0 +1,4 @@ +{ + "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForUtil.java": "e6db3c665dfebca8b93eb6b4651d2eb3af637b02", + "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForUtil.py": "993ecc9cf7ea821963384070669695257b16e040" +} \ No newline at end of file diff --git a/lucene/backward-codecs/src/java/module-info.java 
b/lucene/backward-codecs/src/java/module-info.java index fbc2cdba98ee..41057c95bbf3 100644 --- a/lucene/backward-codecs/src/java/module-info.java +++ b/lucene/backward-codecs/src/java/module-info.java @@ -36,6 +36,8 @@ exports org.apache.lucene.backward_codecs.lucene94; exports org.apache.lucene.backward_codecs.lucene95; exports org.apache.lucene.backward_codecs.lucene99; + exports org.apache.lucene.backward_codecs.lucene912; + exports org.apache.lucene.backward_codecs.lucene100; exports org.apache.lucene.backward_codecs.packed; exports org.apache.lucene.backward_codecs.store; @@ -45,7 +47,8 @@ org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat, org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat, org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat, - org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat; + org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat, + org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat, org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat, @@ -62,5 +65,7 @@ org.apache.lucene.backward_codecs.lucene92.Lucene92Codec, org.apache.lucene.backward_codecs.lucene94.Lucene94Codec, org.apache.lucene.backward_codecs.lucene95.Lucene95Codec, - org.apache.lucene.backward_codecs.lucene99.Lucene99Codec; + org.apache.lucene.backward_codecs.lucene99.Lucene99Codec, + org.apache.lucene.backward_codecs.lucene912.Lucene912Codec, + org.apache.lucene.backward_codecs.lucene100.Lucene100Codec; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene100/Lucene100Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene100/Lucene100Codec.java new file mode 100644 index 000000000000..14ca88a98a51 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene100/Lucene100Codec.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.backward_codecs.lucene100; + +import java.util.Objects; +import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * Implements the Lucene 10.0 index format + * + *

If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. + * + * @see org.apache.lucene.backward_codecs.lucene100 package documentation for file format details. + * @lucene.experimental + */ +public class Lucene100Codec extends Codec { + + /** Configuration option for the codec. */ + public enum Mode { + /** Trade compression ratio for retrieval speed. */ + BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED), + /** Trade retrieval speed for compression ratio. */ + BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION); + + private final Lucene90StoredFieldsFormat.Mode storedMode; + + private Mode(Lucene90StoredFieldsFormat.Mode storedMode) { + this.storedMode = Objects.requireNonNull(storedMode); + } + } + + private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene90CompoundFormat(); + private final NormsFormat normsFormat = new Lucene90NormsFormat(); + + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = + new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene100Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = + new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene100Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = + new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Lucene100Codec.this.getKnnVectorsFormatForField(field); + } + }; + + private final StoredFieldsFormat storedFieldsFormat; + + /** Instantiates a new codec. */ + public Lucene100Codec() { + this(Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression mode to use. + * + * @param mode stored fields compression mode to use for newly flushed/merged segments. 
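The Mode enum above simply forwards to Lucene90StoredFieldsFormat's compression trade-off, so the choice is made once when the codec is constructed. A minimal construction sketch, not part of the patch (in backward-codecs this codec mainly serves reading and testing of existing 10.0 indexes):

  Codec defaults = new Lucene100Codec();                                      // Mode.BEST_SPEED
  Codec compact = new Lucene100Codec(Lucene100Codec.Mode.BEST_COMPRESSION);   // smaller stored fields, slower retrieval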
+ */ + public Lucene100Codec(Mode mode) { + super("Lucene100"); + this.storedFieldsFormat = + new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); + this.defaultPostingsFormat = new Lucene912PostingsFormat(); + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene90PointsFormat(); + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

The default implementation always returns "Lucene912". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field + * . + + *
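Because postingsFormat() delegates through PerFieldPostingsFormat, per-field customization happens by subclassing and overriding getPostingsFormatForField, subject to the backward-compatibility caveat above. A hedged sketch, field name and routing purely hypothetical:

  Codec codec =
      new Lucene100Codec(Lucene100Codec.Mode.BEST_SPEED) {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
          // route one hypothetical field to an explicitly named format, everything else to the default
          return "id".equals(field)
              ? PostingsFormat.forName("Lucene912")
              : super.getPostingsFormatForField(field);
        }
      };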

The default implementation always returns "Lucene90". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field + * + *

The default implementation always returns "Lucene99HnswVectorsFormat". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene100/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene100/package-info.java new file mode 100644 index 000000000000..dd2af3acdbb7 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene100/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Lucene 10.0 file format. */ +package org.apache.lucene.backward_codecs.lucene100; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/ForUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/ForUtil.java index 2f1f24dcd49a..b61310496942 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/ForUtil.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/ForUtil.java @@ -103,19 +103,20 @@ private static int encodedSize( for (int bpv = 1; bpv <= 32; ++bpv) { final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(BLOCK_SIZE, bpv, acceptableOverheadRatio); - assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue); - assert formatAndBits.bitsPerValue <= 32; + assert formatAndBits.format().isSupported(formatAndBits.bitsPerValue()); + assert formatAndBits.bitsPerValue() <= 32; encodedSizes[bpv] = - encodedSize(formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + encodedSize( + formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue()); encoders[bpv] = PackedInts.getEncoder( - formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue()); decoders[bpv] = PackedInts.getDecoder( - formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue()); iterations[bpv] = computeIterations(decoders[bpv]); - out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1)); + out.writeVInt(formatAndBits.format().getId() << 5 | (formatAndBits.bitsPerValue() - 1)); } } diff --git
a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundFormat.java index d473efa14a4d..ca697b2948b1 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundFormat.java @@ -77,9 +77,8 @@ public final class Lucene50CompoundFormat extends CompoundFormat { public Lucene50CompoundFormat() {} @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { - return new Lucene50CompoundReader(dir, si, context); + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { + return new Lucene50CompoundReader(dir, si); } @Override diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundReader.java index d833ba7b3422..8083a2de7d3f 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundReader.java @@ -31,6 +31,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.IOUtils; @@ -57,8 +58,7 @@ public static final class FileEntry { /** Create a new CompoundFileDirectory. */ // TODO: we should just pre-strip "entries" and append segment name up-front like simpletext? // this need not be a "general purpose" directory anymore (it only writes index files) - public Lucene50CompoundReader(Directory directory, SegmentInfo si, IOContext context) - throws IOException { + public Lucene50CompoundReader(Directory directory, SegmentInfo si) throws IOException { this.directory = directory; this.segmentName = si.name; String dataFileName = @@ -74,7 +74,7 @@ public Lucene50CompoundReader(Directory directory, SegmentInfo si, IOContext con } expectedLength += CodecUtil.footerLength(); - handle = directory.openInput(dataFileName, context); + handle = directory.openInput(dataFileName, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL)); // DirectoryUtil.openInput(directory, dataFileName, context); try { CodecUtil.checkIndexHeader( @@ -170,7 +170,7 @@ public IndexInput openInput(String name, IOContext context) throws IOException { + entries.keySet() + ")"); } - return handle.slice(name, entry.offset, entry.length); + return handle.slice(name, entry.offset, entry.length, context.readAdvice()); } /** Returns an array of strings, one for each file in the directory. 
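A condensed view of the open-then-slice pattern the compound reader now uses, restated from the hunk above rather than newly introduced: the single data file is opened once with an explicit read advice, and each virtual file is later served as a slice that carries the caller's advice.

  IndexInput handle = directory.openInput(dataFileName, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL));
  IndexInput sub = handle.slice(name, entry.offset, entry.length, context.readAdvice()); // per-file read advice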
*/ diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java index 9f7f0b83e416..4e0d7c8e53e6 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -209,7 +210,7 @@ private FieldInfo[] readFieldInfos(IndexInput input, int version) throws IOExcep storePayloads, indexOptions, docValuesType, - false, + DocValuesSkipIndexType.NONE, dvGen, attributes, pointDataDimensionCount, @@ -347,7 +348,7 @@ public void write( output.writeVInt(fi.number); byte bits = 0x0; - if (fi.hasVectors()) bits |= STORE_TERMVECTOR; + if (fi.hasTermVectors()) bits |= STORE_TERMVECTOR; if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java index c5754e5d1e52..211267d4c031 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java @@ -17,8 +17,6 @@ package org.apache.lucene.backward_codecs.lucene80; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicReader; import org.apache.lucene.backward_codecs.packed.LegacyDirectReader; import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil; @@ -41,6 +39,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -53,11 +52,11 @@ /** reader for {@link Lucene80DocValuesFormat} */ final class Lucene80DocValuesProducer extends DocValuesProducer { - private final Map numerics = new HashMap<>(); - private final Map binaries = new HashMap<>(); - private final Map sorted = new HashMap<>(); - private final Map sortedSets = new HashMap<>(); - private final Map sortedNumerics = new HashMap<>(); + private final IntObjectHashMap numerics = new IntObjectHashMap<>(); + private final IntObjectHashMap binaries = new IntObjectHashMap<>(); + private final IntObjectHashMap sorted = new IntObjectHashMap<>(); + private final IntObjectHashMap sortedSets = new IntObjectHashMap<>(); + private final IntObjectHashMap sortedNumerics = new IntObjectHashMap<>(); private final IndexInput data; private final int maxDoc; private int version = -1; @@ -139,7 +138,7 @@ private void readFields(String segmentName, IndexInput meta, FieldInfos infos) } 
byte type = meta.readByte(); if (type == Lucene80DocValuesFormat.NUMERIC) { - numerics.put(info.name, readNumeric(meta)); + numerics.put(info.number, readNumeric(meta)); } else if (type == Lucene80DocValuesFormat.BINARY) { final boolean compressed; if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) { @@ -158,13 +157,13 @@ private void readFields(String segmentName, IndexInput meta, FieldInfos infos) } else { compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED; } - binaries.put(info.name, readBinary(meta, compressed)); + binaries.put(info.number, readBinary(meta, compressed)); } else if (type == Lucene80DocValuesFormat.SORTED) { - sorted.put(info.name, readSorted(meta)); + sorted.put(info.number, readSorted(meta)); } else if (type == Lucene80DocValuesFormat.SORTED_SET) { - sortedSets.put(info.name, readSortedSet(meta)); + sortedSets.put(info.number, readSortedSet(meta)); } else if (type == Lucene80DocValuesFormat.SORTED_NUMERIC) { - sortedNumerics.put(info.name, readSortedNumeric(meta)); + sortedNumerics.put(info.number, readSortedNumeric(meta)); } else { throw new CorruptIndexException("invalid type: " + type, meta); } @@ -426,7 +425,7 @@ private static class SortedNumericEntry extends NumericEntry { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { - NumericEntry entry = numerics.get(field.name); + NumericEntry entry = numerics.get(field.number); return getNumeric(entry); } @@ -915,7 +914,7 @@ BytesRef decode(int docNumber) throws IOException { @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - BinaryEntry entry = binaries.get(field.name); + BinaryEntry entry = binaries.get(field.number); if (entry.compressed) { return getCompressedBinary(entry); } else { @@ -973,7 +972,7 @@ public BytesRef binaryValue() throws IOException { @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { - SortedEntry entry = sorted.get(field.name); + SortedEntry entry = sorted.get(field.number); return getSorted(entry); } @@ -1407,7 +1406,7 @@ public int docFreq() throws IOException { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - SortedNumericEntry entry = sortedNumerics.get(field.name); + SortedNumericEntry entry = sortedNumerics.get(field.number); if (entry.numValues == entry.numDocsWithField) { return DocValues.singleton(getNumeric(entry)); } @@ -1543,7 +1542,7 @@ private void set() { @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - SortedSetEntry entry = sortedSets.get(field.name); + SortedSetEntry entry = sortedSets.get(field.number); if (entry.singleValueEntry != null) { return DocValues.singleton(getSorted(entry.singleValueEntry)); } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java index c4e3dca2873c..65c7864fa9dd 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java @@ -23,6 +23,7 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import 
org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -186,7 +187,7 @@ public FieldInfos read( storePayloads, indexOptions, docValuesType, - false, + DocValuesSkipIndexType.NONE, dvGen, attributes, pointDataDimensionCount, @@ -333,7 +334,7 @@ public void write( output.writeVInt(fi.number); byte bits = 0x0; - if (fi.hasVectors()) bits |= STORE_TERMVECTOR; + if (fi.hasTermVectors()) bits |= STORE_TERMVECTOR; if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java index 52972e9dcda4..0d7fd520a303 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java @@ -22,10 +22,10 @@ import java.util.Objects; import java.util.SplittableRandom; import java.util.concurrent.TimeUnit; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * Builder for HNSW graph. See {@link Lucene90OnHeapHnswGraph} for a gloss on the algorithm and the @@ -49,7 +49,7 @@ public final class Lucene90HnswGraphBuilder { private final Lucene90NeighborArray scratch; private final VectorSimilarityFunction similarityFunction; - private final RandomAccessVectorValues.Floats vectorValues; + private final FloatVectorValues vectorValues; private final SplittableRandom random; private final Lucene90BoundsChecker bound; final Lucene90OnHeapHnswGraph hnsw; @@ -58,7 +58,7 @@ public final class Lucene90HnswGraphBuilder { // we need two sources of vectors in order to perform diversity check comparisons without // colliding - private final RandomAccessVectorValues.Floats buildVectors; + private final FloatVectorValues buildVectors; /** * Reads all the vectors from vector values, builds a graph connecting them by their dense @@ -73,7 +73,7 @@ public final class Lucene90HnswGraphBuilder { * to ensure repeatable construction. */ public Lucene90HnswGraphBuilder( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction, int maxConn, int beamWidth, @@ -97,14 +97,14 @@ public Lucene90HnswGraphBuilder( } /** - * Reads all the vectors from two copies of a {@link RandomAccessVectorValues}. Providing two - * copies enables efficient retrieval without extra data copying, while avoiding collision of the + * Reads all the vectors from two copies of a {@link FloatVectorValues}. Providing two copies + * enables efficient retrieval without extra data copying, while avoiding collision of the * returned values. * * @param vectors the vectors for which to build a nearest neighbors graph. 
Must be an independet * accessor for the vectors */ - public Lucene90OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException { + public Lucene90OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException { if (vectors == vectorValues) { throw new IllegalArgumentException( "Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()"); @@ -230,7 +230,7 @@ private boolean diversityCheck( float[] candidate, float score, Lucene90NeighborArray neighbors, - RandomAccessVectorValues.Floats vectorValues) + FloatVectorValues vectorValues) throws IOException { bound.set(score); for (int i = 0; i < neighbors.size(); i++) { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java index ab2486f4518b..015fad7490ce 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java @@ -20,9 +20,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; import java.util.SplittableRandom; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; @@ -34,7 +31,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.ChecksumIndexInput; @@ -44,7 +41,6 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * Reads vectors from the index segments along with index data structures supporting KNN search. 
@@ -53,14 +49,16 @@ */ public final class Lucene90HnswVectorsReader extends KnnVectorsReader { - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final IndexInput vectorData; private final IndexInput vectorIndex; private final long checksumSeed; + private final FieldInfos fieldInfos; Lucene90HnswVectorsReader(SegmentReadState state) throws IOException { int versionMeta = readMetadata(state); long[] checksumRef = new long[1]; + this.fieldInfos = state.fieldInfos; boolean success = false; try { vectorData = @@ -161,7 +159,7 @@ private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOExce FieldEntry fieldEntry = readField(meta, info); validateFieldEntry(info, fieldEntry); - fields.put(info.name, fieldEntry); + fields.put(info.number, fieldEntry); } } @@ -221,10 +219,18 @@ public void checkIntegrity() throws IOException { CodecUtil.checksumEntireFile(vectorIndex); } + private FieldEntry getFieldEntry(String field) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + return fieldEntry; + } + @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - return getOffHeapVectorValues(fieldEntry); + return getOffHeapVectorValues(getFieldEntry(field)); } @Override @@ -235,8 +241,7 @@ public ByteVectorValues getByteVectorValues(String field) { @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - FieldEntry fieldEntry = fields.get(field); - + final FieldEntry fieldEntry = getFieldEntry(field); if (fieldEntry.size() == 0) { return; } @@ -260,7 +265,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits int node = results.topNode(); float minSimilarity = results.topScore(); results.pop(); - knnCollector.collect(node, minSimilarity); + knnCollector.collect(vectorValues.ordToDoc(node), minSimilarity); } } @@ -352,8 +357,7 @@ int size() { } /** Read the vector values from the index input. This supports both iterated and random access. 
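The reader now keys its per-field metadata by FieldInfo.number in a primitive IntObjectHashMap instead of by field name, and funnels all lookups through the new getFieldEntry helper so an unknown field fails fast. Roughly, the lookup pattern (names as in the hunk above):

  FieldInfo info = fieldInfos.fieldInfo(field);                       // null if the field does not exist
  FieldEntry entry = (info == null) ? null : fields.get(info.number);
  if (entry == null) {
    throw new IllegalArgumentException("field=\"" + field + "\" not found");
  }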
*/ - static class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { + static class OffHeapFloatVectorValues extends FloatVectorValues { final int dimension; final int[] ordToDoc; @@ -364,9 +368,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues final float[] value; final VectorSimilarityFunction similarityFunction; - int ord = -1; - int doc = -1; - OffHeapFloatVectorValues( int dimension, int[] ordToDoc, @@ -391,42 +392,6 @@ public int size() { return ordToDoc.length; } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(ord); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() { - if (++ord >= size()) { - doc = NO_MORE_DOCS; - } else { - doc = ordToDoc[ord]; - } - return doc; - } - - @Override - public int advance(int target) { - assert docID() < target; - ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target); - if (ord < 0) { - ord = -(ord + 1); - } - assert ord <= ordToDoc.length; - if (ord == ordToDoc.length) { - doc = NO_MORE_DOCS; - } else { - doc = ordToDoc[ord]; - } - return doc; - } - @Override public OffHeapFloatVectorValues copy() { return new OffHeapFloatVectorValues(dimension, ordToDoc, similarityFunction, dataIn.clone()); @@ -443,21 +408,32 @@ public float[] vectorValue(int targetOrd) throws IOException { return value; } + @Override + public int ordToDoc(int ord) { + return ordToDoc[ord]; + } + + @Override + public DocIndexIterator iterator() { + return createSparseIterator(); + } + @Override public VectorScorer scorer(float[] target) { if (size() == 0) { return null; } OffHeapFloatVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.similarityFunction.compare(values.vectorValue(), target); + return values.similarityFunction.compare(values.vectorValue(iterator.index()), target); } @Override - public DocIdSetIterator iterator() { - return values; + public DocIndexIterator iterator() { + return iterator; } }; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java index 52f2146e836b..845987c2957c 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java @@ -23,12 +23,12 @@ import java.util.ArrayList; import java.util.List; import java.util.SplittableRandom; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.Bits; import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * An {@link HnswGraph} where all nodes and connections are held in memory. 
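With RandomAccessVectorValues.Floats removed, OffHeapFloatVectorValues no longer doubles as its own doc-ID iterator; consumers obtain a DocIndexIterator from iterator() and feed its index() back into the ordinal-based vectorValue(int). A hedged consumption sketch, reader and field name hypothetical:

  FloatVectorValues values = reader.getFloatVectorValues("vec");
  DocIndexIterator it = values.iterator();
  for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
    float[] vector = values.vectorValue(it.index()); // ordinal-based access replaces the old cursor-style vectorValue()
  }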
This class is used to @@ -74,7 +74,7 @@ public static NeighborQueue search( float[] query, int topK, int numSeed, - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction, HnswGraph graphValues, Bits acceptOrds, diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java index 048280466d43..e71fa66719f8 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java @@ -21,8 +21,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; import java.util.function.IntUnaryOperator; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; @@ -35,6 +33,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; @@ -46,7 +45,6 @@ import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.HnswGraphSearcher; import org.apache.lucene.util.hnsw.OrdinalTranslatedKnnCollector; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; /** @@ -56,13 +54,15 @@ */ public final class Lucene91HnswVectorsReader extends KnnVectorsReader { - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final IndexInput vectorData; private final IndexInput vectorIndex; private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); + private final FieldInfos fieldInfos; Lucene91HnswVectorsReader(SegmentReadState state) throws IOException { int versionMeta = readMetadata(state); + this.fieldInfos = state.fieldInfos; boolean success = false; try { vectorData = @@ -155,7 +155,7 @@ private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOExce } FieldEntry fieldEntry = readField(meta, info); validateFieldEntry(info, fieldEntry); - fields.put(info.name, fieldEntry); + fields.put(info.number, fieldEntry); } } @@ -215,10 +215,18 @@ public void checkIntegrity() throws IOException { CodecUtil.checksumEntireFile(vectorIndex); } + private FieldEntry getFieldEntry(String field) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + return fieldEntry; + } + @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - return getOffHeapVectorValues(fieldEntry); + return getOffHeapVectorValues(getFieldEntry(field)); } @Override @@ -229,8 +237,7 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - FieldEntry fieldEntry = fields.get(field); - + final FieldEntry fieldEntry = 
getFieldEntry(field); if (fieldEntry.size() == 0) { return; } @@ -395,8 +402,7 @@ int ordToDoc(int ord) { } /** Read the vector values from the index input. This supports both iterated and random access. */ - static class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { + static class OffHeapFloatVectorValues extends FloatVectorValues { private final int dimension; private final int size; @@ -407,9 +413,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues private final float[] value; private final VectorSimilarityFunction similarityFunction; - private int ord = -1; - private int doc = -1; - OffHeapFloatVectorValues( int dimension, int size, @@ -436,49 +439,6 @@ public int size() { return size; } - @Override - public float[] vectorValue() throws IOException { - dataIn.seek((long) ord * byteSize); - dataIn.readFloats(value, 0, value.length); - return value; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() { - if (++ord >= size) { - doc = NO_MORE_DOCS; - } else { - doc = ordToDocOperator.applyAsInt(ord); - } - return doc; - } - - @Override - public int advance(int target) { - assert docID() < target; - - if (ordToDoc == null) { - ord = target; - } else { - ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target); - if (ord < 0) { - ord = -(ord + 1); - } - } - - if (ord < size) { - doc = ordToDocOperator.applyAsInt(ord); - } else { - doc = NO_MORE_DOCS; - } - return doc; - } - @Override public OffHeapFloatVectorValues copy() { return new OffHeapFloatVectorValues( @@ -492,21 +452,32 @@ public float[] vectorValue(int targetOrd) throws IOException { return value; } + @Override + public int ordToDoc(int ord) { + return ordToDocOperator.applyAsInt(ord); + } + + @Override + public DocIndexIterator iterator() { + return createSparseIterator(); + } + @Override public VectorScorer scorer(float[] target) { if (size == 0) { return null; } OffHeapFloatVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.similarityFunction.compare(values.vectorValue(), target); + return values.similarityFunction.compare(values.vectorValue(iterator.index()), target); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForDeltaUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForDeltaUtil.java new file mode 100644 index 000000000000..f87ffc135860 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForDeltaUtil.java @@ -0,0 +1,520 @@ +// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene912; + +import static org.apache.lucene.backward_codecs.lucene912.ForUtil.*; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a long to get + * SIMD-like speedups. If bitsPerValue <= 4 then we pack 8 ints per long else if bitsPerValue + * <= 11 we pack 4 ints per long else we pack 2 ints per long + */ +final class ForDeltaUtil { + + private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4; + private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; + private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; + + private static final int ONE_BLOCK_SIZE_EIGHT = BLOCK_SIZE / 8; + private static final int TWO_BLOCK_SIZE_EIGHTS = BLOCK_SIZE / 4; + private static final int THREE_BLOCK_SIZE_EIGHTS = 3 * BLOCK_SIZE / 8; + private static final int FOUR_BLOCK_SIZE_EIGHTS = BLOCK_SIZE / 2; + private static final int FIVE_BLOCK_SIZE_EIGHTS = 5 * BLOCK_SIZE / 8; + private static final int SIX_BLOCK_SIZE_EIGHTS = 3 * BLOCK_SIZE / 4; + private static final int SEVEN_BLOCK_SIZE_EIGHTS = 7 * BLOCK_SIZE / 8; + + // IDENTITY_PLUS_ONE[i] == i+1 + private static final long[] IDENTITY_PLUS_ONE = new long[ForUtil.BLOCK_SIZE]; + + static { + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + IDENTITY_PLUS_ONE[i] = i + 1; + } + } + + private static void prefixSumOfOnes(long[] arr, long base) { + System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); + // This loop gets auto-vectorized + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + arr[i] += base; + } + } + + private static void prefixSum8(long[] arr, long base) { + // When the number of bits per value is 4 or less, we can sum up all values in a block without + // risking overflowing a 8-bits integer. This allows computing the prefix sum by summing up 8 + // values at once. + innerPrefixSum8(arr); + expand8(arr); + final long l0 = base; + final long l1 = l0 + arr[ONE_BLOCK_SIZE_EIGHT - 1]; + final long l2 = l1 + arr[TWO_BLOCK_SIZE_EIGHTS - 1]; + final long l3 = l2 + arr[THREE_BLOCK_SIZE_EIGHTS - 1]; + final long l4 = l3 + arr[FOUR_BLOCK_SIZE_EIGHTS - 1]; + final long l5 = l4 + arr[FIVE_BLOCK_SIZE_EIGHTS - 1]; + final long l6 = l5 + arr[SIX_BLOCK_SIZE_EIGHTS - 1]; + final long l7 = l6 + arr[SEVEN_BLOCK_SIZE_EIGHTS - 1]; + + for (int i = 0; i < ONE_BLOCK_SIZE_EIGHT; ++i) { + arr[i] += l0; + arr[ONE_BLOCK_SIZE_EIGHT + i] += l1; + arr[TWO_BLOCK_SIZE_EIGHTS + i] += l2; + arr[THREE_BLOCK_SIZE_EIGHTS + i] += l3; + arr[FOUR_BLOCK_SIZE_EIGHTS + i] += l4; + arr[FIVE_BLOCK_SIZE_EIGHTS + i] += l5; + arr[SIX_BLOCK_SIZE_EIGHTS + i] += l6; + arr[SEVEN_BLOCK_SIZE_EIGHTS + i] += l7; + } + } + + private static void prefixSum16(long[] arr, long base) { + // When the number of bits per value is 11 or less, we can sum up all values in a block without + // risking overflowing a 16-bits integer. This allows computing the prefix sum by summing up 4 + // values at once. 
+ innerPrefixSum16(arr); + expand16(arr); + final long l0 = base; + final long l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1]; + final long l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1]; + final long l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1]; + + for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) { + arr[i] += l0; + arr[ONE_BLOCK_SIZE_FOURTH + i] += l1; + arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2; + arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3; + } + } + + private static void prefixSum32(long[] arr, long base) { + arr[0] += base << 32; + innerPrefixSum32(arr); + expand32(arr); + final long l = arr[BLOCK_SIZE / 2 - 1]; + for (int i = BLOCK_SIZE / 2; i < BLOCK_SIZE; ++i) { + arr[i] += l; + } + } + + // For some reason unrolling seems to help + private static void innerPrefixSum8(long[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + } + + // For some reason unrolling seems to help + private static void innerPrefixSum16(long[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + } + + // For some reason unrolling seems to help + private static void innerPrefixSum32(long[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + arr[32] += arr[31]; + arr[33] += arr[32]; + arr[34] += arr[33]; + arr[35] += arr[34]; + arr[36] += arr[35]; + arr[37] += arr[36]; + arr[38] += arr[37]; + arr[39] += arr[38]; + arr[40] += arr[39]; + arr[41] += arr[40]; + arr[42] += arr[41]; + arr[43] += arr[42]; + arr[44] += arr[43]; + arr[45] += arr[44]; + arr[46] += arr[45]; + arr[47] += arr[46]; + arr[48] += arr[47]; + arr[49] += arr[48]; + arr[50] += arr[49]; + arr[51] += arr[50]; + arr[52] += arr[51]; + arr[53] += arr[52]; + arr[54] += arr[53]; + arr[55] += arr[54]; + arr[56] += arr[55]; + arr[57] += arr[56]; + arr[58] += arr[57]; + arr[59] += arr[58]; + arr[60] += arr[59]; + arr[61] += arr[60]; + arr[62] += arr[61]; + arr[63] += arr[62]; + } + + private final long[] tmp = new long[BLOCK_SIZE / 2]; + + /** + * Encode deltas of a strictly monotonically increasing sequence of integers. 
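To make the contract concrete: only the gaps between consecutive values are stored, and decodeAndPrefixSum restores absolute values with a running sum on top of the caller's base. A tiny worked example with arbitrary numbers, not part of the patch:

  // values 105, 108, 109, 113 with base = 100 are encoded as the deltas {5, 3, 1, 4};
  // decoding computes 100+5=105, 105+3=108, 108+1=109, 109+4=113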
The provided {@code + * longs} are expected to be deltas between consecutive values. + */ + void encodeDeltas(long[] longs, DataOutput out) throws IOException { + if (longs[0] == 1 && PForUtil.allEqual(longs)) { // happens with very dense postings + out.writeByte((byte) 0); + } else { + long or = 0; + for (long l : longs) { + or |= l; + } + assert or != 0; + final int bitsPerValue = PackedInts.bitsRequired(or); + out.writeByte((byte) bitsPerValue); + + final int primitiveSize; + if (bitsPerValue <= 4) { + primitiveSize = 8; + collapse8(longs); + } else if (bitsPerValue <= 11) { + primitiveSize = 16; + collapse16(longs); + } else { + primitiveSize = 32; + collapse32(longs); + } + encode(longs, bitsPerValue, primitiveSize, out, tmp); + } + } + + /** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */ + void decodeAndPrefixSum(IndexInput in, long base, long[] longs) throws IOException { + final int bitsPerValue = Byte.toUnsignedInt(in.readByte()); + if (bitsPerValue == 0) { + prefixSumOfOnes(longs, base); + } else { + decodeAndPrefixSum(bitsPerValue, in, base, longs); + } + } + + /** Delta-decode 128 integers into {@code longs}. */ + void decodeAndPrefixSum(int bitsPerValue, IndexInput in, long base, long[] longs) + throws IOException { + switch (bitsPerValue) { + case 1: + decode1(in, longs); + prefixSum8(longs, base); + break; + case 2: + decode2(in, longs); + prefixSum8(longs, base); + break; + case 3: + decode3(in, tmp, longs); + prefixSum8(longs, base); + break; + case 4: + decode4(in, longs); + prefixSum8(longs, base); + break; + case 5: + decode5To16(in, tmp, longs); + prefixSum16(longs, base); + break; + case 6: + decode6To16(in, tmp, longs); + prefixSum16(longs, base); + break; + case 7: + decode7To16(in, tmp, longs); + prefixSum16(longs, base); + break; + case 8: + decode8To16(in, longs); + prefixSum16(longs, base); + break; + case 9: + decode9(in, tmp, longs); + prefixSum16(longs, base); + break; + case 10: + decode10(in, tmp, longs); + prefixSum16(longs, base); + break; + case 11: + decode11(in, tmp, longs); + prefixSum16(longs, base); + break; + case 12: + decode12To32(in, tmp, longs); + prefixSum32(longs, base); + break; + case 13: + decode13To32(in, tmp, longs); + prefixSum32(longs, base); + break; + case 14: + decode14To32(in, tmp, longs); + prefixSum32(longs, base); + break; + case 15: + decode15To32(in, tmp, longs); + prefixSum32(longs, base); + break; + case 16: + decode16To32(in, longs); + prefixSum32(longs, base); + break; + case 17: + decode17(in, tmp, longs); + prefixSum32(longs, base); + break; + case 18: + decode18(in, tmp, longs); + prefixSum32(longs, base); + break; + case 19: + decode19(in, tmp, longs); + prefixSum32(longs, base); + break; + case 20: + decode20(in, tmp, longs); + prefixSum32(longs, base); + break; + case 21: + decode21(in, tmp, longs); + prefixSum32(longs, base); + break; + case 22: + decode22(in, tmp, longs); + prefixSum32(longs, base); + break; + case 23: + decode23(in, tmp, longs); + prefixSum32(longs, base); + break; + case 24: + decode24(in, tmp, longs); + prefixSum32(longs, base); + break; + default: + decodeSlow(bitsPerValue, in, tmp, longs); + prefixSum32(longs, base); + break; + } + } + + private static void decode5To16(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 10, longs, 11, 5, MASK16_5, tmp, 0, MASK16_1); + for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 5, longsIdx += 1) { + long l0 = tmp[tmpIdx + 0] << 4; + l0 |= tmp[tmpIdx + 1] << 3; + l0 |= 
tmp[tmpIdx + 2] << 2; + l0 |= tmp[tmpIdx + 3] << 1; + l0 |= tmp[tmpIdx + 4] << 0; + longs[longsIdx + 0] = l0; + } + } + + private static void decode6To16(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 12, longs, 10, 6, MASK16_6, tmp, 0, MASK16_4); + for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 4; ++iter, tmpIdx += 3, longsIdx += 2) { + long l0 = tmp[tmpIdx + 0] << 2; + l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK16_2) << 4; + l1 |= tmp[tmpIdx + 2] << 0; + longs[longsIdx + 1] = l1; + } + } + + private static void decode7To16(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 14, longs, 9, 7, MASK16_7, tmp, 0, MASK16_2); + for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 2; ++iter, tmpIdx += 7, longsIdx += 2) { + long l0 = tmp[tmpIdx + 0] << 5; + l0 |= tmp[tmpIdx + 1] << 3; + l0 |= tmp[tmpIdx + 2] << 1; + l0 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_1; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 3] & MASK16_1) << 6; + l1 |= tmp[tmpIdx + 4] << 4; + l1 |= tmp[tmpIdx + 5] << 2; + l1 |= tmp[tmpIdx + 6] << 0; + longs[longsIdx + 1] = l1; + } + } + + private static void decode8To16(IndexInput in, long[] longs) throws IOException { + splitLongs(in, 16, longs, 8, 8, MASK16_8, longs, 16, MASK16_8); + } + + private static void decode12To32(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 24, longs, 20, 12, MASK32_12, tmp, 0, MASK32_8); + for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 8; ++iter, tmpIdx += 3, longsIdx += 2) { + long l0 = tmp[tmpIdx + 0] << 4; + l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK32_4) << 8; + l1 |= tmp[tmpIdx + 2] << 0; + longs[longsIdx + 1] = l1; + } + } + + private static void decode13To32(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 26, longs, 19, 13, MASK32_13, tmp, 0, MASK32_6); + for (int iter = 0, tmpIdx = 0, longsIdx = 52; iter < 2; ++iter, tmpIdx += 13, longsIdx += 6) { + long l0 = tmp[tmpIdx + 0] << 7; + l0 |= tmp[tmpIdx + 1] << 1; + l0 |= (tmp[tmpIdx + 2] >>> 5) & MASK32_1; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 2] & MASK32_5) << 8; + l1 |= tmp[tmpIdx + 3] << 2; + l1 |= (tmp[tmpIdx + 4] >>> 4) & MASK32_2; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 4] & MASK32_4) << 9; + l2 |= tmp[tmpIdx + 5] << 3; + l2 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_3; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 6] & MASK32_3) << 10; + l3 |= tmp[tmpIdx + 7] << 4; + l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_4; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 8] & MASK32_2) << 11; + l4 |= tmp[tmpIdx + 9] << 5; + l4 |= (tmp[tmpIdx + 10] >>> 1) & MASK32_5; + longs[longsIdx + 4] = l4; + long l5 = (tmp[tmpIdx + 10] & MASK32_1) << 12; + l5 |= tmp[tmpIdx + 11] << 6; + l5 |= tmp[tmpIdx + 12] << 0; + longs[longsIdx + 5] = l5; + } + } + + private static void decode14To32(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 28, longs, 18, 14, MASK32_14, tmp, 0, MASK32_4); + for (int iter = 0, tmpIdx = 0, longsIdx = 56; iter < 4; ++iter, tmpIdx += 7, longsIdx += 2) { + long l0 = tmp[tmpIdx + 0] << 10; + l0 |= tmp[tmpIdx + 1] << 6; + l0 |= tmp[tmpIdx + 2] << 2; + l0 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_2; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 3] & MASK32_2) << 12; + l1 |= tmp[tmpIdx + 4] << 8; + l1 |= tmp[tmpIdx + 5] << 4; + l1 |= tmp[tmpIdx + 6] << 0; + longs[longsIdx 
+ 1] = l1; + } + } + + private static void decode15To32(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 30, longs, 17, 15, MASK32_15, tmp, 0, MASK32_2); + for (int iter = 0, tmpIdx = 0, longsIdx = 60; iter < 2; ++iter, tmpIdx += 15, longsIdx += 2) { + long l0 = tmp[tmpIdx + 0] << 13; + l0 |= tmp[tmpIdx + 1] << 11; + l0 |= tmp[tmpIdx + 2] << 9; + l0 |= tmp[tmpIdx + 3] << 7; + l0 |= tmp[tmpIdx + 4] << 5; + l0 |= tmp[tmpIdx + 5] << 3; + l0 |= tmp[tmpIdx + 6] << 1; + l0 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_1; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 7] & MASK32_1) << 14; + l1 |= tmp[tmpIdx + 8] << 12; + l1 |= tmp[tmpIdx + 9] << 10; + l1 |= tmp[tmpIdx + 10] << 8; + l1 |= tmp[tmpIdx + 11] << 6; + l1 |= tmp[tmpIdx + 12] << 4; + l1 |= tmp[tmpIdx + 13] << 2; + l1 |= tmp[tmpIdx + 14] << 0; + longs[longsIdx + 1] = l1; + } + } + + private static void decode16To32(IndexInput in, long[] longs) throws IOException { + splitLongs(in, 32, longs, 16, 16, MASK32_16, longs, 32, MASK32_16); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForUtil.java similarity index 54% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForUtil.java index 63ee7baaf10a..9f38052b2eb1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForUtil.java @@ -16,47 +16,47 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.backward_codecs.lucene912; import java.io.IOException; -import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; -// Inspired from https://fulmicoton.com/posts/bitpacking/ -// Encodes multiple integers in a long to get SIMD-like speedups. -// If bitsPerValue <= 8 then we pack 8 ints per long -// else if bitsPerValue <= 16 we pack 4 ints per long -// else we pack 2 ints per long +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a long to get + * SIMD-like speedups. 
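As an aside on the "multiple integers in a long" idea named in the javadoc just above: for bit widths above 16 the codec keeps two 32-bit values per long, which is what the expand32/collapse32 helpers further down in this file do. A minimal standalone sketch of that round trip, with a hypothetical class name and no Lucene dependencies:

public class PackTwoPerLongDemo {
  // Collapse pairs: value i goes to the high 32 bits, value 64 + i to the low 32 bits.
  static void collapse32(long[] arr) {
    for (int i = 0; i < 64; ++i) {
      arr[i] = (arr[i] << 32) | arr[64 + i];
    }
  }

  // Expand back to 128 separate values.
  static void expand32(long[] arr) {
    for (int i = 0; i < 64; ++i) {
      long l = arr[i];
      arr[i] = l >>> 32;
      arr[64 + i] = l & 0xFFFFFFFFL;
    }
  }

  public static void main(String[] args) {
    long[] arr = new long[128];
    for (int i = 0; i < 128; ++i) {
      arr[i] = i + 1;
    }
    collapse32(arr); // the first 64 longs now carry all 128 values
    expand32(arr);   // restore the original layout
    System.out.println(arr[0] + " " + arr[64]); // 1 65
  }
}

Because the packed values share a long, one 64-bit shift or mask in the encode/decode loops effectively touches two (or four, or eight) values at once, which is where the "SIMD-like" speedup comes from.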
If bitsPerValue <= 8 then we pack 8 ints per long else if bitsPerValue + * <= 16 we pack 4 ints per long else we pack 2 ints per long + */ final class ForUtil { - static final int BLOCK_SIZE = 128; - private static final int BLOCK_SIZE_LOG2 = 7; + public static final int BLOCK_SIZE = 128; + static final int BLOCK_SIZE_LOG2 = 7; - private static long expandMask32(long mask32) { + static long expandMask32(long mask32) { return mask32 | (mask32 << 32); } - private static long expandMask16(long mask16) { + static long expandMask16(long mask16) { return expandMask32(mask16 | (mask16 << 16)); } - private static long expandMask8(long mask8) { + static long expandMask8(long mask8) { return expandMask16(mask8 | (mask8 << 8)); } - private static long mask32(int bitsPerValue) { + static long mask32(int bitsPerValue) { return expandMask32((1L << bitsPerValue) - 1); } - private static long mask16(int bitsPerValue) { + static long mask16(int bitsPerValue) { return expandMask16((1L << bitsPerValue) - 1); } - private static long mask8(int bitsPerValue) { + static long mask8(int bitsPerValue) { return expandMask8((1L << bitsPerValue) - 1); } - private static void expand8(long[] arr) { + static void expand8(long[] arr) { for (int i = 0; i < 16; ++i) { long l = arr[i]; arr[i] = (l >>> 56) & 0xFFL; @@ -70,17 +70,7 @@ private static void expand8(long[] arr) { } } - private static void expand8To32(long[] arr) { - for (int i = 0; i < 16; ++i) { - long l = arr[i]; - arr[i] = (l >>> 24) & 0x000000FF000000FFL; - arr[16 + i] = (l >>> 16) & 0x000000FF000000FFL; - arr[32 + i] = (l >>> 8) & 0x000000FF000000FFL; - arr[48 + i] = l & 0x000000FF000000FFL; - } - } - - private static void collapse8(long[] arr) { + static void collapse8(long[] arr) { for (int i = 0; i < 16; ++i) { arr[i] = (arr[i] << 56) @@ -94,7 +84,7 @@ private static void collapse8(long[] arr) { } } - private static void expand16(long[] arr) { + static void expand16(long[] arr) { for (int i = 0; i < 32; ++i) { long l = arr[i]; arr[i] = (l >>> 48) & 0xFFFFL; @@ -104,21 +94,13 @@ private static void expand16(long[] arr) { } } - private static void expand16To32(long[] arr) { - for (int i = 0; i < 32; ++i) { - long l = arr[i]; - arr[i] = (l >>> 16) & 0x0000FFFF0000FFFFL; - arr[32 + i] = l & 0x0000FFFF0000FFFFL; - } - } - - private static void collapse16(long[] arr) { + static void collapse16(long[] arr) { for (int i = 0; i < 32; ++i) { arr[i] = (arr[i] << 48) | (arr[32 + i] << 32) | (arr[64 + i] << 16) | arr[96 + i]; } } - private static void expand32(long[] arr) { + static void expand32(long[] arr) { for (int i = 0; i < 64; ++i) { long l = arr[i]; arr[i] = l >>> 32; @@ -126,123 +108,37 @@ private static void expand32(long[] arr) { } } - private static void collapse32(long[] arr) { + static void collapse32(long[] arr) { for (int i = 0; i < 64; ++i) { arr[i] = (arr[i] << 32) | arr[64 + i]; } } - private static void prefixSum8(long[] arr, long base) { - expand8To32(arr); - prefixSum32(arr, base); - } - - private static void prefixSum16(long[] arr, long base) { - // We need to move to the next primitive size to avoid overflows - expand16To32(arr); - prefixSum32(arr, base); - } - - private static void prefixSum32(long[] arr, long base) { - arr[0] += base << 32; - innerPrefixSum32(arr); - expand32(arr); - final long l = arr[BLOCK_SIZE / 2 - 1]; - for (int i = BLOCK_SIZE / 2; i < BLOCK_SIZE; ++i) { - arr[i] += l; - } - } - - // For some reason unrolling seems to help - private static void innerPrefixSum32(long[] arr) { - arr[1] += arr[0]; - arr[2] += arr[1]; - 
arr[3] += arr[2]; - arr[4] += arr[3]; - arr[5] += arr[4]; - arr[6] += arr[5]; - arr[7] += arr[6]; - arr[8] += arr[7]; - arr[9] += arr[8]; - arr[10] += arr[9]; - arr[11] += arr[10]; - arr[12] += arr[11]; - arr[13] += arr[12]; - arr[14] += arr[13]; - arr[15] += arr[14]; - arr[16] += arr[15]; - arr[17] += arr[16]; - arr[18] += arr[17]; - arr[19] += arr[18]; - arr[20] += arr[19]; - arr[21] += arr[20]; - arr[22] += arr[21]; - arr[23] += arr[22]; - arr[24] += arr[23]; - arr[25] += arr[24]; - arr[26] += arr[25]; - arr[27] += arr[26]; - arr[28] += arr[27]; - arr[29] += arr[28]; - arr[30] += arr[29]; - arr[31] += arr[30]; - arr[32] += arr[31]; - arr[33] += arr[32]; - arr[34] += arr[33]; - arr[35] += arr[34]; - arr[36] += arr[35]; - arr[37] += arr[36]; - arr[38] += arr[37]; - arr[39] += arr[38]; - arr[40] += arr[39]; - arr[41] += arr[40]; - arr[42] += arr[41]; - arr[43] += arr[42]; - arr[44] += arr[43]; - arr[45] += arr[44]; - arr[46] += arr[45]; - arr[47] += arr[46]; - arr[48] += arr[47]; - arr[49] += arr[48]; - arr[50] += arr[49]; - arr[51] += arr[50]; - arr[52] += arr[51]; - arr[53] += arr[52]; - arr[54] += arr[53]; - arr[55] += arr[54]; - arr[56] += arr[55]; - arr[57] += arr[56]; - arr[58] += arr[57]; - arr[59] += arr[58]; - arr[60] += arr[59]; - arr[61] += arr[60]; - arr[62] += arr[61]; - arr[63] += arr[62]; - } - private final long[] tmp = new long[BLOCK_SIZE / 2]; /** Encode 128 integers from {@code longs} into {@code out}. */ void encode(long[] longs, int bitsPerValue, DataOutput out) throws IOException { final int nextPrimitive; - final int numLongs; if (bitsPerValue <= 8) { nextPrimitive = 8; - numLongs = BLOCK_SIZE / 8; collapse8(longs); } else if (bitsPerValue <= 16) { nextPrimitive = 16; - numLongs = BLOCK_SIZE / 4; collapse16(longs); } else { nextPrimitive = 32; - numLongs = BLOCK_SIZE / 2; collapse32(longs); } + encode(longs, bitsPerValue, nextPrimitive, out, tmp); + } + + static void encode(long[] longs, int bitsPerValue, int primitiveSize, DataOutput out, long[] tmp) + throws IOException { + final int numLongs = BLOCK_SIZE * primitiveSize / Long.SIZE; final int numLongsPerShift = bitsPerValue * 2; int idx = 0; - int shift = nextPrimitive - bitsPerValue; + int shift = primitiveSize - bitsPerValue; for (int i = 0; i < numLongsPerShift; ++i) { tmp[i] = longs[idx++] << shift; } @@ -254,9 +150,9 @@ void encode(long[] longs, int bitsPerValue, DataOutput out) throws IOException { final int remainingBitsPerLong = shift + bitsPerValue; final long maskRemainingBitsPerLong; - if (nextPrimitive == 8) { + if (primitiveSize == 8) { maskRemainingBitsPerLong = MASKS8[remainingBitsPerLong]; - } else if (nextPrimitive == 16) { + } else if (primitiveSize == 16) { maskRemainingBitsPerLong = MASKS16[remainingBitsPerLong]; } else { maskRemainingBitsPerLong = MASKS32[remainingBitsPerLong]; @@ -274,10 +170,10 @@ void encode(long[] longs, int bitsPerValue, DataOutput out) throws IOException { } } else { final long mask1, mask2; - if (nextPrimitive == 8) { + if (primitiveSize == 8) { mask1 = MASKS8[remainingBitsPerValue]; mask2 = MASKS8[remainingBitsPerLong - remainingBitsPerValue]; - } else if (nextPrimitive == 16) { + } else if (primitiveSize == 16) { mask1 = MASKS16[remainingBitsPerValue]; mask2 = MASKS16[remainingBitsPerLong - remainingBitsPerValue]; } else { @@ -296,26 +192,20 @@ void encode(long[] longs, int bitsPerValue, DataOutput out) throws IOException { } /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. 
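The numBytes formula quoted above is just the block size worked into a shift: 128 values of bitsPerValue bits occupy 128 * bitsPerValue / 8 bytes, i.e. bitsPerValue << (BLOCK_SIZE_LOG2 - 3). A quick standalone check (hypothetical demo class, not part of the patch):

public class NumBytesDemo {
  static final int BLOCK_SIZE_LOG2 = 7; // 2^7 = 128 values per block

  static int numBytes(int bitsPerValue) {
    return bitsPerValue << (BLOCK_SIZE_LOG2 - 3); // == 128 * bitsPerValue / 8
  }

  public static void main(String[] args) {
    System.out.println(numBytes(7));                   // 112: 128 seven-bit values in 112 bytes
    System.out.println(numBytes(24));                  // 384
    System.out.println(numBytes(24) == 128 * 24 / 8);  // true
  }
}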
*/ - int numBytes(int bitsPerValue) { + static int numBytes(int bitsPerValue) { return bitsPerValue << (BLOCK_SIZE_LOG2 - 3); } - private static void decodeSlow(int bitsPerValue, DataInput in, long[] tmp, long[] longs) + static void decodeSlow(int bitsPerValue, IndexInput in, long[] tmp, long[] longs) throws IOException { final int numLongs = bitsPerValue << 1; - in.readLongs(tmp, 0, numLongs); final long mask = MASKS32[bitsPerValue]; - int longsIdx = 0; - int shift = 32 - bitsPerValue; - for (; shift >= 0; shift -= bitsPerValue) { - shiftLongs(tmp, numLongs, longs, longsIdx, shift, mask); - longsIdx += numLongs; - } - final int remainingBitsPerLong = shift + bitsPerValue; + splitLongs(in, numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L); + final int remainingBitsPerLong = 32 - bitsPerValue; final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong]; int tmpIdx = 0; int remainingBits = remainingBitsPerLong; - for (; longsIdx < BLOCK_SIZE / 2; ++longsIdx) { + for (int longsIdx = numLongs; longsIdx < BLOCK_SIZE / 2; ++longsIdx) { int b = bitsPerValue - remainingBits; long l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b; while (b >= remainingBitsPerLong) { @@ -332,19 +222,31 @@ private static void decodeSlow(int bitsPerValue, DataInput in, long[] tmp, long[ } } - /** - * The pattern that this shiftLongs method applies is recognized by the C2 compiler, which - * generates SIMD instructions for it in order to shift multiple longs at once. - */ - private static void shiftLongs(long[] a, int count, long[] b, int bi, int shift, long mask) { + static void splitLongs( + IndexInput in, + int count, + long[] b, + int bShift, + int dec, + long bMask, + long[] c, + int cIndex, + long cMask) + throws IOException { + // takes advantage of the C2 compiler's loop unrolling and auto-vectorization. 
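To make the splitLongs loop easier to follow, here is the same shift-and-mask split in isolation, operating on a plain long[] instead of an IndexInput (all names here are illustrative only): each source long yields several high "lanes" extracted at shifts bShift, bShift - dec, bShift - 2*dec, ..., and the low bits masked by cMask are kept as a remainder for the tail loop of the caller to recombine.

public class SplitLongsDemo {
  static void split(long[] src, int count, long[] b, int bShift, int dec, long bMask,
                    long[] c, long cMask) {
    int maxIter = (bShift - 1) / dec;
    for (int i = 0; i < count; ++i) {
      for (int j = 0; j <= maxIter; ++j) {
        b[count * j + i] = (src[i] >>> (bShift - j * dec)) & bMask;
      }
      c[i] = src[i] & cMask;
    }
  }

  public static void main(String[] args) {
    // One source long 0b101_101_10: two 3-bit fields at shifts 5 and 2, plus a 2-bit remainder.
    long[] src = {0b10110110L};
    long[] b = new long[2];
    long[] c = new long[1];
    split(src, 1, b, 5, 3, 0b111L, c, 0b11L);
    System.out.println(b[0] + " " + b[1] + " " + c[0]); // 5 5 2
  }
}

The inner loop has a fixed trip count per bit width, which is what lets C2 unroll and auto-vectorize it as the comment above notes.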
+ in.readLongs(c, cIndex, count); + int maxIter = (bShift - 1) / dec; for (int i = 0; i < count; ++i) { - b[bi + i] = (a[i] >>> shift) & mask; + for (int j = 0; j <= maxIter; ++j) { + b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask; + } + c[cIndex + i] &= cMask; } } - private static final long[] MASKS8 = new long[8]; - private static final long[] MASKS16 = new long[16]; - private static final long[] MASKS32 = new long[32]; + static final long[] MASKS8 = new long[8]; + static final long[] MASKS16 = new long[16]; + static final long[] MASKS32 = new long[32]; static { for (int i = 0; i < 8; ++i) { @@ -360,60 +262,62 @@ private static void shiftLongs(long[] a, int count, long[] b, int bi, int shift, // mark values in array as final longs to avoid the cost of reading array, arrays should only be // used when the idx is a variable - private static final long MASK8_1 = MASKS8[1]; - private static final long MASK8_2 = MASKS8[2]; - private static final long MASK8_3 = MASKS8[3]; - private static final long MASK8_4 = MASKS8[4]; - private static final long MASK8_5 = MASKS8[5]; - private static final long MASK8_6 = MASKS8[6]; - private static final long MASK8_7 = MASKS8[7]; - private static final long MASK16_1 = MASKS16[1]; - private static final long MASK16_2 = MASKS16[2]; - private static final long MASK16_3 = MASKS16[3]; - private static final long MASK16_4 = MASKS16[4]; - private static final long MASK16_5 = MASKS16[5]; - private static final long MASK16_6 = MASKS16[6]; - private static final long MASK16_7 = MASKS16[7]; - private static final long MASK16_9 = MASKS16[9]; - private static final long MASK16_10 = MASKS16[10]; - private static final long MASK16_11 = MASKS16[11]; - private static final long MASK16_12 = MASKS16[12]; - private static final long MASK16_13 = MASKS16[13]; - private static final long MASK16_14 = MASKS16[14]; - private static final long MASK16_15 = MASKS16[15]; - private static final long MASK32_1 = MASKS32[1]; - private static final long MASK32_2 = MASKS32[2]; - private static final long MASK32_3 = MASKS32[3]; - private static final long MASK32_4 = MASKS32[4]; - private static final long MASK32_5 = MASKS32[5]; - private static final long MASK32_6 = MASKS32[6]; - private static final long MASK32_7 = MASKS32[7]; - private static final long MASK32_8 = MASKS32[8]; - private static final long MASK32_9 = MASKS32[9]; - private static final long MASK32_10 = MASKS32[10]; - private static final long MASK32_11 = MASKS32[11]; - private static final long MASK32_12 = MASKS32[12]; - private static final long MASK32_13 = MASKS32[13]; - private static final long MASK32_14 = MASKS32[14]; - private static final long MASK32_15 = MASKS32[15]; - private static final long MASK32_17 = MASKS32[17]; - private static final long MASK32_18 = MASKS32[18]; - private static final long MASK32_19 = MASKS32[19]; - private static final long MASK32_20 = MASKS32[20]; - private static final long MASK32_21 = MASKS32[21]; - private static final long MASK32_22 = MASKS32[22]; - private static final long MASK32_23 = MASKS32[23]; - private static final long MASK32_24 = MASKS32[24]; + static final long MASK8_1 = MASKS8[1]; + static final long MASK8_2 = MASKS8[2]; + static final long MASK8_3 = MASKS8[3]; + static final long MASK8_4 = MASKS8[4]; + static final long MASK8_5 = MASKS8[5]; + static final long MASK8_6 = MASKS8[6]; + static final long MASK8_7 = MASKS8[7]; + static final long MASK16_1 = MASKS16[1]; + static final long MASK16_2 = MASKS16[2]; + static final long MASK16_3 = MASKS16[3]; + static 
final long MASK16_4 = MASKS16[4]; + static final long MASK16_5 = MASKS16[5]; + static final long MASK16_6 = MASKS16[6]; + static final long MASK16_7 = MASKS16[7]; + static final long MASK16_8 = MASKS16[8]; + static final long MASK16_9 = MASKS16[9]; + static final long MASK16_10 = MASKS16[10]; + static final long MASK16_11 = MASKS16[11]; + static final long MASK16_12 = MASKS16[12]; + static final long MASK16_13 = MASKS16[13]; + static final long MASK16_14 = MASKS16[14]; + static final long MASK16_15 = MASKS16[15]; + static final long MASK32_1 = MASKS32[1]; + static final long MASK32_2 = MASKS32[2]; + static final long MASK32_3 = MASKS32[3]; + static final long MASK32_4 = MASKS32[4]; + static final long MASK32_5 = MASKS32[5]; + static final long MASK32_6 = MASKS32[6]; + static final long MASK32_7 = MASKS32[7]; + static final long MASK32_8 = MASKS32[8]; + static final long MASK32_9 = MASKS32[9]; + static final long MASK32_10 = MASKS32[10]; + static final long MASK32_11 = MASKS32[11]; + static final long MASK32_12 = MASKS32[12]; + static final long MASK32_13 = MASKS32[13]; + static final long MASK32_14 = MASKS32[14]; + static final long MASK32_15 = MASKS32[15]; + static final long MASK32_16 = MASKS32[16]; + static final long MASK32_17 = MASKS32[17]; + static final long MASK32_18 = MASKS32[18]; + static final long MASK32_19 = MASKS32[19]; + static final long MASK32_20 = MASKS32[20]; + static final long MASK32_21 = MASKS32[21]; + static final long MASK32_22 = MASKS32[22]; + static final long MASK32_23 = MASKS32[23]; + static final long MASK32_24 = MASKS32[24]; /** Decode 128 integers into {@code longs}. */ - void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException { + void decode(int bitsPerValue, IndexInput in, long[] longs) throws IOException { switch (bitsPerValue) { case 1: - decode1(in, tmp, longs); + decode1(in, longs); expand8(longs); break; case 2: - decode2(in, tmp, longs); + decode2(in, longs); expand8(longs); break; case 3: @@ -421,7 +325,7 @@ void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException { expand8(longs); break; case 4: - decode4(in, tmp, longs); + decode4(in, longs); expand8(longs); break; case 5: @@ -437,7 +341,7 @@ void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException { expand8(longs); break; case 8: - decode8(in, tmp, longs); + decode8(in, longs); expand8(longs); break; case 9: @@ -469,7 +373,7 @@ void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException { expand16(longs); break; case 16: - decode16(in, tmp, longs); + decode16(in, longs); expand16(longs); break; case 17: @@ -511,174 +415,48 @@ void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException { } } - /** Delta-decode 128 integers into {@code longs}. 
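The delta-decode path removed below (decodeAndPrefixSum, together with the prefixSum helpers deleted earlier in this file) turned per-document gaps back into absolute doc IDs with a running prefix sum seeded by the last doc ID of the previous block. A tiny standalone illustration of that step, with hypothetical names:

public class PrefixSumDemo {
  static void prefixSum(long[] deltas, long base) {
    deltas[0] += base;
    for (int i = 1; i < deltas.length; ++i) {
      deltas[i] += deltas[i - 1];
    }
  }

  public static void main(String[] args) {
    long[] deltas = {3, 1, 4, 1, 5};  // gaps between successive doc IDs
    prefixSum(deltas, 100);           // previous block ended at doc 100
    System.out.println(java.util.Arrays.toString(deltas)); // [103, 104, 108, 109, 114]
  }
}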
*/ - void decodeAndPrefixSum(int bitsPerValue, DataInput in, long base, long[] longs) - throws IOException { - switch (bitsPerValue) { - case 1: - decode1(in, tmp, longs); - prefixSum8(longs, base); - break; - case 2: - decode2(in, tmp, longs); - prefixSum8(longs, base); - break; - case 3: - decode3(in, tmp, longs); - prefixSum8(longs, base); - break; - case 4: - decode4(in, tmp, longs); - prefixSum8(longs, base); - break; - case 5: - decode5(in, tmp, longs); - prefixSum8(longs, base); - break; - case 6: - decode6(in, tmp, longs); - prefixSum8(longs, base); - break; - case 7: - decode7(in, tmp, longs); - prefixSum8(longs, base); - break; - case 8: - decode8(in, tmp, longs); - prefixSum8(longs, base); - break; - case 9: - decode9(in, tmp, longs); - prefixSum16(longs, base); - break; - case 10: - decode10(in, tmp, longs); - prefixSum16(longs, base); - break; - case 11: - decode11(in, tmp, longs); - prefixSum16(longs, base); - break; - case 12: - decode12(in, tmp, longs); - prefixSum16(longs, base); - break; - case 13: - decode13(in, tmp, longs); - prefixSum16(longs, base); - break; - case 14: - decode14(in, tmp, longs); - prefixSum16(longs, base); - break; - case 15: - decode15(in, tmp, longs); - prefixSum16(longs, base); - break; - case 16: - decode16(in, tmp, longs); - prefixSum16(longs, base); - break; - case 17: - decode17(in, tmp, longs); - prefixSum32(longs, base); - break; - case 18: - decode18(in, tmp, longs); - prefixSum32(longs, base); - break; - case 19: - decode19(in, tmp, longs); - prefixSum32(longs, base); - break; - case 20: - decode20(in, tmp, longs); - prefixSum32(longs, base); - break; - case 21: - decode21(in, tmp, longs); - prefixSum32(longs, base); - break; - case 22: - decode22(in, tmp, longs); - prefixSum32(longs, base); - break; - case 23: - decode23(in, tmp, longs); - prefixSum32(longs, base); - break; - case 24: - decode24(in, tmp, longs); - prefixSum32(longs, base); - break; - default: - decodeSlow(bitsPerValue, in, tmp, longs); - prefixSum32(longs, base); - break; - } - } - - private static void decode1(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 2); - shiftLongs(tmp, 2, longs, 0, 7, MASK8_1); - shiftLongs(tmp, 2, longs, 2, 6, MASK8_1); - shiftLongs(tmp, 2, longs, 4, 5, MASK8_1); - shiftLongs(tmp, 2, longs, 6, 4, MASK8_1); - shiftLongs(tmp, 2, longs, 8, 3, MASK8_1); - shiftLongs(tmp, 2, longs, 10, 2, MASK8_1); - shiftLongs(tmp, 2, longs, 12, 1, MASK8_1); - shiftLongs(tmp, 2, longs, 14, 0, MASK8_1); + static void decode1(IndexInput in, long[] longs) throws IOException { + splitLongs(in, 2, longs, 7, 1, MASK8_1, longs, 14, MASK8_1); } - private static void decode2(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 4); - shiftLongs(tmp, 4, longs, 0, 6, MASK8_2); - shiftLongs(tmp, 4, longs, 4, 4, MASK8_2); - shiftLongs(tmp, 4, longs, 8, 2, MASK8_2); - shiftLongs(tmp, 4, longs, 12, 0, MASK8_2); + static void decode2(IndexInput in, long[] longs) throws IOException { + splitLongs(in, 4, longs, 6, 2, MASK8_2, longs, 12, MASK8_2); } - private static void decode3(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 6); - shiftLongs(tmp, 6, longs, 0, 5, MASK8_3); - shiftLongs(tmp, 6, longs, 6, 2, MASK8_3); + static void decode3(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 6, longs, 5, 3, MASK8_3, tmp, 0, MASK8_2); for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 2; ++iter, tmpIdx += 3, longsIdx += 2) { - long l0 = (tmp[tmpIdx + 0] & 
MASK8_2) << 1; + long l0 = tmp[tmpIdx + 0] << 1; l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK8_1) << 2; - l1 |= (tmp[tmpIdx + 2] & MASK8_2) << 0; + l1 |= tmp[tmpIdx + 2] << 0; longs[longsIdx + 1] = l1; } } - private static void decode4(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 8); - shiftLongs(tmp, 8, longs, 0, 4, MASK8_4); - shiftLongs(tmp, 8, longs, 8, 0, MASK8_4); + static void decode4(IndexInput in, long[] longs) throws IOException { + splitLongs(in, 8, longs, 4, 4, MASK8_4, longs, 8, MASK8_4); } - private static void decode5(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 10); - shiftLongs(tmp, 10, longs, 0, 3, MASK8_5); + static void decode5(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 10, longs, 3, 5, MASK8_5, tmp, 0, MASK8_3); for (int iter = 0, tmpIdx = 0, longsIdx = 10; iter < 2; ++iter, tmpIdx += 5, longsIdx += 3) { - long l0 = (tmp[tmpIdx + 0] & MASK8_3) << 2; + long l0 = tmp[tmpIdx + 0] << 2; l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK8_1) << 4; - l1 |= (tmp[tmpIdx + 2] & MASK8_3) << 1; + l1 |= tmp[tmpIdx + 2] << 1; l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK8_1; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 3] & MASK8_2) << 3; - l2 |= (tmp[tmpIdx + 4] & MASK8_3) << 0; + l2 |= tmp[tmpIdx + 4] << 0; longs[longsIdx + 2] = l2; } } - private static void decode6(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 12); - shiftLongs(tmp, 12, longs, 0, 2, MASK8_6); - shiftLongs(tmp, 12, tmp, 0, 0, MASK8_2); + static void decode6(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 12, longs, 2, 6, MASK8_6, tmp, 0, MASK8_2); for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 4; ++iter, tmpIdx += 3, longsIdx += 1) { long l0 = tmp[tmpIdx + 0] << 4; l0 |= tmp[tmpIdx + 1] << 2; @@ -687,10 +465,8 @@ private static void decode6(DataInput in, long[] tmp, long[] longs) throws IOExc } } - private static void decode7(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 14); - shiftLongs(tmp, 14, longs, 0, 1, MASK8_7); - shiftLongs(tmp, 14, tmp, 0, 0, MASK8_1); + static void decode7(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 14, longs, 1, 7, MASK8_7, tmp, 0, MASK8_1); for (int iter = 0, tmpIdx = 0, longsIdx = 14; iter < 2; ++iter, tmpIdx += 7, longsIdx += 1) { long l0 = tmp[tmpIdx + 0] << 6; l0 |= tmp[tmpIdx + 1] << 5; @@ -703,15 +479,14 @@ private static void decode7(DataInput in, long[] tmp, long[] longs) throws IOExc } } - private static void decode8(DataInput in, long[] tmp, long[] longs) throws IOException { + static void decode8(IndexInput in, long[] longs) throws IOException { in.readLongs(longs, 0, 16); } - private static void decode9(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 18); - shiftLongs(tmp, 18, longs, 0, 7, MASK16_9); + static void decode9(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 18, longs, 7, 9, MASK16_9, tmp, 0, MASK16_7); for (int iter = 0, tmpIdx = 0, longsIdx = 18; iter < 2; ++iter, tmpIdx += 9, longsIdx += 7) { - long l0 = (tmp[tmpIdx + 0] & MASK16_7) << 2; + long l0 = tmp[tmpIdx + 0] << 2; l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK16_5) << 4; @@ -721,7 +496,7 @@ private static void 
decode9(DataInput in, long[] tmp, long[] longs) throws IOExc l2 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_6; longs[longsIdx + 2] = l2; long l3 = (tmp[tmpIdx + 3] & MASK16_1) << 8; - l3 |= (tmp[tmpIdx + 4] & MASK16_7) << 1; + l3 |= tmp[tmpIdx + 4] << 1; l3 |= (tmp[tmpIdx + 5] >>> 6) & MASK16_1; longs[longsIdx + 3] = l3; long l4 = (tmp[tmpIdx + 5] & MASK16_6) << 3; @@ -731,59 +506,55 @@ private static void decode9(DataInput in, long[] tmp, long[] longs) throws IOExc l5 |= (tmp[tmpIdx + 7] >>> 2) & MASK16_5; longs[longsIdx + 5] = l5; long l6 = (tmp[tmpIdx + 7] & MASK16_2) << 7; - l6 |= (tmp[tmpIdx + 8] & MASK16_7) << 0; + l6 |= tmp[tmpIdx + 8] << 0; longs[longsIdx + 6] = l6; } } - private static void decode10(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 20); - shiftLongs(tmp, 20, longs, 0, 6, MASK16_10); + static void decode10(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 20, longs, 6, 10, MASK16_10, tmp, 0, MASK16_6); for (int iter = 0, tmpIdx = 0, longsIdx = 20; iter < 4; ++iter, tmpIdx += 5, longsIdx += 3) { - long l0 = (tmp[tmpIdx + 0] & MASK16_6) << 4; + long l0 = tmp[tmpIdx + 0] << 4; l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK16_2) << 8; - l1 |= (tmp[tmpIdx + 2] & MASK16_6) << 2; + l1 |= tmp[tmpIdx + 2] << 2; l1 |= (tmp[tmpIdx + 3] >>> 4) & MASK16_2; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 3] & MASK16_4) << 6; - l2 |= (tmp[tmpIdx + 4] & MASK16_6) << 0; + l2 |= tmp[tmpIdx + 4] << 0; longs[longsIdx + 2] = l2; } } - private static void decode11(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 22); - shiftLongs(tmp, 22, longs, 0, 5, MASK16_11); + static void decode11(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 22, longs, 5, 11, MASK16_11, tmp, 0, MASK16_5); for (int iter = 0, tmpIdx = 0, longsIdx = 22; iter < 2; ++iter, tmpIdx += 11, longsIdx += 5) { - long l0 = (tmp[tmpIdx + 0] & MASK16_5) << 6; - l0 |= (tmp[tmpIdx + 1] & MASK16_5) << 1; + long l0 = tmp[tmpIdx + 0] << 6; + l0 |= tmp[tmpIdx + 1] << 1; l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK16_1; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 2] & MASK16_4) << 7; - l1 |= (tmp[tmpIdx + 3] & MASK16_5) << 2; + l1 |= tmp[tmpIdx + 3] << 2; l1 |= (tmp[tmpIdx + 4] >>> 3) & MASK16_2; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 4] & MASK16_3) << 8; - l2 |= (tmp[tmpIdx + 5] & MASK16_5) << 3; + l2 |= tmp[tmpIdx + 5] << 3; l2 |= (tmp[tmpIdx + 6] >>> 2) & MASK16_3; longs[longsIdx + 2] = l2; long l3 = (tmp[tmpIdx + 6] & MASK16_2) << 9; - l3 |= (tmp[tmpIdx + 7] & MASK16_5) << 4; + l3 |= tmp[tmpIdx + 7] << 4; l3 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_4; longs[longsIdx + 3] = l3; long l4 = (tmp[tmpIdx + 8] & MASK16_1) << 10; - l4 |= (tmp[tmpIdx + 9] & MASK16_5) << 5; - l4 |= (tmp[tmpIdx + 10] & MASK16_5) << 0; + l4 |= tmp[tmpIdx + 9] << 5; + l4 |= tmp[tmpIdx + 10] << 0; longs[longsIdx + 4] = l4; } } - private static void decode12(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 24); - shiftLongs(tmp, 24, longs, 0, 4, MASK16_12); - shiftLongs(tmp, 24, tmp, 0, 0, MASK16_4); + static void decode12(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 24, longs, 4, 12, MASK16_12, tmp, 0, MASK16_4); for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 8; ++iter, tmpIdx += 3, longsIdx += 1) { long l0 = tmp[tmpIdx + 0] << 8; l0 |= tmp[tmpIdx + 1] << 4; @@ -792,35 +563,32 @@ private static void 
decode12(DataInput in, long[] tmp, long[] longs) throws IOEx } } - private static void decode13(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 26); - shiftLongs(tmp, 26, longs, 0, 3, MASK16_13); + static void decode13(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 26, longs, 3, 13, MASK16_13, tmp, 0, MASK16_3); for (int iter = 0, tmpIdx = 0, longsIdx = 26; iter < 2; ++iter, tmpIdx += 13, longsIdx += 3) { - long l0 = (tmp[tmpIdx + 0] & MASK16_3) << 10; - l0 |= (tmp[tmpIdx + 1] & MASK16_3) << 7; - l0 |= (tmp[tmpIdx + 2] & MASK16_3) << 4; - l0 |= (tmp[tmpIdx + 3] & MASK16_3) << 1; + long l0 = tmp[tmpIdx + 0] << 10; + l0 |= tmp[tmpIdx + 1] << 7; + l0 |= tmp[tmpIdx + 2] << 4; + l0 |= tmp[tmpIdx + 3] << 1; l0 |= (tmp[tmpIdx + 4] >>> 2) & MASK16_1; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 4] & MASK16_2) << 11; - l1 |= (tmp[tmpIdx + 5] & MASK16_3) << 8; - l1 |= (tmp[tmpIdx + 6] & MASK16_3) << 5; - l1 |= (tmp[tmpIdx + 7] & MASK16_3) << 2; + l1 |= tmp[tmpIdx + 5] << 8; + l1 |= tmp[tmpIdx + 6] << 5; + l1 |= tmp[tmpIdx + 7] << 2; l1 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_2; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 8] & MASK16_1) << 12; - l2 |= (tmp[tmpIdx + 9] & MASK16_3) << 9; - l2 |= (tmp[tmpIdx + 10] & MASK16_3) << 6; - l2 |= (tmp[tmpIdx + 11] & MASK16_3) << 3; - l2 |= (tmp[tmpIdx + 12] & MASK16_3) << 0; + l2 |= tmp[tmpIdx + 9] << 9; + l2 |= tmp[tmpIdx + 10] << 6; + l2 |= tmp[tmpIdx + 11] << 3; + l2 |= tmp[tmpIdx + 12] << 0; longs[longsIdx + 2] = l2; } } - private static void decode14(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 28); - shiftLongs(tmp, 28, longs, 0, 2, MASK16_14); - shiftLongs(tmp, 28, tmp, 0, 0, MASK16_2); + static void decode14(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 28, longs, 2, 14, MASK16_14, tmp, 0, MASK16_2); for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 4; ++iter, tmpIdx += 7, longsIdx += 1) { long l0 = tmp[tmpIdx + 0] << 12; l0 |= tmp[tmpIdx + 1] << 10; @@ -833,10 +601,8 @@ private static void decode14(DataInput in, long[] tmp, long[] longs) throws IOEx } } - private static void decode15(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 30); - shiftLongs(tmp, 30, longs, 0, 1, MASK16_15); - shiftLongs(tmp, 30, tmp, 0, 0, MASK16_1); + static void decode15(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 30, longs, 1, 15, MASK16_15, tmp, 0, MASK16_1); for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 15, longsIdx += 1) { long l0 = tmp[tmpIdx + 0] << 14; l0 |= tmp[tmpIdx + 1] << 13; @@ -857,15 +623,14 @@ private static void decode15(DataInput in, long[] tmp, long[] longs) throws IOEx } } - private static void decode16(DataInput in, long[] tmp, long[] longs) throws IOException { + static void decode16(IndexInput in, long[] longs) throws IOException { in.readLongs(longs, 0, 32); } - private static void decode17(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 34); - shiftLongs(tmp, 34, longs, 0, 15, MASK32_17); + static void decode17(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 34, longs, 15, 17, MASK32_17, tmp, 0, MASK32_15); for (int iter = 0, tmpIdx = 0, longsIdx = 34; iter < 2; ++iter, tmpIdx += 17, longsIdx += 15) { - long l0 = (tmp[tmpIdx + 0] & MASK32_15) << 2; + long l0 = tmp[tmpIdx + 0] << 2; l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2; longs[longsIdx + 
0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK32_13) << 4; @@ -887,7 +652,7 @@ private static void decode17(DataInput in, long[] tmp, long[] longs) throws IOEx l6 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_14; longs[longsIdx + 6] = l6; long l7 = (tmp[tmpIdx + 7] & MASK32_1) << 16; - l7 |= (tmp[tmpIdx + 8] & MASK32_15) << 1; + l7 |= tmp[tmpIdx + 8] << 1; l7 |= (tmp[tmpIdx + 9] >>> 14) & MASK32_1; longs[longsIdx + 7] = l7; long l8 = (tmp[tmpIdx + 9] & MASK32_14) << 3; @@ -909,16 +674,15 @@ private static void decode17(DataInput in, long[] tmp, long[] longs) throws IOEx l13 |= (tmp[tmpIdx + 15] >>> 2) & MASK32_13; longs[longsIdx + 13] = l13; long l14 = (tmp[tmpIdx + 15] & MASK32_2) << 15; - l14 |= (tmp[tmpIdx + 16] & MASK32_15) << 0; + l14 |= tmp[tmpIdx + 16] << 0; longs[longsIdx + 14] = l14; } } - private static void decode18(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 36); - shiftLongs(tmp, 36, longs, 0, 14, MASK32_18); + static void decode18(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 36, longs, 14, 18, MASK32_18, tmp, 0, MASK32_14); for (int iter = 0, tmpIdx = 0, longsIdx = 36; iter < 4; ++iter, tmpIdx += 9, longsIdx += 7) { - long l0 = (tmp[tmpIdx + 0] & MASK32_14) << 4; + long l0 = tmp[tmpIdx + 0] << 4; l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK32_10) << 8; @@ -928,7 +692,7 @@ private static void decode18(DataInput in, long[] tmp, long[] longs) throws IOEx l2 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_12; longs[longsIdx + 2] = l2; long l3 = (tmp[tmpIdx + 3] & MASK32_2) << 16; - l3 |= (tmp[tmpIdx + 4] & MASK32_14) << 2; + l3 |= tmp[tmpIdx + 4] << 2; l3 |= (tmp[tmpIdx + 5] >>> 12) & MASK32_2; longs[longsIdx + 3] = l3; long l4 = (tmp[tmpIdx + 5] & MASK32_12) << 6; @@ -938,206 +702,199 @@ private static void decode18(DataInput in, long[] tmp, long[] longs) throws IOEx l5 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_10; longs[longsIdx + 5] = l5; long l6 = (tmp[tmpIdx + 7] & MASK32_4) << 14; - l6 |= (tmp[tmpIdx + 8] & MASK32_14) << 0; + l6 |= tmp[tmpIdx + 8] << 0; longs[longsIdx + 6] = l6; } } - private static void decode19(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 38); - shiftLongs(tmp, 38, longs, 0, 13, MASK32_19); + static void decode19(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 38, longs, 13, 19, MASK32_19, tmp, 0, MASK32_13); for (int iter = 0, tmpIdx = 0, longsIdx = 38; iter < 2; ++iter, tmpIdx += 19, longsIdx += 13) { - long l0 = (tmp[tmpIdx + 0] & MASK32_13) << 6; + long l0 = tmp[tmpIdx + 0] << 6; l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK32_7) << 12; l1 |= (tmp[tmpIdx + 2] >>> 1) & MASK32_12; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 2] & MASK32_1) << 18; - l2 |= (tmp[tmpIdx + 3] & MASK32_13) << 5; + l2 |= tmp[tmpIdx + 3] << 5; l2 |= (tmp[tmpIdx + 4] >>> 8) & MASK32_5; longs[longsIdx + 2] = l2; long l3 = (tmp[tmpIdx + 4] & MASK32_8) << 11; l3 |= (tmp[tmpIdx + 5] >>> 2) & MASK32_11; longs[longsIdx + 3] = l3; long l4 = (tmp[tmpIdx + 5] & MASK32_2) << 17; - l4 |= (tmp[tmpIdx + 6] & MASK32_13) << 4; + l4 |= tmp[tmpIdx + 6] << 4; l4 |= (tmp[tmpIdx + 7] >>> 9) & MASK32_4; longs[longsIdx + 4] = l4; long l5 = (tmp[tmpIdx + 7] & MASK32_9) << 10; l5 |= (tmp[tmpIdx + 8] >>> 3) & MASK32_10; longs[longsIdx + 5] = l5; long l6 = (tmp[tmpIdx + 8] & MASK32_3) << 16; - l6 |= (tmp[tmpIdx + 9] & MASK32_13) << 3; + l6 |= tmp[tmpIdx + 9] << 3; l6 |= 
(tmp[tmpIdx + 10] >>> 10) & MASK32_3; longs[longsIdx + 6] = l6; long l7 = (tmp[tmpIdx + 10] & MASK32_10) << 9; l7 |= (tmp[tmpIdx + 11] >>> 4) & MASK32_9; longs[longsIdx + 7] = l7; long l8 = (tmp[tmpIdx + 11] & MASK32_4) << 15; - l8 |= (tmp[tmpIdx + 12] & MASK32_13) << 2; + l8 |= tmp[tmpIdx + 12] << 2; l8 |= (tmp[tmpIdx + 13] >>> 11) & MASK32_2; longs[longsIdx + 8] = l8; long l9 = (tmp[tmpIdx + 13] & MASK32_11) << 8; l9 |= (tmp[tmpIdx + 14] >>> 5) & MASK32_8; longs[longsIdx + 9] = l9; long l10 = (tmp[tmpIdx + 14] & MASK32_5) << 14; - l10 |= (tmp[tmpIdx + 15] & MASK32_13) << 1; + l10 |= tmp[tmpIdx + 15] << 1; l10 |= (tmp[tmpIdx + 16] >>> 12) & MASK32_1; longs[longsIdx + 10] = l10; long l11 = (tmp[tmpIdx + 16] & MASK32_12) << 7; l11 |= (tmp[tmpIdx + 17] >>> 6) & MASK32_7; longs[longsIdx + 11] = l11; long l12 = (tmp[tmpIdx + 17] & MASK32_6) << 13; - l12 |= (tmp[tmpIdx + 18] & MASK32_13) << 0; + l12 |= tmp[tmpIdx + 18] << 0; longs[longsIdx + 12] = l12; } } - private static void decode20(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 40); - shiftLongs(tmp, 40, longs, 0, 12, MASK32_20); + static void decode20(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 40, longs, 12, 20, MASK32_20, tmp, 0, MASK32_12); for (int iter = 0, tmpIdx = 0, longsIdx = 40; iter < 8; ++iter, tmpIdx += 5, longsIdx += 3) { - long l0 = (tmp[tmpIdx + 0] & MASK32_12) << 8; + long l0 = tmp[tmpIdx + 0] << 8; l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK32_4) << 16; - l1 |= (tmp[tmpIdx + 2] & MASK32_12) << 4; + l1 |= tmp[tmpIdx + 2] << 4; l1 |= (tmp[tmpIdx + 3] >>> 8) & MASK32_4; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 3] & MASK32_8) << 12; - l2 |= (tmp[tmpIdx + 4] & MASK32_12) << 0; + l2 |= tmp[tmpIdx + 4] << 0; longs[longsIdx + 2] = l2; } } - private static void decode21(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 42); - shiftLongs(tmp, 42, longs, 0, 11, MASK32_21); + static void decode21(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 42, longs, 11, 21, MASK32_21, tmp, 0, MASK32_11); for (int iter = 0, tmpIdx = 0, longsIdx = 42; iter < 2; ++iter, tmpIdx += 21, longsIdx += 11) { - long l0 = (tmp[tmpIdx + 0] & MASK32_11) << 10; + long l0 = tmp[tmpIdx + 0] << 10; l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 1] & MASK32_1) << 20; - l1 |= (tmp[tmpIdx + 2] & MASK32_11) << 9; + l1 |= tmp[tmpIdx + 2] << 9; l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_9; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 3] & MASK32_2) << 19; - l2 |= (tmp[tmpIdx + 4] & MASK32_11) << 8; + l2 |= tmp[tmpIdx + 4] << 8; l2 |= (tmp[tmpIdx + 5] >>> 3) & MASK32_8; longs[longsIdx + 2] = l2; long l3 = (tmp[tmpIdx + 5] & MASK32_3) << 18; - l3 |= (tmp[tmpIdx + 6] & MASK32_11) << 7; + l3 |= tmp[tmpIdx + 6] << 7; l3 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_7; longs[longsIdx + 3] = l3; long l4 = (tmp[tmpIdx + 7] & MASK32_4) << 17; - l4 |= (tmp[tmpIdx + 8] & MASK32_11) << 6; + l4 |= tmp[tmpIdx + 8] << 6; l4 |= (tmp[tmpIdx + 9] >>> 5) & MASK32_6; longs[longsIdx + 4] = l4; long l5 = (tmp[tmpIdx + 9] & MASK32_5) << 16; - l5 |= (tmp[tmpIdx + 10] & MASK32_11) << 5; + l5 |= tmp[tmpIdx + 10] << 5; l5 |= (tmp[tmpIdx + 11] >>> 6) & MASK32_5; longs[longsIdx + 5] = l5; long l6 = (tmp[tmpIdx + 11] & MASK32_6) << 15; - l6 |= (tmp[tmpIdx + 12] & MASK32_11) << 4; + l6 |= tmp[tmpIdx + 12] << 4; l6 |= (tmp[tmpIdx + 13] >>> 7) & MASK32_4; 
longs[longsIdx + 6] = l6; long l7 = (tmp[tmpIdx + 13] & MASK32_7) << 14; - l7 |= (tmp[tmpIdx + 14] & MASK32_11) << 3; + l7 |= tmp[tmpIdx + 14] << 3; l7 |= (tmp[tmpIdx + 15] >>> 8) & MASK32_3; longs[longsIdx + 7] = l7; long l8 = (tmp[tmpIdx + 15] & MASK32_8) << 13; - l8 |= (tmp[tmpIdx + 16] & MASK32_11) << 2; + l8 |= tmp[tmpIdx + 16] << 2; l8 |= (tmp[tmpIdx + 17] >>> 9) & MASK32_2; longs[longsIdx + 8] = l8; long l9 = (tmp[tmpIdx + 17] & MASK32_9) << 12; - l9 |= (tmp[tmpIdx + 18] & MASK32_11) << 1; + l9 |= tmp[tmpIdx + 18] << 1; l9 |= (tmp[tmpIdx + 19] >>> 10) & MASK32_1; longs[longsIdx + 9] = l9; long l10 = (tmp[tmpIdx + 19] & MASK32_10) << 11; - l10 |= (tmp[tmpIdx + 20] & MASK32_11) << 0; + l10 |= tmp[tmpIdx + 20] << 0; longs[longsIdx + 10] = l10; } } - private static void decode22(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 44); - shiftLongs(tmp, 44, longs, 0, 10, MASK32_22); + static void decode22(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 44, longs, 10, 22, MASK32_22, tmp, 0, MASK32_10); for (int iter = 0, tmpIdx = 0, longsIdx = 44; iter < 4; ++iter, tmpIdx += 11, longsIdx += 5) { - long l0 = (tmp[tmpIdx + 0] & MASK32_10) << 12; - l0 |= (tmp[tmpIdx + 1] & MASK32_10) << 2; + long l0 = tmp[tmpIdx + 0] << 12; + l0 |= tmp[tmpIdx + 1] << 2; l0 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 2] & MASK32_8) << 14; - l1 |= (tmp[tmpIdx + 3] & MASK32_10) << 4; + l1 |= tmp[tmpIdx + 3] << 4; l1 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 4] & MASK32_6) << 16; - l2 |= (tmp[tmpIdx + 5] & MASK32_10) << 6; + l2 |= tmp[tmpIdx + 5] << 6; l2 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6; longs[longsIdx + 2] = l2; long l3 = (tmp[tmpIdx + 6] & MASK32_4) << 18; - l3 |= (tmp[tmpIdx + 7] & MASK32_10) << 8; + l3 |= tmp[tmpIdx + 7] << 8; l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8; longs[longsIdx + 3] = l3; long l4 = (tmp[tmpIdx + 8] & MASK32_2) << 20; - l4 |= (tmp[tmpIdx + 9] & MASK32_10) << 10; - l4 |= (tmp[tmpIdx + 10] & MASK32_10) << 0; + l4 |= tmp[tmpIdx + 9] << 10; + l4 |= tmp[tmpIdx + 10] << 0; longs[longsIdx + 4] = l4; } } - private static void decode23(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 46); - shiftLongs(tmp, 46, longs, 0, 9, MASK32_23); + static void decode23(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 46, longs, 9, 23, MASK32_23, tmp, 0, MASK32_9); for (int iter = 0, tmpIdx = 0, longsIdx = 46; iter < 2; ++iter, tmpIdx += 23, longsIdx += 9) { - long l0 = (tmp[tmpIdx + 0] & MASK32_9) << 14; - l0 |= (tmp[tmpIdx + 1] & MASK32_9) << 5; + long l0 = tmp[tmpIdx + 0] << 14; + l0 |= tmp[tmpIdx + 1] << 5; l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK32_5; longs[longsIdx + 0] = l0; long l1 = (tmp[tmpIdx + 2] & MASK32_4) << 19; - l1 |= (tmp[tmpIdx + 3] & MASK32_9) << 10; - l1 |= (tmp[tmpIdx + 4] & MASK32_9) << 1; + l1 |= tmp[tmpIdx + 3] << 10; + l1 |= tmp[tmpIdx + 4] << 1; l1 |= (tmp[tmpIdx + 5] >>> 8) & MASK32_1; longs[longsIdx + 1] = l1; long l2 = (tmp[tmpIdx + 5] & MASK32_8) << 15; - l2 |= (tmp[tmpIdx + 6] & MASK32_9) << 6; + l2 |= tmp[tmpIdx + 6] << 6; l2 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_6; longs[longsIdx + 2] = l2; long l3 = (tmp[tmpIdx + 7] & MASK32_3) << 20; - l3 |= (tmp[tmpIdx + 8] & MASK32_9) << 11; - l3 |= (tmp[tmpIdx + 9] & MASK32_9) << 2; + l3 |= tmp[tmpIdx + 8] << 11; + l3 |= tmp[tmpIdx + 9] << 2; l3 |= (tmp[tmpIdx + 10] >>> 7) & MASK32_2; longs[longsIdx + 3] = l3; long l4 = 
(tmp[tmpIdx + 10] & MASK32_7) << 16; - l4 |= (tmp[tmpIdx + 11] & MASK32_9) << 7; + l4 |= tmp[tmpIdx + 11] << 7; l4 |= (tmp[tmpIdx + 12] >>> 2) & MASK32_7; longs[longsIdx + 4] = l4; long l5 = (tmp[tmpIdx + 12] & MASK32_2) << 21; - l5 |= (tmp[tmpIdx + 13] & MASK32_9) << 12; - l5 |= (tmp[tmpIdx + 14] & MASK32_9) << 3; + l5 |= tmp[tmpIdx + 13] << 12; + l5 |= tmp[tmpIdx + 14] << 3; l5 |= (tmp[tmpIdx + 15] >>> 6) & MASK32_3; longs[longsIdx + 5] = l5; long l6 = (tmp[tmpIdx + 15] & MASK32_6) << 17; - l6 |= (tmp[tmpIdx + 16] & MASK32_9) << 8; + l6 |= tmp[tmpIdx + 16] << 8; l6 |= (tmp[tmpIdx + 17] >>> 1) & MASK32_8; longs[longsIdx + 6] = l6; long l7 = (tmp[tmpIdx + 17] & MASK32_1) << 22; - l7 |= (tmp[tmpIdx + 18] & MASK32_9) << 13; - l7 |= (tmp[tmpIdx + 19] & MASK32_9) << 4; + l7 |= tmp[tmpIdx + 18] << 13; + l7 |= tmp[tmpIdx + 19] << 4; l7 |= (tmp[tmpIdx + 20] >>> 5) & MASK32_4; longs[longsIdx + 7] = l7; long l8 = (tmp[tmpIdx + 20] & MASK32_5) << 18; - l8 |= (tmp[tmpIdx + 21] & MASK32_9) << 9; - l8 |= (tmp[tmpIdx + 22] & MASK32_9) << 0; + l8 |= tmp[tmpIdx + 21] << 9; + l8 |= tmp[tmpIdx + 22] << 0; longs[longsIdx + 8] = l8; } } - private static void decode24(DataInput in, long[] tmp, long[] longs) throws IOException { - in.readLongs(tmp, 0, 48); - shiftLongs(tmp, 48, longs, 0, 8, MASK32_24); - shiftLongs(tmp, 48, tmp, 0, 0, MASK32_8); + static void decode24(IndexInput in, long[] tmp, long[] longs) throws IOException { + splitLongs(in, 48, longs, 8, 24, MASK32_24, tmp, 0, MASK32_8); for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 16; ++iter, tmpIdx += 3, longsIdx += 1) { long l0 = tmp[tmpIdx + 0] << 16; l0 |= tmp[tmpIdx + 1] << 8; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java index cb4ef755a6bc..e455c2cc6a30 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.backward_codecs.lucene912; import java.util.Objects; import org.apache.lucene.codecs.Codec; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsFormat.java similarity index 94% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsFormat.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsFormat.java index 1c452175b070..a0342635d766 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
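For reference, the decode24 tail loop at the end of ForUtil above stitches three 8-bit remainder lanes back into one 24-bit value, exactly like assembling a value byte by byte. In isolation (hypothetical demo class):

public class Combine24Demo {
  public static void main(String[] args) {
    long hi = 0xAB, mid = 0xCD, lo = 0xEF;           // three 8-bit remainder lanes
    long value = (hi << 16) | (mid << 8) | lo;       // recombined 24-bit value
    System.out.println(Long.toHexString(value));     // abcdef
  }
}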
*/ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.backward_codecs.lucene912; import java.io.IOException; import org.apache.lucene.codecs.BlockTermState; @@ -23,7 +23,6 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.IndexOptions; @@ -318,7 +317,7 @@ * * @lucene.experimental */ -public final class Lucene912PostingsFormat extends PostingsFormat { +public class Lucene912PostingsFormat extends PostingsFormat { /** Filename extension for some small metadata about how postings are encoded. */ public static final String META_EXTENSION = "psm"; @@ -341,7 +340,7 @@ public final class Lucene912PostingsFormat extends PostingsFormat { /** Size of blocks. */ public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE; - public static final int BLOCK_MASK = BLOCK_SIZE - 1; + static final int BLOCK_MASK = BLOCK_SIZE - 1; /** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */ public static final int LEVEL1_FACTOR = 32; @@ -349,7 +348,7 @@ public final class Lucene912PostingsFormat extends PostingsFormat { /** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */ public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE; - public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1; + static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1; static final String TERMS_CODEC = "Lucene90PostingsWriterTerms"; static final String META_CODEC = "Lucene912PostingsWriterMeta"; @@ -360,45 +359,15 @@ public final class Lucene912PostingsFormat extends PostingsFormat { static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; - private final int minTermBlockSize; - private final int maxTermBlockSize; - /** Creates {@code Lucene912PostingsFormat} with default settings. */ public Lucene912PostingsFormat() { - this( - Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, - Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); - } - - /** - * Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code - * maxBlockSize} passed to block terms dictionary. 
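For orientation, the block and skip-level constants kept above work out as follows: level 0 skip data covers one block of 128 docs, level 1 covers LEVEL1_FACTOR = 32 blocks, i.e. 4,096 docs, and because both sizes are powers of two the masks give the offset within the current block or level-1 window with a single AND. A quick standalone check (hypothetical main, not part of the patch):

public class SkipLevelMathDemo {
  static final int BLOCK_SIZE = 128;
  static final int BLOCK_MASK = BLOCK_SIZE - 1;
  static final int LEVEL1_FACTOR = 32;
  static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE;
  static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;

  public static void main(String[] args) {
    System.out.println(LEVEL1_NUM_DOCS);        // 4096
    int docUpto = 9_000;
    System.out.println(docUpto & BLOCK_MASK);   // 40: offset inside its block (9000 % 128)
    System.out.println(docUpto & LEVEL1_MASK);  // 808: offset inside its level-1 window (9000 % 4096)
  }
}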
- * - * @see - * Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) - */ - public Lucene912PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { super("Lucene912"); - Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); - this.minTermBlockSize = minTermBlockSize; - this.maxTermBlockSize = maxTermBlockSize; } @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state); - boolean success = false; - try { - FieldsConsumer ret = - new Lucene90BlockTreeTermsWriter( - state, postingsWriter, minTermBlockSize, maxTermBlockSize); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsWriter); - } - } + throw new UnsupportedOperationException( + "This postings format may not be used for writing, use the current postings format"); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsReader.java similarity index 68% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsReader.java index 5e66a200929e..a51c848c4cce 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsReader.java @@ -14,17 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
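The fieldsConsumer change above turns this format into a read-only backward codec: old segments can still be opened, but any attempt to write with it fails fast. A schematic of that pattern with a hypothetical class, not the actual Lucene types:

public class ReadOnlyFormatDemo {
  static Object fieldsConsumer() {
    // Mirrors the message used in the patch: writing must go through the current format.
    throw new UnsupportedOperationException(
        "This postings format may not be used for writing, use the current postings format");
  }

  public static void main(String[] args) {
    try {
      fieldsConsumer();
    } catch (UnsupportedOperationException e) {
      System.out.println("write path disabled: " + e.getMessage());
    }
  }
}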
*/ -package org.apache.lucene.codecs.lucene912; - -import static org.apache.lucene.codecs.lucene912.ForUtil.BLOCK_SIZE; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.LEVEL1_NUM_DOCS; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.META_CODEC; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_START; +package org.apache.lucene.backward_codecs.lucene912; + +import static org.apache.lucene.backward_codecs.lucene912.ForUtil.BLOCK_SIZE; +import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC; +import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.LEVEL1_NUM_DOCS; +import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.META_CODEC; +import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC; +import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.POS_CODEC; +import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT; +import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_START; import java.io.IOException; import java.util.AbstractList; @@ -32,10 +32,10 @@ import java.util.Collections; import java.util.List; import java.util.RandomAccess; +import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Impact; import org.apache.lucene.index.Impacts; @@ -45,7 +45,6 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SlowImpactsEnum; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -54,7 +53,6 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IOUtils; /** @@ -64,6 +62,13 @@ */ public final class Lucene912PostingsReader extends PostingsReaderBase { + // Dummy impacts, composed of the maximum possible term frequency and the lowest possible + // (unsigned) norm value. This is typically used on tail blocks, which don't actually record + // impacts as the storage overhead would not be worth any query evaluation speedup, since there's + // less than 128 docs left to evaluate anyway. 
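The dummy impact described in the comment above (maximum possible term frequency paired with the lowest norm) is safe precisely because it can only overestimate: under any scorer where a higher frequency helps and a larger norm hurts, no real (freq, norm) pair in a tail block can score above it, so skipping decisions stay correct. A toy standalone sketch with an invented scoring function, purely for intuition:

public class DummyImpactDemo {
  // Toy impact-style upper bound: larger freq helps, larger norm (longer doc) hurts.
  static double upperBound(long freq, long norm) {
    return freq / (freq + (double) norm);
  }

  public static void main(String[] args) {
    double dummy = upperBound(Integer.MAX_VALUE, 1L);
    double real = upperBound(3, 120);
    System.out.println(dummy >= real); // true: the dummy never underestimates a real impact
  }
}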
+ private static final List DUMMY_IMPACTS = + Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + private final IndexInput docIn; private final IndexInput posIn; private final IndexInput payIn; @@ -73,8 +78,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { private final int maxNumImpactsAtLevel1; private final int maxImpactNumBytesAtLevel1; - private final int version; - /** Sole constructor. */ public Lucene912PostingsReader(SegmentReadState state) throws IOException { String metaName = @@ -83,6 +86,7 @@ public Lucene912PostingsReader(SegmentReadState state) throws IOException { final long expectedDocFileLength, expectedPosFileLength, expectedPayFileLength; ChecksumIndexInput metaIn = null; boolean success = false; + int version; try { metaIn = state.directory.openChecksumInput(metaName); version = @@ -208,15 +212,6 @@ static void prefixSum(long[] buffer, int count, long base) { } } - static int findFirstGreater(long[] buffer, int target, int from) { - for (int i = from; i < BLOCK_SIZE; ++i) { - if (buffer[i] >= target) { - return i; - } - } - return BLOCK_SIZE; - } - @Override public BlockTermState newTermState() { return new IntBlockTermState(); @@ -232,13 +227,6 @@ public void decodeTerm( DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IntBlockTermState termState = (IntBlockTermState) _termState; - final boolean fieldHasPositions = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - final boolean fieldHasOffsets = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0; - final boolean fieldHasPayloads = fieldInfo.hasPayloads(); - if (absolute) { termState.docStartFP = 0; termState.posStartFP = 0; @@ -259,9 +247,13 @@ public void decodeTerm( termState.singletonDocID += BitUtil.zigZagDecode(l >>> 1); } - if (fieldHasPositions) { + if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { termState.posStartFP += in.readVLong(); - if (fieldHasOffsets || fieldHasPayloads) { + if (fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0 + || fieldInfo.hasPayloads()) { termState.payStartFP += in.readVLong(); } if (termState.totalTermFreq > BLOCK_SIZE) { @@ -276,154 +268,114 @@ public void decodeTerm( public PostingsEnum postings( FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException { - - boolean indexHasPositions = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - - if (indexHasPositions == false + if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0 || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { - BlockDocsEnum docsEnum; - if (reuse instanceof BlockDocsEnum) { - docsEnum = (BlockDocsEnum) reuse; - if (!docsEnum.canReuse(docIn, fieldInfo)) { - docsEnum = new BlockDocsEnum(fieldInfo); - } - } else { - docsEnum = new BlockDocsEnum(fieldInfo); - } - return docsEnum.reset((IntBlockTermState) termState, flags); + return (reuse instanceof BlockDocsEnum blockDocsEnum + && blockDocsEnum.canReuse(docIn, fieldInfo) + ? 
blockDocsEnum + : new BlockDocsEnum(fieldInfo)) + .reset((IntBlockTermState) termState, flags); } else { - EverythingEnum everythingEnum; - if (reuse instanceof EverythingEnum) { - everythingEnum = (EverythingEnum) reuse; - if (!everythingEnum.canReuse(docIn, fieldInfo)) { - everythingEnum = new EverythingEnum(fieldInfo); - } - } else { - everythingEnum = new EverythingEnum(fieldInfo); - } - return everythingEnum.reset((IntBlockTermState) termState, flags); + return (reuse instanceof EverythingEnum everythingEnum + && everythingEnum.canReuse(docIn, fieldInfo) + ? everythingEnum + : new EverythingEnum(fieldInfo)) + .reset((IntBlockTermState) termState, flags); } } @Override public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags) throws IOException { - final boolean indexHasFreqs = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + final IndexOptions options = fieldInfo.getIndexOptions(); final boolean indexHasPositions = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - if (state.docFreq >= BLOCK_SIZE - && indexHasFreqs - && (indexHasPositions == false - || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false)) { - return new BlockImpactsDocsEnum(fieldInfo, (IntBlockTermState) state); - } - - final boolean indexHasOffsets = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0; - final boolean indexHasPayloads = fieldInfo.hasPayloads(); + if (state.docFreq >= BLOCK_SIZE) { + if (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 + && (indexHasPositions == false + || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false)) { + return new BlockImpactsDocsEnum(indexHasPositions, (IntBlockTermState) state); + } - if (state.docFreq >= BLOCK_SIZE - && indexHasPositions - && (indexHasOffsets == false - || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) - && (indexHasPayloads == false - || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { - return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); + if (indexHasPositions + && (options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0 + || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) + && (fieldInfo.hasPayloads() == false + || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { + return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); + } } return new SlowImpactsEnum(postings(fieldInfo, state, null, flags)); } - final class BlockDocsEnum extends PostingsEnum { - - final ForUtil forUtil = new ForUtil(); - final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil); - final PForUtil pforUtil = new PForUtil(forUtil); - - private final long[] docBuffer = new long[BLOCK_SIZE + 1]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; + private static long sumOverRange(long[] arr, int start, int end) { + long res = 0L; + for (int i = start; i < end; i++) { + res += arr[i]; + } + return res; + } - private int docBufferUpto; + private abstract class AbstractPostingsEnum extends PostingsEnum { - final IndexInput startDocIn; + protected ForDeltaUtil forDeltaUtil; + protected PForUtil pforUtil; - IndexInput docIn; - final boolean indexHasFreq; - final boolean indexHasPos; - final boolean indexHasOffsetsOrPayloads; + protected final long[] docBuffer = new long[BLOCK_SIZE + 1]; + 
protected final boolean indexHasFreq; - private int docFreq; // number of docs in this posting list - private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) - private int docCountUpto; // number of docs in or before the current block - private int doc; // doc we last read - private long prevDocID; // last doc ID of the previous block + protected int doc; // doc we last read // level 0 skip data - private int level0LastDocID; + protected int level0LastDocID; + // level 1 skip data - private int level1LastDocID; - private long level1DocEndFP; - private int level1DocCountUpto; + protected int level1LastDocID; + protected long level1DocEndFP; + protected int level1DocCountUpto; - private boolean needsFreq; // true if the caller actually needs frequencies - private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 - private long freqFP; + protected int docFreq; // number of docs in this posting list + protected long + totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + + protected int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + protected int docCountUpto; // number of docs in or before the current block + protected long prevDocID; // last doc ID of the previous block - public BlockDocsEnum(FieldInfo fieldInfo) throws IOException { - this.startDocIn = Lucene912PostingsReader.this.docIn; - this.docIn = null; + protected int docBufferSize; + protected int docBufferUpto; + + protected IndexInput docIn; + + protected AbstractPostingsEnum(FieldInfo fieldInfo) { indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasPos = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - indexHasOffsetsOrPayloads = - fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0 - || fieldInfo.hasPayloads(); // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in // advance() docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; } - public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { - return docIn == startDocIn - && indexHasFreq - == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) - && indexHasPos - == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) - >= 0) - && indexHasOffsetsOrPayloads - == (fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0 - || fieldInfo.hasPayloads()); + @Override + public int docID() { + return doc; } - public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + protected void resetIndexInput(IntBlockTermState termState) throws IOException { docFreq = termState.docFreq; - totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; singletonDocID = termState.singletonDocID; if (docFreq > 1) { if (docIn == null) { // lazy init - docIn = startDocIn.clone(); + docIn = Lucene912PostingsReader.this.docIn.clone(); } prefetchPostings(docIn, termState); } + } + protected PostingsEnum resetIdsAndLevelParams(IntBlockTermState termState) throws IOException { doc = -1; - this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); - if (indexHasFreq == false || needsFreq == false) { - // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to - // not fill more than `docFreq` entries. 
- Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); - } prevDocID = -1; docCountUpto = 0; level0LastDocID = -1; @@ -437,10 +389,46 @@ public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOExcep level1DocEndFP = termState.docStartFP; } level1DocCountUpto = 0; + docBufferSize = BLOCK_SIZE; docBufferUpto = BLOCK_SIZE; - freqFP = -1; return this; } + } + + final class BlockDocsEnum extends AbstractPostingsEnum { + + private final long[] freqBuffer = new long[BLOCK_SIZE]; + + private boolean needsFreq; // true if the caller actually needs frequencies + private long freqFP; + + public BlockDocsEnum(FieldInfo fieldInfo) { + super(fieldInfo); + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + final IndexOptions options = fieldInfo.getIndexOptions(); + return docIn == Lucene912PostingsReader.this.docIn + && indexHasFreq == (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0); + } + + public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + resetIndexInput(termState); + if (pforUtil == null && docFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(); + forDeltaUtil = new ForDeltaUtil(); + } + totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; + + this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); + if (indexHasFreq == false || needsFreq == false) { + // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to + // not fill more than `docFreq` entries. + Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); + } + freqFP = -1; + return resetIdsAndLevelParams(termState); + } @Override public int freq() throws IOException { @@ -454,30 +442,25 @@ public int freq() throws IOException { } @Override - public int nextPosition() throws IOException { + public int nextPosition() { return -1; } @Override - public int startOffset() throws IOException { + public int startOffset() { return -1; } @Override - public int endOffset() throws IOException { + public int endOffset() { return -1; } @Override - public BytesRef getPayload() throws IOException { + public BytesRef getPayload() { return null; } - @Override - public int docID() { - return doc; - } - private void refillFullBlock() throws IOException { assert docFreq - docCountUpto >= BLOCK_SIZE; @@ -487,12 +470,12 @@ private void refillFullBlock() throws IOException { if (needsFreq) { freqFP = docIn.getFilePointer(); } - pforUtil.skip(docIn); + PForUtil.skip(docIn); } docCountUpto += BLOCK_SIZE; prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; - assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; } private void refillRemainder() throws IOException { @@ -513,6 +496,7 @@ private void refillRemainder() throws IOException { docCountUpto += left; } docBufferUpto = 0; + docBufferSize = left; freqFP = -1; } @@ -525,7 +509,7 @@ private void skipLevel1To(int target) throws IOException { level1DocCountUpto += LEVEL1_NUM_DOCS; if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level1LastDocID = NO_MORE_DOCS; break; } @@ -561,7 +545,7 @@ private void skipLevel0To(int target) throws IOException { docIn.skipBytes(readVLong15(docIn)); docCountUpto += BLOCK_SIZE; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; break; } } @@ -578,14 +562,14 @@ private void moveToNextLevel0Block() throws IOException { refillFullBlock(); level0LastDocID = (int) 
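Editor's note: the new `reset(...)` above only instantiates `PForUtil`/`ForDeltaUtil` once `docFreq >= BLOCK_SIZE`, so short postings lists (e.g. primary-key lookups) never pay for the bit-packed decoders. A hedged sketch of that lazy-allocation shape, with a stand-in `Decoder` class rather than the real utilities:

```java
final class LazyDecoderDemo {
  static final int BLOCK_SIZE = 128;

  static final class Decoder {
    Decoder() {
      System.out.println("allocated block decoder");
    }
  }

  private Decoder decoder;

  void reset(int docFreq) {
    // Mirrors the shape of the patched reset(): only terms with at least one full
    // block ever touch the bit-packed path, so short postings lists skip the cost.
    if (decoder == null && docFreq >= BLOCK_SIZE) {
      decoder = new Decoder();
    }
  }

  public static void main(String[] args) {
    LazyDecoderDemo postings = new LazyDecoderDemo();
    postings.reset(3);    // primary-key-style term: nothing allocated
    postings.reset(1000); // long postings list: decoder allocated once
    postings.reset(5000); // later resets reuse the existing decoder
  }
}
```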
docBuffer[BLOCK_SIZE - 1]; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; refillRemainder(); } } @Override public int nextDoc() throws IOException { - if (doc == level0LastDocID) { // advance skip data on level 0 + if (docBufferUpto == BLOCK_SIZE) { // advance skip data on level 0 moveToNextLevel0Block(); } @@ -609,7 +593,7 @@ public int advance(int target) throws IOException { } } - int next = findFirstGreater(docBuffer, target, docBufferUpto); + int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); this.doc = (int) docBuffer[next]; docBufferUpto = next + 1; return doc; @@ -621,13 +605,8 @@ public long cost() { } } - final class EverythingEnum extends PostingsEnum { - - final ForUtil forUtil = new ForUtil(); - final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil); - final PForUtil pforUtil = new PForUtil(forUtil); + final class EverythingEnum extends AbstractPostingsEnum { - private final long[] docBuffer = new long[BLOCK_SIZE + 1]; private final long[] freqBuffer = new long[BLOCK_SIZE + 1]; private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; @@ -643,27 +622,16 @@ final class EverythingEnum extends PostingsEnum { private int startOffset; private int endOffset; - private int docBufferUpto; private int posBufferUpto; - final IndexInput startDocIn; - - IndexInput docIn; final IndexInput posIn; final IndexInput payIn; final BytesRef payload; - final boolean indexHasFreq; - final boolean indexHasPos; final boolean indexHasOffsets; final boolean indexHasPayloads; final boolean indexHasOffsetsOrPayloads; - private int docFreq; // number of docs in this posting list - private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) - private int docCountUpto; // number of docs in or before the current block - private int doc; // doc we last read - private long prevDocID; // last doc ID of the previous block private int freq; // freq we last read private int position; // current position @@ -671,28 +639,16 @@ final class EverythingEnum extends PostingsEnum { // skip these to "catch up": private long posPendingCount; - // Where this term's postings start in the .pos file: - private long posTermStartFP; - - // Where this term's payloads/offsets start in the .pay - // file: - private long payTermStartFP; - // File pointer where the last (vInt encoded) pos delta // block is. 
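Editor's note: `nextDoc()` above now refills when the in-block cursor hits `BLOCK_SIZE` instead of comparing the current doc against the block's last doc ID. A small, self-contained sketch of that block-buffered iteration pattern, with a trailing `NO_MORE_DOCS` sentinel as in the reader (all names local to this demo):

```java
final class BlockCursorDemo {
  static final int BLOCK_SIZE = 4;
  static final int NO_MORE_DOCS = Integer.MAX_VALUE;

  private final int[] allDocs;
  private final long[] docBuffer = new long[BLOCK_SIZE + 1];
  private int docCountUpto;               // docs consumed from allDocs so far
  private int docBufferUpto = BLOCK_SIZE; // forces a refill on the first nextDoc()
  private int doc = -1;

  BlockCursorDemo(int[] allDocs) {
    this.allDocs = allDocs;
    docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; // sentinel saves a bounds check later
  }

  private void refill() {
    int left = allDocs.length - docCountUpto;
    int count = Math.min(left, BLOCK_SIZE);
    for (int i = 0; i < count; i++) {
      docBuffer[i] = allDocs[docCountUpto + i];
    }
    if (count < BLOCK_SIZE) {
      docBuffer[count] = NO_MORE_DOCS; // partial tail block gets its own sentinel
    }
    docCountUpto += count;
    docBufferUpto = 0;
  }

  int nextDoc() {
    if (docBufferUpto == BLOCK_SIZE) { // refill trigger mirrors the patched condition
      refill();
    }
    return doc = (int) docBuffer[docBufferUpto++];
  }

  public static void main(String[] args) {
    BlockCursorDemo it = new BlockCursorDemo(new int[] {2, 5, 9, 14, 21, 30});
    int d;
    while ((d = it.nextDoc()) != NO_MORE_DOCS) {
      System.out.print(d + " "); // 2 5 9 14 21 30
    }
    System.out.println();
  }
}
```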
We need this to know whether to bulk // decode vs vInt decode the block: private long lastPosBlockFP; - // level 0 skip data - private int level0LastDocID; private long level0PosEndFP; private int level0BlockPosUpto; private long level0PayEndFP; private int level0BlockPayUpto; - // level 1 skip data - private int level1LastDocID; - private long level1DocEndFP; - private int level1DocCountUpto; + private long level1PosEndFP; private int level1BlockPosUpto; private long level1PayEndFP; @@ -701,14 +657,8 @@ final class EverythingEnum extends PostingsEnum { private boolean needsOffsets; // true if we actually need offsets private boolean needsPayloads; // true if we actually need payloads - private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 - public EverythingEnum(FieldInfo fieldInfo) throws IOException { - this.startDocIn = Lucene912PostingsReader.this.docIn; - this.docIn = null; - indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasPos = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + super(fieldInfo); indexHasOffsets = fieldInfo .getIndexOptions() @@ -742,14 +692,10 @@ public EverythingEnum(FieldInfo fieldInfo) throws IOException { payloadBytes = null; payload = null; } - - // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in - // advance() - docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; } public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { - return docIn == startDocIn + return docIn == Lucene912PostingsReader.this.docIn && indexHasOffsets == (fieldInfo .getIndexOptions() @@ -759,18 +705,19 @@ public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { } public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { - docFreq = termState.docFreq; - posTermStartFP = termState.posStartFP; - payTermStartFP = termState.payStartFP; - totalTermFreq = termState.totalTermFreq; - singletonDocID = termState.singletonDocID; - if (docFreq > 1) { - if (docIn == null) { - // lazy init - docIn = startDocIn.clone(); - } - prefetchPostings(docIn, termState); + resetIndexInput(termState); + if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) { + forDeltaUtil = new ForDeltaUtil(); } + totalTermFreq = termState.totalTermFreq; + if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(); + } + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; + // Where this term's payloads/offsets start in the .pay + // file: + final long payTermStartFP = termState.payStartFP; posIn.seek(posTermStartFP); if (indexHasOffsetsOrPayloads) { payIn.seek(payTermStartFP); @@ -792,37 +739,18 @@ public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOExcep this.needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); this.needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); - doc = -1; - prevDocID = -1; - docCountUpto = 0; - level0LastDocID = -1; - if (docFreq < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - if (docFreq > 1) { - docIn.seek(termState.docStartFP); - } - } else { - level1LastDocID = -1; - level1DocEndFP = termState.docStartFP; - } - level1DocCountUpto = 0; level1BlockPosUpto = 0; level1BlockPayUpto = 0; level0BlockPosUpto = 0; level0BlockPayUpto = 0; - docBufferUpto = BLOCK_SIZE; posBufferUpto = BLOCK_SIZE; - return this; - } - @Override - public int freq() throws IOException { - return 
freq; + return resetIdsAndLevelParams(termState); } @Override - public int docID() { - return doc; + public int freq() { + return freq; } private void refillDocs() throws IOException { @@ -838,16 +766,18 @@ private void refillDocs() throws IOException { freqBuffer[0] = totalTermFreq; docBuffer[1] = NO_MORE_DOCS; docCountUpto++; + docBufferSize = 1; } else { // Read vInts: PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); prefixSum(docBuffer, left, prevDocID); docBuffer[left] = NO_MORE_DOCS; docCountUpto += left; + docBufferSize = left; } prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; - assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; } private void skipLevel1To(int target) throws IOException { @@ -865,7 +795,7 @@ private void skipLevel1To(int target) throws IOException { level1DocCountUpto += LEVEL1_NUM_DOCS; if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level1LastDocID = NO_MORE_DOCS; break; } @@ -923,7 +853,7 @@ private void moveToNextLevel0Block() throws IOException { level0BlockPayUpto = docIn.readVInt(); } } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; } refillDocs(); @@ -931,7 +861,7 @@ private void moveToNextLevel0Block() throws IOException { @Override public int nextDoc() throws IOException { - if (doc == level0LastDocID) { // advance level 0 skip data + if (docBufferUpto == BLOCK_SIZE) { // advance level 0 skip data moveToNextLevel0Block(); } @@ -962,9 +892,7 @@ private void skipLevel0To(int target) throws IOException { } posBufferUpto = BLOCK_SIZE; } else { - for (int i = docBufferUpto; i < BLOCK_SIZE; ++i) { - posPendingCount += freqBuffer[i]; - } + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE); } if (docFreq - docCountUpto >= BLOCK_SIZE) { @@ -990,7 +918,7 @@ private void skipLevel0To(int target) throws IOException { docIn.seek(blockEndFP); docCountUpto += BLOCK_SIZE; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; break; } } @@ -1009,10 +937,8 @@ public int advance(int target) throws IOException { refillDocs(); } - int next = findFirstGreater(docBuffer, target, docBufferUpto); - for (int i = docBufferUpto; i <= next; ++i) { - posPendingCount += freqBuffer[i]; - } + int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1); this.freq = (int) freqBuffer[next]; this.docBufferUpto = next + 1; position = 0; @@ -1032,20 +958,18 @@ private void skipPositions() throws IOException { if (toSkip < leftInBlock) { int end = (int) (posBufferUpto + toSkip); if (indexHasPayloads) { - for (int i = posBufferUpto; i < end; ++i) { - payloadByteUpto += payloadLengthBuffer[i]; - } + payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end); } posBufferUpto = end; } else { toSkip -= leftInBlock; while (toSkip >= BLOCK_SIZE) { assert posIn.getFilePointer() != lastPosBlockFP; - pforUtil.skip(posIn); + PForUtil.skip(posIn); if (indexHasPayloads) { // Skip payloadLength block: - pforUtil.skip(payIn); + PForUtil.skip(payIn); // Skip payloadBytes block: int numBytes = payIn.readVInt(); @@ -1053,19 +977,16 @@ private void skipPositions() throws IOException { } if (indexHasOffsets) { - pforUtil.skip(payIn); - pforUtil.skip(payIn); + PForUtil.skip(payIn); + PForUtil.skip(payIn); } toSkip -= BLOCK_SIZE; } refillPositions(); payloadByteUpto = 0; - 
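Editor's note: the `advance(...)` change above replaces a manual loop with `sumOverRange` when accumulating `posPendingCount`, i.e. the freqs of docs that were skipped without reading their positions. A hedged, simplified illustration of that bookkeeping; `sumOverRange` mirrors the helper added in this patch, while the surrounding arithmetic is a stand-in for the reader, not the reader itself:

```java
final class PosPendingDemo {
  static long sumOverRange(long[] arr, int start, int end) {
    long res = 0L;
    for (int i = start; i < end; i++) {
      res += arr[i];
    }
    return res;
  }

  public static void main(String[] args) {
    long[] freqBuffer = {2, 1, 4, 3, 5}; // freq of each doc in the current block
    int docBufferUpto = 0;

    // advance() lands on the 4th doc of the block (index 3): every freq up to and
    // including that doc becomes "pending" position work.
    int next = 3;
    long posPendingCount = sumOverRange(freqBuffer, docBufferUpto, next + 1);
    long freq = freqBuffer[next];

    // Before decoding the current doc's positions, the positions of the skipped docs
    // are discarded: everything pending except the current doc's own freq.
    long toSkip = posPendingCount - freq;
    System.out.println(posPendingCount); // 10 (2 + 1 + 4 + 3)
    System.out.println(toSkip);          // 7 positions to discard
  }
}
```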
posBufferUpto = 0; final int toSkipInt = (int) toSkip; if (indexHasPayloads) { - for (int i = 0; i < toSkipInt; ++i) { - payloadByteUpto += payloadLengthBuffer[i]; - } + payloadByteUpto += sumOverRange(payloadLengthBuffer, 0, toSkipInt); } posBufferUpto = toSkipInt; } @@ -1124,7 +1045,7 @@ private void refillPositions() throws IOException { } else { // this works, because when writing a vint block we always force the first length to be // written - pforUtil.skip(payIn); // skip over lengths + PForUtil.skip(payIn); // skip over lengths int numBytes = payIn.readVInt(); // read length of payloadBytes payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes } @@ -1138,8 +1059,8 @@ private void refillPositions() throws IOException { } else { // this works, because when writing a vint block we always force the first length to be // written - pforUtil.skip(payIn); // skip over starts - pforUtil.skip(payIn); // skip over lengths + PForUtil.skip(payIn); // skip over starts + PForUtil.skip(payIn); // skip over lengths } } } @@ -1204,82 +1125,47 @@ public long cost() { } } - final class BlockImpactsDocsEnum extends ImpactsEnum { - - final ForUtil forUtil = new ForUtil(); - final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil); - final PForUtil pforUtil = new PForUtil(forUtil); + private abstract class BlockImpactsEnum extends ImpactsEnum { - private final long[] docBuffer = new long[BLOCK_SIZE + 1]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; + protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + protected final PForUtil pforUtil = new PForUtil(); - private int docBufferUpto; + protected final long[] docBuffer = new long[BLOCK_SIZE + 1]; + protected final long[] freqBuffer = new long[BLOCK_SIZE]; - final IndexInput startDocIn; + protected final int docFreq; // number of docs in this posting list - IndexInput docIn; - final boolean indexHasFreq; - final boolean indexHasPos; - final boolean indexHasOffsetsOrPayloads; + protected final IndexInput docIn; - private int docFreq; // number of docs in this posting list - private int docCountUpto; // number of docs in or before the current block - private int doc; // doc we last read - private long prevDocID; // last doc ID of the previous block - private long freqFP; + protected int docCountUpto; // number of docs in or before the current block + protected int doc = -1; // doc we last read + protected long prevDocID = -1; // last doc ID of the previous block + protected int docBufferSize = BLOCK_SIZE; + protected int docBufferUpto = BLOCK_SIZE; // true if we shallow-advanced to a new block that we have not decoded yet - private boolean needsRefilling; + protected boolean needsRefilling; // level 0 skip data - private int level0LastDocID; - private long level0DocEndFP; - private final BytesRef level0SerializedImpacts; - private final ByteArrayDataInput level0SerializedImpactsIn = new ByteArrayDataInput(); - private final MutableImpactList level0Impacts; + protected int level0LastDocID = -1; + protected long level0DocEndFP; + protected final BytesRef level0SerializedImpacts; + protected final MutableImpactList level0Impacts; // level 1 skip data - private int level1LastDocID; - private long level1DocEndFP; - private int level1DocCountUpto; - private final BytesRef level1SerializedImpacts; - private final ByteArrayDataInput level1SerializedImpactsIn = new ByteArrayDataInput(); - private final MutableImpactList level1Impacts; - - public BlockImpactsDocsEnum(FieldInfo fieldInfo, IntBlockTermState termState) - 
throws IOException { - this.startDocIn = Lucene912PostingsReader.this.docIn; - this.docIn = null; - indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasPos = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - indexHasOffsetsOrPayloads = - fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0 - || fieldInfo.hasPayloads(); - // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in - // advance() - docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; - - docFreq = termState.docFreq; - if (docFreq > 1) { - if (docIn == null) { - // lazy init - docIn = startDocIn.clone(); - } - prefetchPostings(docIn, termState); - } - - doc = -1; - if (indexHasFreq == false) { - // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to - // not fill more than `docFreq` entries. - Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); - } - prevDocID = -1; - docCountUpto = 0; - level0LastDocID = -1; + protected int level1LastDocID; + protected long level1DocEndFP; + protected int level1DocCountUpto = 0; + protected final BytesRef level1SerializedImpacts; + protected final MutableImpactList level1Impacts; + + private BlockImpactsEnum(IntBlockTermState termState) throws IOException { + this.docFreq = termState.docFreq; + this.docIn = Lucene912PostingsReader.this.docIn.clone(); + prefetchPostings(docIn, termState); + level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); + level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); + level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); + level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); if (docFreq < LEVEL1_NUM_DOCS) { level1LastDocID = NO_MORE_DOCS; if (docFreq > 1) { @@ -1289,48 +1175,104 @@ public BlockImpactsDocsEnum(FieldInfo fieldInfo, IntBlockTermState termState) level1LastDocID = -1; level1DocEndFP = termState.docStartFP; } - level1DocCountUpto = 0; - docBufferUpto = BLOCK_SIZE; - freqFP = -1; - level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); - level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); - level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); - level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); + // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in + // advance() + docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; } @Override - public int freq() throws IOException { - if (freqFP != -1) { - docIn.seek(freqFP); - pforUtil.decode(docIn, freqBuffer); - freqFP = -1; - } - return (int) freqBuffer[docBufferUpto - 1]; + public int docID() { + return doc; } @Override - public int nextPosition() throws IOException { + public int startOffset() { return -1; } @Override - public int startOffset() throws IOException { + public int endOffset() { return -1; } @Override - public int endOffset() throws IOException { - return -1; + public BytesRef getPayload() { + return null; } @Override - public BytesRef getPayload() throws IOException { - return null; + public long cost() { + return docFreq; } + private final Impacts impacts = + new Impacts() { + + private final ByteArrayDataInput scratch = new ByteArrayDataInput(); + + @Override + public int numLevels() { + return level1LastDocID == NO_MORE_DOCS ? 1 : 2; + } + + @Override + public int getDocIdUpTo(int level) { + if (level == 0) { + return level0LastDocID; + } + return level == 1 ? 
level1LastDocID : NO_MORE_DOCS; + } + + @Override + public List getImpacts(int level) { + if (level == 0 && level0LastDocID != NO_MORE_DOCS) { + return readImpacts(level0SerializedImpacts, level0Impacts); + } + if (level == 1) { + return readImpacts(level1SerializedImpacts, level1Impacts); + } + return DUMMY_IMPACTS; + } + + private List readImpacts(BytesRef serialized, MutableImpactList impactsList) { + var scratch = this.scratch; + scratch.reset(serialized.bytes, 0, serialized.length); + Lucene912PostingsReader.readImpacts(scratch, impactsList); + return impactsList; + } + }; + @Override - public int docID() { - return doc; + public Impacts getImpacts() { + return impacts; + } + } + + final class BlockImpactsDocsEnum extends BlockImpactsEnum { + final boolean indexHasPos; + + private long freqFP; + + public BlockImpactsDocsEnum(boolean indexHasPos, IntBlockTermState termState) + throws IOException { + super(termState); + this.indexHasPos = indexHasPos; + freqFP = -1; + } + + @Override + public int freq() throws IOException { + if (freqFP != -1) { + docIn.seek(freqFP); + pforUtil.decode(docIn, freqBuffer); + freqFP = -1; + } + return (int) freqBuffer[docBufferUpto - 1]; + } + + @Override + public int nextPosition() { + return -1; } private void refillDocs() throws IOException { @@ -1339,23 +1281,21 @@ private void refillDocs() throws IOException { if (left >= BLOCK_SIZE) { forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer); - - if (indexHasFreq) { - freqFP = docIn.getFilePointer(); - pforUtil.skip(docIn); - } + freqFP = docIn.getFilePointer(); + PForUtil.skip(docIn); docCountUpto += BLOCK_SIZE; } else { // Read vInts: - PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); + PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true); prefixSum(docBuffer, left, prevDocID); docBuffer[left] = NO_MORE_DOCS; freqFP = -1; docCountUpto += left; + docBufferSize = left; } prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; - assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; } private void skipLevel1To(int target) throws IOException { @@ -1367,7 +1307,7 @@ private void skipLevel1To(int target) throws IOException { level1DocCountUpto += LEVEL1_NUM_DOCS; if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level1LastDocID = NO_MORE_DOCS; break; } @@ -1411,7 +1351,7 @@ private void skipLevel0To(int target) throws IOException { docIn.skipBytes(blockLength); docCountUpto += BLOCK_SIZE; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; break; } } @@ -1454,7 +1394,7 @@ private void moveToNextLevel0Block() throws IOException { level0SerializedImpacts.length = numImpactBytes; docIn.seek(skip0End); } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; } refillDocs(); @@ -1463,11 +1403,13 @@ private void moveToNextLevel0Block() throws IOException { @Override public int nextDoc() throws IOException { - if (doc == level0LastDocID) { - moveToNextLevel0Block(); - } else if (needsRefilling) { - refillDocs(); - needsRefilling = false; + if (docBufferUpto == BLOCK_SIZE) { + if (needsRefilling) { + refillDocs(); + needsRefilling = false; + } else { + moveToNextLevel0Block(); + } } return this.doc = (int) docBuffer[docBufferUpto++]; @@ -1481,112 +1423,26 @@ public int advance(int target) throws IOException { needsRefilling = false; } - int next = findFirstGreater(docBuffer, target, 
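Editor's note: the shared `impacts` view introduced in `BlockImpactsEnum` above replaces two per-subclass anonymous `Impacts` implementations with a fixed two-level scheme. The standalone sketch below mirrors that dispatch; the real reader decodes serialized impacts lazily via `readImpacts`, so the in-memory lists and numbers here are dummy stand-ins:

```java
import java.util.List;

final class TwoLevelImpactsDemo {
  static final int NO_MORE_DOCS = Integer.MAX_VALUE;

  record Impact(int freq, long norm) {}

  // Fallback used past the known skip windows: matches the most competitive case.
  static final List<Impact> DUMMY_IMPACTS = List.of(new Impact(Integer.MAX_VALUE, 1L));

  int level0LastDocID = 127;   // last doc of the current block
  int level1LastDocID = 4095;  // last doc of the level-1 window, or NO_MORE_DOCS
  List<Impact> level0Impacts = List.of(new Impact(3, 12L));
  List<Impact> level1Impacts = List.of(new Impact(9, 20L));

  int numLevels() {
    return level1LastDocID == NO_MORE_DOCS ? 1 : 2;
  }

  int getDocIdUpTo(int level) {
    if (level == 0) {
      return level0LastDocID;
    }
    return level == 1 ? level1LastDocID : NO_MORE_DOCS;
  }

  List<Impact> getImpacts(int level) {
    if (level == 0 && level0LastDocID != NO_MORE_DOCS) {
      return level0Impacts;
    }
    if (level == 1) {
      return level1Impacts;
    }
    return DUMMY_IMPACTS;
  }

  public static void main(String[] args) {
    TwoLevelImpactsDemo impacts = new TwoLevelImpactsDemo();
    System.out.println(impacts.numLevels());     // 2
    System.out.println(impacts.getDocIdUpTo(0)); // 127
    System.out.println(impacts.getImpacts(1));   // [Impact[freq=9, norm=20]]
  }
}
```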
docBufferUpto); + int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); this.doc = (int) docBuffer[next]; docBufferUpto = next + 1; return doc; } - - @Override - public Impacts getImpacts() throws IOException { - return new Impacts() { - - @Override - public int numLevels() { - int numLevels = 0; - if (level0LastDocID != NO_MORE_DOCS) { - numLevels++; - } - if (level1LastDocID != NO_MORE_DOCS) { - numLevels++; - } - if (numLevels == 0) { - numLevels++; - } - return numLevels; - } - - @Override - public int getDocIdUpTo(int level) { - if (level0LastDocID != NO_MORE_DOCS) { - if (level == 0) { - return level0LastDocID; - } - level--; - } - - if (level1LastDocID != NO_MORE_DOCS) { - if (level == 0) { - return level1LastDocID; - } - level--; - } - - return NO_MORE_DOCS; - } - - @Override - public List getImpacts(int level) { - if (level0LastDocID != NO_MORE_DOCS) { - if (level == 0) { - level0SerializedImpactsIn.reset( - level0SerializedImpacts.bytes, 0, level0SerializedImpacts.length); - readImpacts(level0SerializedImpactsIn, level0Impacts); - return level0Impacts; - } - level--; - } - - if (level1LastDocID != NO_MORE_DOCS) { - if (level == 0) { - level1SerializedImpactsIn.reset( - level1SerializedImpacts.bytes, 0, level1SerializedImpacts.length); - readImpacts(level1SerializedImpactsIn, level1Impacts); - return level1Impacts; - } - level--; - } - - return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); - } - }; - } - - @Override - public long cost() { - return docFreq; - } } - final class BlockImpactsPostingsEnum extends ImpactsEnum { - - final ForUtil forUtil = new ForUtil(); - final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil); - final PForUtil pforUtil = new PForUtil(forUtil); - - private final long[] docBuffer = new long[BLOCK_SIZE + 1]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; + final class BlockImpactsPostingsEnum extends BlockImpactsEnum { private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; - private int docBufferUpto; private int posBufferUpto; - - final IndexInput startDocIn; - - IndexInput docIn; final IndexInput posIn; final boolean indexHasFreq; - final boolean indexHasPos; final boolean indexHasOffsets; final boolean indexHasPayloads; final boolean indexHasOffsetsOrPayloads; - private int docFreq; // number of docs in this posting list - private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) - private int docCountUpto; // number of docs in or before the current block - private int doc; // doc we last read - private long prevDocID; // last doc ID of the previous block + private final long + totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) private int freq; // freq we last read private int position; // current position @@ -1594,69 +1450,36 @@ final class BlockImpactsPostingsEnum extends ImpactsEnum { // skip these to "catch up": private long posPendingCount; - // Where this term's postings start in the .pos file: - private long posTermStartFP; - // File pointer where the last (vInt encoded) pos delta // block is. 
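Editor's note: `advance(...)` above now calls `findNextGEQ(buffer, target, from, to)` (defined near the end of this file's hunk) instead of `findFirstGreater`. A self-contained sketch of that sentinel-padded linear scan, showing why the extra `NO_MORE_DOCS` slot at the end of `docBuffer` removes a branch:

```java
final class FindNextGEQDemo {
  static final int NO_MORE_DOCS = Integer.MAX_VALUE;

  // Same logic as the helper added in this patch: first index in [from, to) whose
  // value is >= target, or `to` if none is.
  static int findNextGEQ(long[] buffer, long target, int from, int to) {
    for (int i = from; i < to; ++i) {
      if (buffer[i] >= target) {
        return i;
      }
    }
    return to;
  }

  public static void main(String[] args) {
    long[] docBuffer = new long[8 + 1];
    long[] docs = {3, 7, 12, 31, 57, 58, 90, 133};
    System.arraycopy(docs, 0, docBuffer, 0, docs.length);
    docBuffer[8] = NO_MORE_DOCS; // sentinel: "past this block" needs no special case

    int next = findNextGEQ(docBuffer, 31, 0, 8);
    System.out.println(docBuffer[next]); // 31
    next = findNextGEQ(docBuffer, 200, 0, 8);
    System.out.println(docBuffer[next]); // NO_MORE_DOCS (the sentinel slot)
  }
}
```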
We need this to know whether to bulk // decode vs vInt decode the block: - private long lastPosBlockFP; - - // true if we shallow-advanced to a new block that we have not decoded yet - private boolean needsRefilling; + private final long lastPosBlockFP; // level 0 skip data - private int level0LastDocID; - private long level0DocEndFP; private long level0PosEndFP; private int level0BlockPosUpto; - private final BytesRefBuilder level0SerializedImpacts = new BytesRefBuilder(); - private final ByteArrayDataInput level0SerializedImpactsIn = new ByteArrayDataInput(); - private final MutableImpactList level0Impacts; // level 1 skip data - private int level1LastDocID; - private long level1DocEndFP; - private int level1DocCountUpto; private long level1PosEndFP; private int level1BlockPosUpto; - private final BytesRefBuilder level1SerializedImpacts = new BytesRefBuilder(); - private final ByteArrayDataInput level1SerializedImpactsIn = new ByteArrayDataInput(); - private final MutableImpactList level1Impacts; - private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + private final int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) throws IOException { - this.startDocIn = Lucene912PostingsReader.this.docIn; - this.docIn = null; - indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasPos = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + super(termState); + final IndexOptions options = fieldInfo.getIndexOptions(); + indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; indexHasOffsets = - fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0; + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; indexHasPayloads = fieldInfo.hasPayloads(); indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; this.posIn = Lucene912PostingsReader.this.posIn.clone(); - // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in - // advance() - docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; - - docFreq = termState.docFreq; - posTermStartFP = termState.posStartFP; + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; totalTermFreq = termState.totalTermFreq; singletonDocID = termState.singletonDocID; - if (docFreq > 1) { - if (docIn == null) { - // lazy init - docIn = startDocIn.clone(); - } - prefetchPostings(docIn, termState); - } posIn.seek(posTermStartFP); level1PosEndFP = posTermStartFP; level0PosEndFP = posTermStartFP; @@ -1668,40 +1491,15 @@ public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState } else { lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; } - - doc = -1; - prevDocID = -1; - docCountUpto = 0; - level0LastDocID = -1; - if (docFreq < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - if (docFreq > 1) { - docIn.seek(termState.docStartFP); - } - } else { - level1LastDocID = -1; - level1DocEndFP = termState.docStartFP; - } - level1DocCountUpto = 0; level1BlockPosUpto = 0; - docBufferUpto = BLOCK_SIZE; posBufferUpto = BLOCK_SIZE; - level0SerializedImpacts.growNoCopy(maxImpactNumBytesAtLevel0); - level1SerializedImpacts.growNoCopy(maxImpactNumBytesAtLevel1); - level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); - level1Impacts = new 
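Editor's note: `lastPosBlockFP`, assigned in the constructor above, marks where the final, vInt-encoded position block starts; everything before it is bulk-decoded in full blocks of `BLOCK_SIZE`. A hedged, arithmetic-only illustration of that split (the exact file-pointer bookkeeping is left to the reader code above):

```java
final class LastPosBlockDemo {
  static final int BLOCK_SIZE = 128;

  public static void main(String[] args) {
    long totalTermFreq = 300; // total number of positions for this term
    long fullBlocks = totalTermFreq / BLOCK_SIZE;
    long tail = totalTermFreq % BLOCK_SIZE;
    System.out.println(fullBlocks); // 2 bit-packed blocks to bulk-decode
    System.out.println(tail);       // 44 positions left for the vInt-encoded tail
    // While the .pos pointer is before lastPosBlockFP the reader bulk-decodes a full
    // block; once it reaches lastPosBlockFP it switches to vInt-decoding the tail.
  }
}
```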
MutableImpactList(maxNumImpactsAtLevel1); } @Override - public int freq() throws IOException { + public int freq() { return freq; } - @Override - public int docID() { - return doc; - } - private void refillDocs() throws IOException { final int left = docFreq - docCountUpto; assert left >= 0; @@ -1721,10 +1519,11 @@ private void refillDocs() throws IOException { prefixSum(docBuffer, left, prevDocID); docBuffer[left] = NO_MORE_DOCS; docCountUpto += left; + docBufferSize = left; } prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; - assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; } private void skipLevel1To(int target) throws IOException { @@ -1738,7 +1537,7 @@ private void skipLevel1To(int target) throws IOException { level1DocCountUpto += LEVEL1_NUM_DOCS; if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level1LastDocID = NO_MORE_DOCS; break; } @@ -1748,8 +1547,8 @@ private void skipLevel1To(int target) throws IOException { long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); int numImpactBytes = docIn.readShort(); if (level1LastDocID >= target) { - docIn.readBytes(level1SerializedImpacts.bytes(), 0, numImpactBytes); - level1SerializedImpacts.setLength(numImpactBytes); + docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); + level1SerializedImpacts.length = numImpactBytes; } else { docIn.skipBytes(numImpactBytes); } @@ -1777,9 +1576,7 @@ private void skipLevel0To(int target) throws IOException { posPendingCount = level0BlockPosUpto; posBufferUpto = BLOCK_SIZE; } else { - for (int i = docBufferUpto; i < BLOCK_SIZE; ++i) { - posPendingCount += freqBuffer[i]; - } + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE); } if (docFreq - docCountUpto >= BLOCK_SIZE) { @@ -1792,8 +1589,8 @@ private void skipLevel0To(int target) throws IOException { if (target <= level0LastDocID) { int numImpactBytes = docIn.readVInt(); - docIn.readBytes(level0SerializedImpacts.bytes(), 0, numImpactBytes); - level0SerializedImpacts.setLength(numImpactBytes); + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; level0PosEndFP += docIn.readVLong(); level0BlockPosUpto = docIn.readByte(); if (indexHasOffsetsOrPayloads) { @@ -1809,7 +1606,7 @@ private void skipLevel0To(int target) throws IOException { docIn.seek(level0DocEndFP); docCountUpto += BLOCK_SIZE; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; break; } } @@ -1832,75 +1629,11 @@ public void advanceShallow(int target) throws IOException { } } - @Override - public Impacts getImpacts() throws IOException { - return new Impacts() { - - @Override - public int numLevels() { - int numLevels = 0; - if (level0LastDocID != NO_MORE_DOCS) { - numLevels++; - } - if (level1LastDocID != NO_MORE_DOCS) { - numLevels++; - } - if (numLevels == 0) { - numLevels++; - } - return numLevels; - } - - @Override - public int getDocIdUpTo(int level) { - if (level0LastDocID != NO_MORE_DOCS) { - if (level == 0) { - return level0LastDocID; - } - level--; - } - - if (level1LastDocID != NO_MORE_DOCS) { - if (level == 0) { - return level1LastDocID; - } - level--; - } - - return NO_MORE_DOCS; - } - - @Override - public List getImpacts(int level) { - if (level0LastDocID != NO_MORE_DOCS) { - if (level == 0) { - level0SerializedImpactsIn.reset( - level0SerializedImpacts.bytes(), 0, level0SerializedImpacts.length()); - 
readImpacts(level0SerializedImpactsIn, level0Impacts); - return level0Impacts; - } - level--; - } - - if (level1LastDocID != NO_MORE_DOCS) { - if (level == 0) { - level1SerializedImpactsIn.reset( - level1SerializedImpacts.bytes(), 0, level1SerializedImpacts.length()); - readImpacts(level1SerializedImpactsIn, level1Impacts); - return level1Impacts; - } - level--; - } - - return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); - } - }; - } - @Override public int nextDoc() throws IOException { - advanceShallow(doc + 1); - if (needsRefilling) { + if (docBufferUpto == BLOCK_SIZE) { + advanceShallow(doc + 1); + assert needsRefilling; refillDocs(); needsRefilling = false; } @@ -1921,10 +1654,8 @@ public int advance(int target) throws IOException { needsRefilling = false; } - int next = findFirstGreater(docBuffer, target, docBufferUpto); - for (int i = docBufferUpto; i <= next; ++i) { - posPendingCount += freqBuffer[i]; - } + int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1); freq = (int) freqBuffer[next]; docBufferUpto = next + 1; position = 0; @@ -1945,7 +1676,7 @@ private void skipPositions() throws IOException { toSkip -= leftInBlock; while (toSkip >= BLOCK_SIZE) { assert posIn.getFilePointer() != lastPosBlockFP; - pforUtil.skip(posIn); + PForUtil.skip(posIn); toSkip -= BLOCK_SIZE; } refillPositions(); @@ -2004,31 +1735,8 @@ public int nextPosition() throws IOException { posPendingCount--; return position; } - - @Override - public int startOffset() { - return -1; - } - - @Override - public int endOffset() { - return -1; - } - - @Override - public BytesRef getPayload() { - return null; - } - - @Override - public long cost() { - return docFreq; - } } - /** - * @see Lucene912PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int) - */ static int readVInt15(DataInput in) throws IOException { short s = in.readShort(); if (s >= 0) { @@ -2038,9 +1746,6 @@ static int readVInt15(DataInput in) throws IOException { } } - /** - * @see Lucene912PostingsWriter#writeVLong15(org.apache.lucene.store.DataOutput, long) - */ static long readVLong15(DataInput in) throws IOException { short s = in.readShort(); if (s >= 0) { @@ -2050,7 +1755,17 @@ static long readVLong15(DataInput in) throws IOException { } } - private void prefetchPostings(IndexInput docIn, IntBlockTermState state) throws IOException { + private static int findNextGEQ(long[] buffer, long target, int from, int to) { + for (int i = from; i < to; ++i) { + if (buffer[i] >= target) { + return i; + } + } + return to; + } + + private static void prefetchPostings(IndexInput docIn, IntBlockTermState state) + throws IOException { assert state.docFreq > 1; // Singletons are inlined in the terms dict, nothing to prefetch if (docIn.getFilePointer() != state.docStartFP) { // Don't prefetch if the input is already positioned at the right offset, which suggests that diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PForUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/PForUtil.java similarity index 92% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/PForUtil.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/PForUtil.java index f4405ae66fab..a075e42ec361 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PForUtil.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/PForUtil.java @@ 
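Editor's note: `readVInt15`/`readVLong15` above decode a hybrid encoding where values that fit in 15 bits cost exactly one short, and larger values spill their remaining bits into a regular vInt/vLong. The sketch below shows both directions; the decode side mirrors the method in the hunk, while the write side and the plain vInt helpers are assumptions of this demo (the real writer lives in the corresponding `PostingsWriter`), and `DataOutputStream`/`DataInputStream` stand in for Lucene's `DataOutput`/`DataInput`:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

final class VInt15Demo {
  static void writeVInt(DataOutputStream out, int v) throws IOException {
    while ((v & ~0x7F) != 0) {
      out.writeByte((v & 0x7F) | 0x80);
      v >>>= 7;
    }
    out.writeByte(v);
  }

  static int readVInt(DataInputStream in) throws IOException {
    int b = in.readByte();
    int v = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = in.readByte();
      v |= (b & 0x7F) << shift;
    }
    return v;
  }

  static void writeVInt15(DataOutputStream out, int v) throws IOException {
    if (v < (1 << 15)) {
      out.writeShort(v);                     // sign bit clear: value is complete
    } else {
      out.writeShort((v & 0x7FFF) | 0x8000); // sign bit set: more bits follow
      writeVInt(out, v >>> 15);
    }
  }

  static int readVInt15(DataInputStream in) throws IOException {
    short s = in.readShort();
    if (s >= 0) {
      return s;
    }
    return (s & 0x7FFF) | (readVInt(in) << 15);
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bytes);
    writeVInt15(out, 42);        // 2 bytes
    writeVInt15(out, 1 << 20);   // 2 bytes + 1-byte vInt for the high bits
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
    System.out.println(readVInt15(in)); // 42
    System.out.println(readVInt15(in)); // 1048576
  }
}
```

`readVLong15` works the same way with a vLong for the spilled bits.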
-14,12 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.backward_codecs.lucene912; import java.io.IOException; import java.util.Arrays; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.LongHeap; import org.apache.lucene.util.packed.PackedInts; @@ -37,11 +38,10 @@ static boolean allEqual(long[] l) { return true; } - private final ForUtil forUtil; + private final ForUtil forUtil = new ForUtil(); - PForUtil(ForUtil forUtil) { + static { assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE; - this.forUtil = forUtil; } /** Encode 128 integers from {@code longs} into {@code out}. */ @@ -104,15 +104,15 @@ void encode(long[] longs, DataOutput out) throws IOException { } /** Decode 128 integers into {@code ints}. */ - void decode(DataInput in, long[] longs) throws IOException { + void decode(IndexInput in, long[] longs) throws IOException { final int token = Byte.toUnsignedInt(in.readByte()); final int bitsPerValue = token & 0x1f; - final int numExceptions = token >>> 5; if (bitsPerValue == 0) { Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, in.readVLong()); } else { forUtil.decode(bitsPerValue, in, longs); } + final int numExceptions = token >>> 5; for (int i = 0; i < numExceptions; ++i) { longs[Byte.toUnsignedInt(in.readByte())] |= Byte.toUnsignedLong(in.readByte()) << bitsPerValue; @@ -120,7 +120,7 @@ void decode(DataInput in, long[] longs) throws IOException { } /** Skip 128 integers. */ - void skip(DataInput in) throws IOException { + static void skip(DataInput in) throws IOException { final int token = Byte.toUnsignedInt(in.readByte()); final int bitsPerValue = token & 0x1f; final int numExceptions = token >>> 5; @@ -128,7 +128,7 @@ void skip(DataInput in) throws IOException { in.readVLong(); in.skipBytes((numExceptions << 1)); } else { - in.skipBytes(forUtil.numBytes(bitsPerValue) + (numExceptions << 1)); + in.skipBytes(ForUtil.numBytes(bitsPerValue) + (numExceptions << 1)); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PostingsUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/PostingsUtil.java similarity index 93% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/PostingsUtil.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/PostingsUtil.java index 4834dd73e226..8f526f7ef104 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PostingsUtil.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/PostingsUtil.java @@ -14,11 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.backward_codecs.lucene912; import java.io.IOException; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.GroupVIntUtil; /** Utility class to encode/decode postings block. 
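Editor's note: `PForUtil.skip` above can skip a whole block from the one-byte header alone: the low 5 bits carry `bitsPerValue`, the top 3 bits carry the patch ("exception") count, and each exception costs 2 bytes. A hedged arithmetic sketch of that header; the "16 bytes per bit of width" figure assumes `ForUtil.numBytes(bpv)` is simply `bpv * BLOCK_SIZE / 8`:

```java
final class PForTokenDemo {
  static final int BLOCK_SIZE = 128;

  // Assumed layout: 128 values bit-packed at bitsPerValue bits each, then 2 bytes per
  // exception (index byte + high-bits byte). bitsPerValue == 0 stores one vLong instead.
  static long packedBytes(int bitsPerValue) {
    return (long) bitsPerValue * BLOCK_SIZE / 8; // 16 bytes per bit of width
  }

  public static void main(String[] args) {
    int token = (3 << 5) | 7; // 3 exceptions, 7 bits per value
    int bitsPerValue = token & 0x1f;
    int numExceptions = token >>> 5;
    System.out.println(bitsPerValue);  // 7
    System.out.println(numExceptions); // 3
    System.out.println(packedBytes(bitsPerValue) + 2L * numExceptions); // 112 + 6 = 118
  }
}
```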
*/ final class PostingsUtil { @@ -35,7 +36,7 @@ static void readVIntBlock( boolean indexHasFreq, boolean decodeFreq) throws IOException { - docIn.readGroupVInts(docBuffer, num); + GroupVIntUtil.readGroupVInts(docIn, docBuffer, num); if (indexHasFreq && decodeFreq) { for (int i = 0; i < num; ++i) { freqBuffer[i] = docBuffer[i] & 0x01; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForDeltaUtil.py b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForDeltaUtil.py new file mode 100644 index 000000000000..90604ee75cca --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForDeltaUtil.py @@ -0,0 +1,420 @@ +#! /usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import gcd + +"""Code generation for ForDeltaUtil.java""" + +MAX_SPECIALIZED_BITS_PER_VALUE = 24 +OUTPUT_FILE = "ForDeltaUtil.java" +PRIMITIVE_SIZE = [8, 16, 32] +HEADER = """// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene912; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.packed.PackedInts; + +import static org.apache.lucene.backward_codecs.lucene912.ForUtil.*; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ + * Encodes multiple integers in a long to get SIMD-like speedups. 
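Editor's note: `readVIntBlock` above decodes the tail (partial) block, where each doc delta carries a flag in its lowest bit so that single-occurrence docs need no separate freq. The sketch below illustrates that interleaving under the assumption that a set bit means "freq == 1" and a clear bit means "a freq vInt follows"; treat the exact flag convention and the stand-in `extraFreqs` array as assumptions of this demo:

```java
final class TailBlockDemo {
  public static void main(String[] args) {
    // Encoded doc values as they would come out of the group-vInt read:
    // (delta << 1) | 1 when freq == 1, (delta << 1) when a freq vInt follows.
    long[] docBuffer = {(5L << 1) | 1, (3L << 1), (7L << 1) | 1};
    long[] extraFreqs = {4}; // freqs that did not fit in the flag bit, in order
    long[] freqBuffer = new long[docBuffer.length];

    int extra = 0;
    for (int i = 0; i < docBuffer.length; i++) {
      freqBuffer[i] = docBuffer[i] & 0x01;
      docBuffer[i] >>= 1;
      if (freqBuffer[i] == 0) {
        freqBuffer[i] = extraFreqs[extra++]; // stands in for docIn.readVInt()
      }
    }
    System.out.println(java.util.Arrays.toString(docBuffer));  // [5, 3, 7]
    System.out.println(java.util.Arrays.toString(freqBuffer)); // [1, 4, 1]
  }
}
```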
+ * If bitsPerValue <= 4 then we pack 8 ints per long + * else if bitsPerValue <= 11 we pack 4 ints per long + * else we pack 2 ints per long + */ +final class ForDeltaUtil { + + private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4; + private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; + private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; + + private static final int ONE_BLOCK_SIZE_EIGHT = BLOCK_SIZE / 8; + private static final int TWO_BLOCK_SIZE_EIGHTS = BLOCK_SIZE / 4; + private static final int THREE_BLOCK_SIZE_EIGHTS = 3 * BLOCK_SIZE / 8; + private static final int FOUR_BLOCK_SIZE_EIGHTS = BLOCK_SIZE / 2; + private static final int FIVE_BLOCK_SIZE_EIGHTS = 5 * BLOCK_SIZE / 8; + private static final int SIX_BLOCK_SIZE_EIGHTS = 3 * BLOCK_SIZE / 4; + private static final int SEVEN_BLOCK_SIZE_EIGHTS = 7 * BLOCK_SIZE / 8; + + // IDENTITY_PLUS_ONE[i] == i+1 + private static final long[] IDENTITY_PLUS_ONE = new long[ForUtil.BLOCK_SIZE]; + + static { + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + IDENTITY_PLUS_ONE[i] = i + 1; + } + } + + private static void prefixSumOfOnes(long[] arr, long base) { + System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); + // This loop gets auto-vectorized + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + arr[i] += base; + } + } + + private static void prefixSum8(long[] arr, long base) { + // When the number of bits per value is 4 or less, we can sum up all values in a block without + // risking overflowing a 8-bits integer. This allows computing the prefix sum by summing up 8 + // values at once. + innerPrefixSum8(arr); + expand8(arr); + final long l0 = base; + final long l1 = l0 + arr[ONE_BLOCK_SIZE_EIGHT - 1]; + final long l2 = l1 + arr[TWO_BLOCK_SIZE_EIGHTS - 1]; + final long l3 = l2 + arr[THREE_BLOCK_SIZE_EIGHTS - 1]; + final long l4 = l3 + arr[FOUR_BLOCK_SIZE_EIGHTS - 1]; + final long l5 = l4 + arr[FIVE_BLOCK_SIZE_EIGHTS - 1]; + final long l6 = l5 + arr[SIX_BLOCK_SIZE_EIGHTS - 1]; + final long l7 = l6 + arr[SEVEN_BLOCK_SIZE_EIGHTS - 1]; + + for (int i = 0; i < ONE_BLOCK_SIZE_EIGHT; ++i) { + arr[i] += l0; + arr[ONE_BLOCK_SIZE_EIGHT + i] += l1; + arr[TWO_BLOCK_SIZE_EIGHTS + i] += l2; + arr[THREE_BLOCK_SIZE_EIGHTS + i] += l3; + arr[FOUR_BLOCK_SIZE_EIGHTS + i] += l4; + arr[FIVE_BLOCK_SIZE_EIGHTS + i] += l5; + arr[SIX_BLOCK_SIZE_EIGHTS + i] += l6; + arr[SEVEN_BLOCK_SIZE_EIGHTS + i] += l7; + } + } + + private static void prefixSum16(long[] arr, long base) { + // When the number of bits per value is 11 or less, we can sum up all values in a block without + // risking overflowing a 16-bits integer. This allows computing the prefix sum by summing up 4 + // values at once. 
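Editor's note: the generated `decodeAndPrefixSum` path turns a block of doc-ID deltas back into absolute doc IDs by a running sum seeded with the previous block's last doc ID. A hedged, purely scalar illustration of that result (the real code does the summation lane-wise on packed longs, and takes a `prefixSumOfOnes` fast path when every delta is 1):

```java
final class PrefixSumDemo {
  static void prefixSum(long[] deltas, long base) {
    deltas[0] += base;
    for (int i = 1; i < deltas.length; i++) {
      deltas[i] += deltas[i - 1];
    }
  }

  public static void main(String[] args) {
    long[] deltas = {3, 4, 5, 19, 26}; // gaps between consecutive doc IDs
    prefixSum(deltas, 100);            // previous block ended at doc 100
    System.out.println(java.util.Arrays.toString(deltas)); // [103, 107, 112, 131, 157]
  }
}
```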
+ innerPrefixSum16(arr); + expand16(arr); + final long l0 = base; + final long l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1]; + final long l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1]; + final long l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1]; + + for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) { + arr[i] += l0; + arr[ONE_BLOCK_SIZE_FOURTH + i] += l1; + arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2; + arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3; + } + } + + private static void prefixSum32(long[] arr, long base) { + arr[0] += base << 32; + innerPrefixSum32(arr); + expand32(arr); + final long l = arr[BLOCK_SIZE/2-1]; + for (int i = BLOCK_SIZE/2; i < BLOCK_SIZE; ++i) { + arr[i] += l; + } + } + + // For some reason unrolling seems to help + private static void innerPrefixSum8(long[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + } + + // For some reason unrolling seems to help + private static void innerPrefixSum16(long[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + } + + // For some reason unrolling seems to help + private static void innerPrefixSum32(long[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + arr[32] += arr[31]; + arr[33] += arr[32]; + arr[34] += arr[33]; + arr[35] += arr[34]; + arr[36] += arr[35]; + arr[37] += arr[36]; + arr[38] += arr[37]; + arr[39] += arr[38]; + arr[40] += arr[39]; + arr[41] += arr[40]; + arr[42] += arr[41]; + arr[43] += arr[42]; + arr[44] += arr[43]; + arr[45] += arr[44]; + arr[46] += arr[45]; + arr[47] += arr[46]; + arr[48] += arr[47]; + arr[49] += arr[48]; + arr[50] += arr[49]; + arr[51] += arr[50]; + arr[52] += arr[51]; + arr[53] += arr[52]; + arr[54] += arr[53]; + arr[55] += arr[54]; + arr[56] += arr[55]; + arr[57] += arr[56]; + arr[58] += arr[57]; + arr[59] += arr[58]; + arr[60] += arr[59]; + arr[61] += arr[60]; + arr[62] += arr[61]; + arr[63] += arr[62]; + } + + private final long[] tmp = new long[BLOCK_SIZE / 2]; + + /** + * Encode deltas of a strictly monotonically increasing sequence of integers. 
The provided {@code + * longs} are expected to be deltas between consecutive values. + */ + void encodeDeltas(long[] longs, DataOutput out) throws IOException { + if (longs[0] == 1 && PForUtil.allEqual(longs)) { // happens with very dense postings + out.writeByte((byte) 0); + } else { + long or = 0; + for (long l : longs) { + or |= l; + } + assert or != 0; + final int bitsPerValue = PackedInts.bitsRequired(or); + out.writeByte((byte) bitsPerValue); + + final int primitiveSize; + if (bitsPerValue <= 4) { + primitiveSize = 8; + collapse8(longs); + } else if (bitsPerValue <= 11) { + primitiveSize = 16; + collapse16(longs); + } else { + primitiveSize = 32; + collapse32(longs); + } + encode(longs, bitsPerValue, primitiveSize, out, tmp); + } + } + + /** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */ + void decodeAndPrefixSum(IndexInput in, long base, long[] longs) throws IOException { + final int bitsPerValue = Byte.toUnsignedInt(in.readByte()); + if (bitsPerValue == 0) { + prefixSumOfOnes(longs, base); + } else { + decodeAndPrefixSum(bitsPerValue, in, base, longs); + } + } + +""" + +def primitive_size_for_bpv(bpv): + if bpv <= 4: + # If we have 4 bits per value or less then we can compute the prefix sum of 16 longs that store 8 4-bit values each without overflowing. + return 8 + elif bpv <= 11: + # If we have 11 bits per value or less then we can compute the prefix sum of 32 longs that store 4 16-bit values each without overflowing. + return 16 + else: + # No risk of overflow with 32 bits per value + return 32 + +def next_primitive(bpv): + if bpv <= 8: + return 8 + elif bpv <= 16: + return 16 + else: + return 32 + +def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values, f): + iteration = 1 + num_longs = bpv * num_values / remaining_bits_per_long + while num_longs % 2 == 0 and num_values % 2 == 0: + num_longs /= 2 + num_values /= 2 + iteration *= 2 + f.write(' for (int iter = 0, tmpIdx = 0, longsIdx = %d; iter < %d; ++iter, tmpIdx += %d, longsIdx += %d) {\n' %(o, iteration, num_longs, num_values)) + i = 0 + remaining_bits = 0 + tmp_idx = 0 + for i in range(int(num_values)): + b = bpv + if remaining_bits == 0: + b -= remaining_bits_per_long + f.write(' long l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + else: + b -= remaining_bits + f.write(' long l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b)) + tmp_idx += 1 + while b >= remaining_bits_per_long: + b -= remaining_bits_per_long + f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + tmp_idx += 1 + if b > 0: + f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_long-b, next_primitive, b)) + remaining_bits = remaining_bits_per_long-b + f.write(' longs[longsIdx + %d] = l%d;\n' %(i, i)) + f.write(' }\n') + +def writeDecode(bpv, f): + next_primitive = primitive_size_for_bpv(bpv) + if next_primitive % bpv == 0: + f.write(' private static void decode%dTo%d(IndexInput in, long[] longs) throws IOException {\n' %(bpv, next_primitive)) + else: + f.write(' private static void decode%dTo%d(IndexInput in, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive)) + if bpv == next_primitive: + f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2)) + else: + num_values_per_long = 64 / next_primitive + remaining_bits = next_primitive % bpv + num_iters = (next_primitive - 1) // bpv + o = 2 * bpv * num_iters + if remaining_bits == 0: + f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, 
longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv)) + else: + f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv)) + writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f) + f.write(' }\n') + +if __name__ == '__main__': + f = open(OUTPUT_FILE, 'w') + f.write(HEADER) + f.write(""" + /** + * Delta-decode 128 integers into {@code longs}. + */ + void decodeAndPrefixSum(int bitsPerValue, IndexInput in, long base, long[] longs) throws IOException { + switch (bitsPerValue) { +""") + for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + primitive_size = primitive_size_for_bpv(bpv) + f.write(' case %d:\n' %bpv) + if next_primitive(bpv) == primitive_size: + if primitive_size % bpv == 0: + f.write(' decode%d(in, longs);\n' %bpv) + else: + f.write(' decode%d(in, tmp, longs);\n' %bpv) + else: + if primitive_size % bpv == 0: + f.write(' decode%dTo%d(in, longs);\n' %(bpv, primitive_size)) + else: + f.write(' decode%dTo%d(in, tmp, longs);\n' %(bpv, primitive_size)) + f.write(' prefixSum%d(longs, base);\n' %primitive_size) + f.write(' break;\n') + f.write(' default:\n') + f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n') + f.write(' prefixSum32(longs, base);\n') + f.write(' break;\n') + f.write(' }\n') + f.write(' }\n') + + f.write('\n') + for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + if next_primitive(bpv) != primitive_size_for_bpv(bpv): + writeDecode(bpv, f) + if bpv < MAX_SPECIALIZED_BITS_PER_VALUE: + f.write('\n') + + f.write('}\n') diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForUtil.py similarity index 55% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForUtil.py index c6a33ceef53e..29543ffe671b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForUtil.py @@ -40,47 +40,49 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.backward_codecs.lucene912; import java.io.IOException; -import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; - -// Inspired from https://fulmicoton.com/posts/bitpacking/ -// Encodes multiple integers in a long to get SIMD-like speedups. -// If bitsPerValue <= 8 then we pack 8 ints per long -// else if bitsPerValue <= 16 we pack 4 ints per long -// else we pack 2 ints per long +import org.apache.lucene.store.IndexInput; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ + * Encodes multiple integers in a long to get SIMD-like speedups. 
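As a rough illustration of the packing this header describes, and not code taken from the patch, the sketch below packs eight 8-bit lanes into a single long with shifts and masks and unpacks them again; the class name is invented, and collapse8/expand8 in the generated code apply the same idea across 16 longs for a 128-value block, with a different lane order.

    // Illustrative only: one long holding eight 8-bit lanes.
    class BytePackingSketch {
      static long pack8(long[] v) {            // v.length == 8, each value fits in 8 bits
        long packed = 0;
        for (int lane = 0; lane < 8; lane++) {
          packed |= (v[lane] & 0xFFL) << (56 - 8 * lane);
        }
        return packed;
      }

      static long[] unpack8(long packed) {
        long[] v = new long[8];
        for (int lane = 0; lane < 8; lane++) {
          v[lane] = (packed >>> (56 - 8 * lane)) & 0xFFL;
        }
        return v;
      }

      public static void main(String[] args) {
        long[] values = {1, 2, 3, 5, 8, 13, 21, 34};
        System.out.println(java.util.Arrays.toString(unpack8(pack8(values))));
      }
    }

Because the lanes never interact, loops over whole longs vectorize well, which is the "SIMD-like" speedup the comment refers to.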
+ * If bitsPerValue <= 8 then we pack 8 ints per long + * else if bitsPerValue <= 16 we pack 4 ints per long + * else we pack 2 ints per long + */ final class ForUtil { - static final int BLOCK_SIZE = 128; - private static final int BLOCK_SIZE_LOG2 = 7; + public static final int BLOCK_SIZE = 128; + static final int BLOCK_SIZE_LOG2 = 7; - private static long expandMask32(long mask32) { + static long expandMask32(long mask32) { return mask32 | (mask32 << 32); } - private static long expandMask16(long mask16) { + static long expandMask16(long mask16) { return expandMask32(mask16 | (mask16 << 16)); } - private static long expandMask8(long mask8) { + static long expandMask8(long mask8) { return expandMask16(mask8 | (mask8 << 8)); } - private static long mask32(int bitsPerValue) { + static long mask32(int bitsPerValue) { return expandMask32((1L << bitsPerValue) - 1); } - private static long mask16(int bitsPerValue) { + static long mask16(int bitsPerValue) { return expandMask16((1L << bitsPerValue) - 1); } - private static long mask8(int bitsPerValue) { + static long mask8(int bitsPerValue) { return expandMask8((1L << bitsPerValue) - 1); } - private static void expand8(long[] arr) { + static void expand8(long[] arr) { for (int i = 0; i < 16; ++i) { long l = arr[i]; arr[i] = (l >>> 56) & 0xFFL; @@ -94,17 +96,7 @@ } } - private static void expand8To32(long[] arr) { - for (int i = 0; i < 16; ++i) { - long l = arr[i]; - arr[i] = (l >>> 24) & 0x000000FF000000FFL; - arr[16 + i] = (l >>> 16) & 0x000000FF000000FFL; - arr[32 + i] = (l >>> 8) & 0x000000FF000000FFL; - arr[48 + i] = l & 0x000000FF000000FFL; - } - } - - private static void collapse8(long[] arr) { + static void collapse8(long[] arr) { for (int i = 0; i < 16; ++i) { arr[i] = (arr[i] << 56) @@ -118,7 +110,7 @@ } } - private static void expand16(long[] arr) { + static void expand16(long[] arr) { for (int i = 0; i < 32; ++i) { long l = arr[i]; arr[i] = (l >>> 48) & 0xFFFFL; @@ -128,21 +120,13 @@ } } - private static void expand16To32(long[] arr) { - for (int i = 0; i < 32; ++i) { - long l = arr[i]; - arr[i] = (l >>> 16) & 0x0000FFFF0000FFFFL; - arr[32 + i] = l & 0x0000FFFF0000FFFFL; - } - } - - private static void collapse16(long[] arr) { + static void collapse16(long[] arr) { for (int i = 0; i < 32; ++i) { arr[i] = (arr[i] << 48) | (arr[32 + i] << 32) | (arr[64 + i] << 16) | arr[96 + i]; } } - private static void expand32(long[] arr) { + static void expand32(long[] arr) { for (int i = 0; i < 64; ++i) { long l = arr[i]; arr[i] = l >>> 32; @@ -150,123 +134,36 @@ } } - private static void collapse32(long[] arr) { + static void collapse32(long[] arr) { for (int i = 0; i < 64; ++i) { arr[i] = (arr[i] << 32) | arr[64 + i]; } } - private static void prefixSum8(long[] arr, long base) { - expand8To32(arr); - prefixSum32(arr, base); - } - - private static void prefixSum16(long[] arr, long base) { - // We need to move to the next primitive size to avoid overflows - expand16To32(arr); - prefixSum32(arr, base); - } - - private static void prefixSum32(long[] arr, long base) { - arr[0] += base << 32; - innerPrefixSum32(arr); - expand32(arr); - final long l = arr[BLOCK_SIZE/2-1]; - for (int i = BLOCK_SIZE/2; i < BLOCK_SIZE; ++i) { - arr[i] += l; - } - } - - // For some reason unrolling seems to help - private static void innerPrefixSum32(long[] arr) { - arr[1] += arr[0]; - arr[2] += arr[1]; - arr[3] += arr[2]; - arr[4] += arr[3]; - arr[5] += arr[4]; - arr[6] += arr[5]; - arr[7] += arr[6]; - arr[8] += arr[7]; - arr[9] += arr[8]; - arr[10] += arr[9]; - arr[11] += 
arr[10]; - arr[12] += arr[11]; - arr[13] += arr[12]; - arr[14] += arr[13]; - arr[15] += arr[14]; - arr[16] += arr[15]; - arr[17] += arr[16]; - arr[18] += arr[17]; - arr[19] += arr[18]; - arr[20] += arr[19]; - arr[21] += arr[20]; - arr[22] += arr[21]; - arr[23] += arr[22]; - arr[24] += arr[23]; - arr[25] += arr[24]; - arr[26] += arr[25]; - arr[27] += arr[26]; - arr[28] += arr[27]; - arr[29] += arr[28]; - arr[30] += arr[29]; - arr[31] += arr[30]; - arr[32] += arr[31]; - arr[33] += arr[32]; - arr[34] += arr[33]; - arr[35] += arr[34]; - arr[36] += arr[35]; - arr[37] += arr[36]; - arr[38] += arr[37]; - arr[39] += arr[38]; - arr[40] += arr[39]; - arr[41] += arr[40]; - arr[42] += arr[41]; - arr[43] += arr[42]; - arr[44] += arr[43]; - arr[45] += arr[44]; - arr[46] += arr[45]; - arr[47] += arr[46]; - arr[48] += arr[47]; - arr[49] += arr[48]; - arr[50] += arr[49]; - arr[51] += arr[50]; - arr[52] += arr[51]; - arr[53] += arr[52]; - arr[54] += arr[53]; - arr[55] += arr[54]; - arr[56] += arr[55]; - arr[57] += arr[56]; - arr[58] += arr[57]; - arr[59] += arr[58]; - arr[60] += arr[59]; - arr[61] += arr[60]; - arr[62] += arr[61]; - arr[63] += arr[62]; - } - private final long[] tmp = new long[BLOCK_SIZE / 2]; /** Encode 128 integers from {@code longs} into {@code out}. */ void encode(long[] longs, int bitsPerValue, DataOutput out) throws IOException { final int nextPrimitive; - final int numLongs; if (bitsPerValue <= 8) { nextPrimitive = 8; - numLongs = BLOCK_SIZE / 8; collapse8(longs); } else if (bitsPerValue <= 16) { nextPrimitive = 16; - numLongs = BLOCK_SIZE / 4; collapse16(longs); } else { nextPrimitive = 32; - numLongs = BLOCK_SIZE / 2; collapse32(longs); } + encode(longs, bitsPerValue, nextPrimitive, out, tmp); + } + + static void encode(long[] longs, int bitsPerValue, int primitiveSize, DataOutput out, long[] tmp) throws IOException { + final int numLongs = BLOCK_SIZE * primitiveSize / Long.SIZE; final int numLongsPerShift = bitsPerValue * 2; int idx = 0; - int shift = nextPrimitive - bitsPerValue; + int shift = primitiveSize - bitsPerValue; for (int i = 0; i < numLongsPerShift; ++i) { tmp[i] = longs[idx++] << shift; } @@ -278,9 +175,9 @@ final int remainingBitsPerLong = shift + bitsPerValue; final long maskRemainingBitsPerLong; - if (nextPrimitive == 8) { + if (primitiveSize == 8) { maskRemainingBitsPerLong = MASKS8[remainingBitsPerLong]; - } else if (nextPrimitive == 16) { + } else if (primitiveSize == 16) { maskRemainingBitsPerLong = MASKS16[remainingBitsPerLong]; } else { maskRemainingBitsPerLong = MASKS32[remainingBitsPerLong]; @@ -298,10 +195,10 @@ } } else { final long mask1, mask2; - if (nextPrimitive == 8) { + if (primitiveSize == 8) { mask1 = MASKS8[remainingBitsPerValue]; mask2 = MASKS8[remainingBitsPerLong - remainingBitsPerValue]; - } else if (nextPrimitive == 16) { + } else if (primitiveSize == 16) { mask1 = MASKS16[remainingBitsPerValue]; mask2 = MASKS16[remainingBitsPerLong - remainingBitsPerValue]; } else { @@ -320,26 +217,20 @@ } /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. 
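A quick sanity check, illustrative and not from the patch, of the size formula that follows: 128 values of bitsPerValue bits occupy 128 * bitsPerValue / 8 bytes, which is exactly bitsPerValue << (BLOCK_SIZE_LOG2 - 3) when BLOCK_SIZE_LOG2 is 7. The class name is invented.

    // Illustrative only: the shift form and the plain arithmetic agree for every bit width,
    // e.g. 7 bits per value -> 112 bytes for a 128-value block.
    class NumBytesSketch {
      static final int BLOCK_SIZE = 128;
      static final int BLOCK_SIZE_LOG2 = 7;

      public static void main(String[] args) {
        for (int bitsPerValue = 1; bitsPerValue <= 32; bitsPerValue++) {
          int viaShift = bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
          int viaArithmetic = BLOCK_SIZE * bitsPerValue / 8;
          if (viaShift != viaArithmetic) {
            throw new AssertionError("mismatch at " + bitsPerValue);
          }
        }
        System.out.println("numBytes(bpv) == 128 * bpv / 8 for bpv in [1, 32]");
      }
    }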
*/ - int numBytes(int bitsPerValue) { + static int numBytes(int bitsPerValue) { return bitsPerValue << (BLOCK_SIZE_LOG2 - 3); } - private static void decodeSlow(int bitsPerValue, DataInput in, long[] tmp, long[] longs) + static void decodeSlow(int bitsPerValue, IndexInput in, long[] tmp, long[] longs) throws IOException { final int numLongs = bitsPerValue << 1; - in.readLongs(tmp, 0, numLongs); final long mask = MASKS32[bitsPerValue]; - int longsIdx = 0; - int shift = 32 - bitsPerValue; - for (; shift >= 0; shift -= bitsPerValue) { - shiftLongs(tmp, numLongs, longs, longsIdx, shift, mask); - longsIdx += numLongs; - } - final int remainingBitsPerLong = shift + bitsPerValue; + splitLongs(in, numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L); + final int remainingBitsPerLong = 32 - bitsPerValue; final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong]; int tmpIdx = 0; int remainingBits = remainingBitsPerLong; - for (; longsIdx < BLOCK_SIZE / 2; ++longsIdx) { + for (int longsIdx = numLongs; longsIdx < BLOCK_SIZE / 2; ++longsIdx) { int b = bitsPerValue - remainingBits; long l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b; while (b >= remainingBitsPerLong) { @@ -356,41 +247,22 @@ } } - /** - * The pattern that this shiftLongs method applies is recognized by the C2 compiler, which - * generates SIMD instructions for it in order to shift multiple longs at once. - */ - private static void shiftLongs(long[] a, int count, long[] b, int bi, int shift, long mask) { + static void splitLongs( + IndexInput in, int count, long[] b, int bShift, int dec, long bMask, long[] c, int cIndex, long cMask) + throws IOException { + // takes advantage of the C2 compiler's loop unrolling and auto-vectorization. + in.readLongs(c, cIndex, count); + int maxIter = (bShift - 1) / dec; for (int i = 0; i < count; ++i) { - b[bi + i] = (a[i] >>> shift) & mask; + for (int j = 0; j <= maxIter; ++j) { + b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask; + } + c[cIndex + i] &= cMask; } } """ -def writeRemainderWithSIMDOptimize(bpv, next_primitive, remaining_bits_per_long, o, num_values, f): - iteration = 1 - num_longs = bpv * num_values / remaining_bits_per_long - while num_longs % 2 == 0 and num_values % 2 == 0: - num_longs /= 2 - num_values /= 2 - iteration *= 2 - - f.write(' shiftLongs(tmp, %d, tmp, 0, 0, MASK%d_%d);\n' % (iteration * num_longs, next_primitive, remaining_bits_per_long)) - f.write(' for (int iter = 0, tmpIdx = 0, longsIdx = %d; iter < %d; ++iter, tmpIdx += %d, longsIdx += %d) {\n' %(o, iteration, num_longs, num_values)) - tmp_idx = 0 - b = bpv - b -= remaining_bits_per_long - f.write(' long l0 = tmp[tmpIdx + %d] << %d;\n' %(tmp_idx, b)) - tmp_idx += 1 - while b >= remaining_bits_per_long: - b -= remaining_bits_per_long - f.write(' l0 |= tmp[tmpIdx + %d] << %d;\n' %(tmp_idx, b)) - tmp_idx += 1 - f.write(' longs[longsIdx + 0] = l0;\n') - f.write(' }\n') - - def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values, f): iteration = 1 num_longs = bpv * num_values / remaining_bits_per_long @@ -406,14 +278,14 @@ def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values, b = bpv if remaining_bits == 0: b -= remaining_bits_per_long - f.write(' long l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits_per_long, b)) + f.write(' long l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) else: b -= remaining_bits f.write(' long l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, 
remaining_bits, b)) tmp_idx += 1 while b >= remaining_bits_per_long: b -= remaining_bits_per_long - f.write(' l%d |= (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits_per_long, b)) + f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) tmp_idx += 1 if b > 0: f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_long-b, next_primitive, b)) @@ -428,31 +300,28 @@ def writeDecode(bpv, f): next_primitive = 8 elif bpv <= 16: next_primitive = 16 - f.write(' private static void decode%d(DataInput in, long[] tmp, long[] longs) throws IOException {\n' %bpv) - num_values_per_long = 64 / next_primitive if bpv == next_primitive: + f.write(' static void decode%d(IndexInput in, long[] longs) throws IOException {\n' %bpv) f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2)) else: - f.write(' in.readLongs(tmp, 0, %d);\n' %(bpv*2)) - shift = next_primitive - bpv - o = 0 - while shift >= 0: - f.write(' shiftLongs(tmp, %d, longs, %d, %d, MASK%d_%d);\n' %(bpv*2, o, shift, next_primitive, bpv)) - o += bpv*2 - shift -= bpv - if shift + bpv > 0: - if bpv % (next_primitive % bpv) == 0: - writeRemainderWithSIMDOptimize(bpv, next_primitive, shift + bpv, o, 128/num_values_per_long - o, f) - else: - writeRemainder(bpv, next_primitive, shift + bpv, o, 128/num_values_per_long - o, f) + num_values_per_long = 64 / next_primitive + remaining_bits = next_primitive % bpv + num_iters = (next_primitive - 1) // bpv + o = 2 * bpv * num_iters + if remaining_bits == 0: + f.write(' static void decode%d(IndexInput in, long[] longs) throws IOException {\n' %bpv) + f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv)) + else: + f.write(' static void decode%d(IndexInput in, long[] tmp, long[] longs) throws IOException {\n' %bpv) + f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv)) + writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f) f.write(' }\n') - if __name__ == '__main__': f = open(OUTPUT_FILE, 'w') f.write(HEADER) for primitive_size in PRIMITIVE_SIZE: - f.write(' private static final long[] MASKS%d = new long[%d];\n' %(primitive_size, primitive_size)) + f.write(' static final long[] MASKS%d = new long[%d];\n' %(primitive_size, primitive_size)) f.write('\n') f.write(' static {\n') for primitive_size in PRIMITIVE_SIZE: @@ -466,12 +335,11 @@ def writeDecode(bpv, f): """) for primitive_size in PRIMITIVE_SIZE: for bpv in range(1, min(MAX_SPECIALIZED_BITS_PER_VALUE + 1, primitive_size)): - if bpv * 2 != primitive_size or primitive_size == 8: - f.write(' private static final long MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv)) + f.write(' static final long MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv)) f.write(""" /** Decode 128 integers into {@code longs}. 
*/ - void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException { + void decode(int bitsPerValue, IndexInput in, long[] longs) throws IOException { switch (bitsPerValue) { """) for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): @@ -481,7 +349,10 @@ def writeDecode(bpv, f): elif bpv <= 16: next_primitive = 16 f.write(' case %d:\n' %bpv) - f.write(' decode%d(in, tmp, longs);\n' %bpv) + if next_primitive % bpv == 0: + f.write(' decode%d(in, longs);\n' %bpv) + else: + f.write(' decode%d(in, tmp, longs);\n' %bpv) f.write(' expand%d(longs);\n' %next_primitive) f.write(' break;\n') f.write(' default:\n') @@ -491,31 +362,6 @@ def writeDecode(bpv, f): f.write(' }\n') f.write(' }\n') - f.write(""" - /** - * Delta-decode 128 integers into {@code longs}. - */ - void decodeAndPrefixSum(int bitsPerValue, DataInput in, long base, long[] longs) throws IOException { - switch (bitsPerValue) { -""") - for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): - next_primitive = 32 - if bpv <= 8: - next_primitive = 8 - elif bpv <= 16: - next_primitive = 16 - f.write(' case %d:\n' %bpv) - f.write(' decode%d(in, tmp, longs);\n' %bpv) - f.write(' prefixSum%d(longs, base);\n' %next_primitive) - f.write(' break;\n') - f.write(' default:\n') - f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n') - f.write(' prefixSum32(longs, base);\n') - f.write(' break;\n') - f.write(' }\n') - f.write(' }\n') - - f.write('\n') for i in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): writeDecode(i, f) if i < MAX_SPECIALIZED_BITS_PER_VALUE: diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java new file mode 100644 index 000000000000..76666469faaa --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Lucene 9.12 file format. 
*/ +package org.apache.lucene.backward_codecs.lucene912; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java index 833efdf80259..034967efbaab 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java @@ -21,8 +21,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; @@ -34,6 +32,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -53,13 +52,15 @@ */ public final class Lucene92HnswVectorsReader extends KnnVectorsReader { - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final IndexInput vectorData; private final IndexInput vectorIndex; private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); + private final FieldInfos fieldInfos; Lucene92HnswVectorsReader(SegmentReadState state) throws IOException { int versionMeta = readMetadata(state); + this.fieldInfos = state.fieldInfos; boolean success = false; try { vectorData = @@ -152,7 +153,7 @@ private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOExce } FieldEntry fieldEntry = readField(meta, info); validateFieldEntry(info, fieldEntry); - fields.put(info.name, fieldEntry); + fields.put(info.number, fieldEntry); } } @@ -212,10 +213,18 @@ public void checkIntegrity() throws IOException { CodecUtil.checksumEntireFile(vectorIndex); } + private FieldEntry getFieldEntry(String field) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + return fieldEntry; + } + @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - return OffHeapFloatVectorValues.load(fieldEntry, vectorData); + return OffHeapFloatVectorValues.load(getFieldEntry(field), vectorData); } @Override @@ -226,8 +235,7 @@ public ByteVectorValues getByteVectorValues(String field) { @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - FieldEntry fieldEntry = fields.get(field); - + final FieldEntry fieldEntry = getFieldEntry(field); if (fieldEntry.size() == 0) { return; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java index 19dc82cc46d5..7c87bac5e54a 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java +++ 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java @@ -26,12 +26,10 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. */ -abstract class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { +abstract class OffHeapFloatVectorValues extends FloatVectorValues { protected final int dimension; protected final int size; @@ -95,8 +93,6 @@ static OffHeapFloatVectorValues load( static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -106,32 +102,13 @@ public DenseOffHeapVectorValues( } @Override - public float[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; + public DenseOffHeapVectorValues copy() throws IOException { + return new DenseOffHeapVectorValues(dimension, size, vectorSimilarityFunction, slice.clone()); } @Override - public DenseOffHeapVectorValues copy() throws IOException { - return new DenseOffHeapVectorValues(dimension, size, vectorSimilarityFunction, slice.clone()); + public DocIndexIterator iterator() { + return createDenseIterator(); } @Override @@ -142,15 +119,17 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] query) throws IOException { DenseOffHeapVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare(values.vectorValue(), query); + return values.vectorSimilarityFunction.compare( + values.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } @@ -186,33 +165,17 @@ public SparseOffHeapVectorValues( fieldEntry.size()); } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( fieldEntry, dataIn, vectorSimilarityFunction, slice.clone()); } + @Override + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); + } + @Override public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); @@ -239,15 +202,17 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { SparseOffHeapVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return 
values.vectorSimilarityFunction.compare(values.vectorValue(), query); + return values.vectorSimilarityFunction.compare( + values.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } @@ -259,8 +224,6 @@ public EmptyOffHeapVectorValues(int dimension) { super(dimension, 0, VectorSimilarityFunction.COSINE, null); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -271,26 +234,6 @@ public int size() { return 0; } - @Override - public float[] vectorValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - return doc = NO_MORE_DOCS; - } - @Override public OffHeapFloatVectorValues copy() throws IOException { throw new UnsupportedOperationException(); @@ -306,6 +249,11 @@ public int ordToDoc(int ord) { throw new UnsupportedOperationException(); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return null; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsReader.java index a948ab7bee3f..1ad2e3023642 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsReader.java @@ -21,8 +21,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; @@ -35,6 +33,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -54,13 +53,15 @@ */ public final class Lucene94HnswVectorsReader extends KnnVectorsReader { - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final IndexInput vectorData; private final IndexInput vectorIndex; private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); + private final FieldInfos fieldInfos; Lucene94HnswVectorsReader(SegmentReadState state) throws IOException { int versionMeta = readMetadata(state); + this.fieldInfos = state.fieldInfos; boolean success = false; try { vectorData = @@ -153,7 +154,7 @@ private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOExce } FieldEntry fieldEntry = readField(meta, info); validateFieldEntry(info, fieldEntry); - fields.put(info.name, fieldEntry); + fields.put(info.number, fieldEntry); } } @@ -230,42 +231,41 @@ public void checkIntegrity() throws IOException { CodecUtil.checksumEntireFile(vectorIndex); } - @Override - public FloatVectorValues getFloatVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if 
(fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { + private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + if (fieldEntry.vectorEncoding != expectedEncoding) { throw new IllegalArgumentException( "field=\"" + field + "\" is encoded as: " + fieldEntry.vectorEncoding + " expected: " - + VectorEncoding.FLOAT32); + + expectedEncoding); } + return fieldEntry; + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); return OffHeapFloatVectorValues.load(fieldEntry, vectorData); } @Override public ByteVectorValues getByteVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) { - throw new IllegalArgumentException( - "field=\"" - + field - + "\" is encoded as: " - + fieldEntry.vectorEncoding - + " expected: " - + VectorEncoding.BYTE); - } + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); return OffHeapByteVectorValues.load(fieldEntry, vectorData); } @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - FieldEntry fieldEntry = fields.get(field); - - if (fieldEntry.size() == 0 || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); + if (fieldEntry.size() == 0 || knnCollector.k() == 0) { return; } @@ -283,9 +283,8 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits @Override public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - FieldEntry fieldEntry = fields.get(field); - - if (fieldEntry.size() == 0 || fieldEntry.vectorEncoding != VectorEncoding.BYTE) { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); + if (fieldEntry.size() == 0 || knnCollector.k() == 0) { return; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java index 0c909e3839df..0c428bb169f3 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java @@ -28,12 +28,10 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. 
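The reader changes above replace name-keyed HashMaps with field-number-keyed IntObjectHashMaps plus a getFieldEntry helper that resolves the name through FieldInfos, rejects unknown fields, and checks the vector encoding. Below is a condensed sketch of that lookup pattern; FieldInfo, FieldEntry and the two maps are simplified stand-ins, not the real Lucene types.

    import java.util.HashMap;
    import java.util.Map;

    // Illustrative only: resolve a field name to its number, then fetch the per-field
    // entry by number, failing fast on unknown fields or mismatched encodings.
    class FieldEntryLookupSketch {
      record FieldInfo(String name, int number) {}
      record FieldEntry(int number, String vectorEncoding) {}

      private final Map<String, FieldInfo> fieldInfosByName = new HashMap<>();
      private final Map<Integer, FieldEntry> fieldsByNumber = new HashMap<>();

      FieldEntry getFieldEntry(String field, String expectedEncoding) {
        FieldInfo info = fieldInfosByName.get(field);
        FieldEntry entry;
        if (info == null || (entry = fieldsByNumber.get(info.number())) == null) {
          throw new IllegalArgumentException("field=\"" + field + "\" not found");
        }
        if (!entry.vectorEncoding().equals(expectedEncoding)) {
          throw new IllegalArgumentException(
              "field=\"" + field + "\" is encoded as: " + entry.vectorEncoding()
                  + " expected: " + expectedEncoding);
        }
        return entry;
      }

      public static void main(String[] args) {
        FieldEntryLookupSketch reader = new FieldEntryLookupSketch();
        reader.fieldInfosByName.put("vec", new FieldInfo("vec", 3));
        reader.fieldsByNumber.put(3, new FieldEntry(3, "FLOAT32"));
        System.out.println(reader.getFieldEntry("vec", "FLOAT32"));
      }
    }

Keying by field number lets the entries live in a primitive-keyed map; the name is still resolved once through FieldInfos, which also gives a single place to reject unknown fields.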
*/ -abstract class OffHeapByteVectorValues extends ByteVectorValues - implements RandomAccessVectorValues.Bytes { +abstract class OffHeapByteVectorValues extends ByteVectorValues { protected final int dimension; protected final int size; @@ -108,8 +106,6 @@ static OffHeapByteVectorValues load( static class DenseOffHeapVectorValues extends OffHeapByteVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -119,36 +115,17 @@ public DenseOffHeapVectorValues( super(dimension, size, slice, vectorSimilarityFunction, byteSize); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( dimension, size, slice.clone(), vectorSimilarityFunction, byteSize); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -157,15 +134,16 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(byte[] query) throws IOException { DenseOffHeapVectorValues copy = this.copy(); + DocIndexIterator iterator = copy.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(copy.vectorValue(), query); + return vectorSimilarityFunction.compare(copy.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -202,27 +180,6 @@ public SparseOffHeapVectorValues( fieldEntry.size()); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( @@ -234,6 +191,11 @@ public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); } + @Override + public DocIndexIterator iterator() { + return fromDISI(disi); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { if (acceptDocs == null) { @@ -255,15 +217,16 @@ public int length() { @Override public VectorScorer scorer(byte[] query) throws IOException { SparseOffHeapVectorValues copy = this.copy(); + IndexedDISI disi = copy.disi; return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(copy.vectorValue(), query); + return vectorSimilarityFunction.compare(copy.vectorValue(disi.index()), query); } @Override public DocIdSetIterator iterator() { - return copy; + return disi; } }; } @@ -275,8 +238,6 @@ public EmptyOffHeapVectorValues(int dimension) { super(dimension, 0, null, VectorSimilarityFunction.COSINE, 0); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -287,26 +248,6 @@ public int size() { return 0; } - @Override - public byte[] vectorValue() throws 
IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - return doc = NO_MORE_DOCS; - } - @Override public OffHeapByteVectorValues copy() throws IOException { throw new UnsupportedOperationException(); @@ -322,6 +263,11 @@ public int ordToDoc(int ord) { throw new UnsupportedOperationException(); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return null; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java index 91f97b8a41fa..b21df901ddb6 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java @@ -26,12 +26,10 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. */ -abstract class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { +abstract class OffHeapFloatVectorValues extends FloatVectorValues { protected final int dimension; protected final int size; @@ -104,8 +102,6 @@ static OffHeapFloatVectorValues load( static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -115,36 +111,17 @@ public DenseOffHeapVectorValues( super(dimension, size, slice, vectorSimilarityFunction, byteSize); } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( dimension, size, slice.clone(), vectorSimilarityFunction, byteSize); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -153,15 +130,18 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] query) throws IOException { DenseOffHeapVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); + return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare(values.vectorValue(), query); + return values.vectorSimilarityFunction.compare( + values.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } @@ -198,33 +178,17 @@ public SparseOffHeapVectorValues( fieldEntry.size()); } - @Override - 
public float[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( fieldEntry, dataIn, slice.clone(), vectorSimilarityFunction, byteSize); } + @Override + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); + } + @Override public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); @@ -251,15 +215,17 @@ public int length() { @Override public VectorScorer scorer(float[] query) throws IOException { SparseOffHeapVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare(values.vectorValue(), query); + return values.vectorSimilarityFunction.compare( + values.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } @@ -271,8 +237,6 @@ public EmptyOffHeapVectorValues(int dimension) { super(dimension, 0, null, VectorSimilarityFunction.COSINE, 0); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -283,26 +247,6 @@ public int size() { return 0; } - @Override - public float[] vectorValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - return doc = NO_MORE_DOCS; - } - @Override public OffHeapFloatVectorValues copy() throws IOException { throw new UnsupportedOperationException(); @@ -318,6 +262,11 @@ public int ordToDoc(int ord) { throw new UnsupportedOperationException(); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return null; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsReader.java index 1b74ff94c18c..b5859daf9f2f 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsReader.java @@ -21,8 +21,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; @@ -39,6 +37,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -61,7 +60,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements HnswGraphProvider { 
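The OffHeap*VectorValues changes above delete the hand-rolled docID/nextDoc/advance state and instead hand out a doc-index iterator; scorers then read vectors by ordinal via vectorValue(iterator.index()). Here is a toy version of that access pattern; Cursor and DenseCursor are invented stand-ins rather than Lucene's DocIndexIterator.

    // Illustrative only: iterate documents through a cursor and fetch each vector by ordinal.
    class IteratorAccessSketch {
      interface Cursor {
        int nextDoc();   // next docID, or Integer.MAX_VALUE when exhausted
        int index();     // ordinal of the current document's vector
      }

      static final class DenseCursor implements Cursor {
        private final int size;
        private int doc = -1;
        DenseCursor(int size) { this.size = size; }
        public int nextDoc() { return ++doc < size ? doc : Integer.MAX_VALUE; }
        public int index() { return doc; }        // dense case: ordinal == docID
      }

      public static void main(String[] args) {
        float[][] vectors = {{1f, 0f}, {0f, 1f}, {1f, 1f}};   // stand-in for vectorValue(ord)
        Cursor cursor = new DenseCursor(vectors.length);
        for (int doc = cursor.nextDoc(); doc != Integer.MAX_VALUE; doc = cursor.nextDoc()) {
          float[] v = vectors[cursor.index()];
          System.out.println("doc " + doc + " -> " + java.util.Arrays.toString(v));
        }
      }
    }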
private final FieldInfos fieldInfos; - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final IndexInput vectorData; private final IndexInput vectorIndex; private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); @@ -161,7 +160,7 @@ private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOExce } FieldEntry fieldEntry = readField(meta, info); validateFieldEntry(info, fieldEntry); - fields.put(info.name, fieldEntry); + fields.put(info.number, fieldEntry); } } @@ -238,18 +237,27 @@ public void checkIntegrity() throws IOException { CodecUtil.checksumEntireFile(vectorIndex); } - @Override - public FloatVectorValues getFloatVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { + private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + if (fieldEntry.vectorEncoding != expectedEncoding) { throw new IllegalArgumentException( "field=\"" + field + "\" is encoded as: " + fieldEntry.vectorEncoding + " expected: " - + VectorEncoding.FLOAT32); + + expectedEncoding); } + return fieldEntry; + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); return OffHeapFloatVectorValues.load( fieldEntry.similarityFunction, defaultFlatVectorScorer, @@ -263,16 +271,7 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException { @Override public ByteVectorValues getByteVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) { - throw new IllegalArgumentException( - "field=\"" - + field - + "\" is encoded as: " - + fieldEntry.vectorEncoding - + " expected: " - + VectorEncoding.BYTE); - } + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); return OffHeapByteVectorValues.load( fieldEntry.similarityFunction, defaultFlatVectorScorer, @@ -287,11 +286,8 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - FieldEntry fieldEntry = fields.get(field); - - if (fieldEntry.size() == 0 - || knnCollector.k() == 0 - || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); + if (fieldEntry.size() == 0 || knnCollector.k() == 0) { return; } @@ -318,11 +314,8 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits @Override public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - FieldEntry fieldEntry = fields.get(field); - - if (fieldEntry.size() == 0 - || knnCollector.k() == 0 - || fieldEntry.vectorEncoding != VectorEncoding.BYTE) { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); + if (fieldEntry.size() == 0 || knnCollector.k() == 0) { return; } @@ -349,12 +342,12 @@ public void search(String field, byte[] target, KnnCollector knnCollector, Bits /** Get knn 
graph values; used for testing */ @Override public HnswGraph getGraph(String field) throws IOException { - FieldInfo info = fieldInfos.fieldInfo(field); - if (info == null) { - throw new IllegalArgumentException("No such field '" + field + "'"); + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry entry; + if (info == null || (entry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); } - FieldEntry entry = fields.get(field); - if (entry != null && entry.vectorIndexLength > 0) { + if (entry.vectorIndexLength > 0) { return getGraph(entry); } else { return HnswGraph.EMPTY; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99Codec.java index d540abc85c28..bb7764da8e86 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99Codec.java @@ -17,6 +17,7 @@ package org.apache.lucene.backward_codecs.lucene99; import java.util.Objects; +import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.DocValuesFormat; @@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99SkipWriter.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99SkipWriter.java index 495cf07d9afd..0c8db5bd0f25 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99SkipWriter.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99SkipWriter.java @@ -46,10 +46,10 @@ * uptos(position, payload). 4. start offset. 
*/ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter { - private int[] lastSkipDoc; - private long[] lastSkipDocPointer; - private long[] lastSkipPosPointer; - private long[] lastSkipPayPointer; + private final int[] lastSkipDoc; + private final long[] lastSkipDocPointer; + private final long[] lastSkipPosPointer; + private final long[] lastSkipPayPointer; private final IndexOutput docOut; private final IndexOutput posOut; @@ -61,7 +61,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter { private long curPayPointer; private int curPosBufferUpto; private int curPayloadByteUpto; - private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms; + private final CompetitiveImpactAccumulator[] curCompetitiveFreqNorms; private boolean fieldHasPositions; private boolean fieldHasOffsets; private boolean fieldHasPayloads; @@ -85,7 +85,12 @@ public Lucene99SkipWriter( lastSkipPosPointer = new long[maxSkipLevels]; if (payOut != null) { lastSkipPayPointer = new long[maxSkipLevels]; + } else { + lastSkipPayPointer = null; } + } else { + lastSkipPosPointer = null; + lastSkipPayPointer = null; } curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels]; for (int i = 0; i < maxSkipLevels; ++i) { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/PostingsUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/PostingsUtil.java index 7b95bada5bc7..dce8c2b145d5 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/PostingsUtil.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/PostingsUtil.java @@ -19,6 +19,7 @@ import java.io.IOException; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.GroupVIntUtil; /** Utility class to encode/decode postings block. 
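Making the skip writer's pointer arrays final, as above, obliges every constructor path to assign them, which is why the patch adds else branches that set the unused arrays to null. A tiny sketch, with invented names, of why javac insists on this:

    // Illustrative only: a blank final field must be assigned exactly once on every
    // constructor path, so optional state needs an explicit null in the else branch.
    class FinalFieldSketch {
      private final long[] posPointers;
      private final long[] payPointers;

      FinalFieldSketch(int levels, boolean hasPositions, boolean hasPayloads) {
        if (hasPositions) {
          posPointers = new long[levels];
          if (hasPayloads) {
            payPointers = new long[levels];
          } else {
            payPointers = null;   // without this branch the class would not compile
          }
        } else {
          posPointers = null;
          payPointers = null;
        }
      }

      public static void main(String[] args) {
        System.out.println(new FinalFieldSketch(4, true, false).payPointers == null); // true
      }
    }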
*/ final class PostingsUtil { @@ -35,7 +36,7 @@ static void readVIntBlock( boolean indexHasFreq, boolean decodeFreq) throws IOException { - docIn.readGroupVInts(docBuffer, num); + GroupVIntUtil.readGroupVInts(docIn, docBuffer, num); if (indexHasFreq && decodeFreq) { for (int i = 0; i < num; ++i) { freqBuffer[i] = docBuffer[i] & 0x01; diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index df14387fc688..ff4d7eeda4e9 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -23,3 +23,5 @@ org.apache.lucene.backward_codecs.lucene92.Lucene92Codec org.apache.lucene.backward_codecs.lucene94.Lucene94Codec org.apache.lucene.backward_codecs.lucene95.Lucene95Codec org.apache.lucene.backward_codecs.lucene99.Lucene99Codec +org.apache.lucene.backward_codecs.lucene912.Lucene912Codec +org.apache.lucene.backward_codecs.lucene100.Lucene100Codec diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 9733362abe79..49d917dc4273 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -17,3 +17,4 @@ org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat +org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWCompoundFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWCompoundFormat.java index 2817b19828db..cfd269d9dddc 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWCompoundFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWCompoundFormat.java @@ -81,9 +81,8 @@ public final class Lucene50RWCompoundFormat extends CompoundFormat { public Lucene50RWCompoundFormat() {} @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { - return new Lucene50CompoundReader(dir, si, context); + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { + return new Lucene50CompoundReader(dir, si); } @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java index bcfa479a058c..ca148491753f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java @@ -218,7 +218,7 @@ public void merge(MergeState mergeState) throws IOException { FieldInfos readerFieldInfos = mergeState.fieldInfos[i]; FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name); if 
(readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) { - PointValues bkdReader = reader60.readers.get(readerFieldInfo.number); + PointValues bkdReader = reader60.getValues(readerFieldInfo.name); if (bkdReader != null) { bkdReaders.add(bkdReader); docMaps.add(mergeState.docMaps[i]); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/bkd/BKDWriter60.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/bkd/BKDWriter60.java index dccd4f745abf..a1e304a3149d 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/bkd/BKDWriter60.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/bkd/BKDWriter60.java @@ -642,13 +642,13 @@ public long merge(IndexOutput out, List docMaps, List impacts = level1CompetitiveFreqNormAccumulator.getCompetitiveFreqNormPairs(); @@ -449,7 +449,7 @@ private void writeLevel1SkipData() throws IOException { maxNumImpactsAtLevel1 = impacts.size(); } writeImpacts(impacts, scratchOutput); - numImpactBytes = scratchOutput.size(); + long numImpactBytes = scratchOutput.size(); if (numImpactBytes > maxImpactNumBytesAtLevel1) { maxImpactNumBytesAtLevel1 = Math.toIntExact(numImpactBytes); } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/Lucene912RWPostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/Lucene912RWPostingsFormat.java new file mode 100644 index 000000000000..af1037432afa --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/Lucene912RWPostingsFormat.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene912; + +import java.io.IOException; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** Read-write impersonation of {@link Lucene912PostingsFormat}. */ +public final class Lucene912RWPostingsFormat extends Lucene912PostingsFormat { + + private final int minTermBlockSize; + private final int maxTermBlockSize; + + /** Creates {@code Lucene912PostingsFormat} with default settings. */ + public Lucene912RWPostingsFormat() { + this( + Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); + } + + /** + * Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code + * maxBlockSize} passed to block terms dictionary. 
+ * + * @see + * Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) + */ + public Lucene912RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + super(); + Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); + this.minTermBlockSize = minTermBlockSize; + this.maxTermBlockSize = maxTermBlockSize; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state); + boolean success = false; + try { + FieldsConsumer ret = + new Lucene90BlockTreeTermsWriter( + state, postingsWriter, minTermBlockSize, maxTermBlockSize); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestForDeltaUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestForDeltaUtil.java new file mode 100644 index 000000000000..471333b20f46 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestForDeltaUtil.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
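// A minimal usage sketch of the read-write format defined above: the two-argument constructor
// documented above takes the block-tree term dictionary's min/max block sizes (checked by
// Lucene90BlockTreeTermsWriter.validateSettings); the values here are simply the defaults.
PostingsFormat format =
    new Lucene912RWPostingsFormat(
        Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
        Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);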
+ */ +package org.apache.lucene.backward_codecs.lucene912; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestForDeltaUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 1, 31 - 7); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 1, (int) PackedInts.maxValue(bpv)); + } + } + + final Directory d = new ByteBuffersDirectory(); + final long endPointer; + + { + // encode + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + + for (int i = 0; i < iterations; ++i) { + long[] source = new long[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + } + forDeltaUtil.encodeDeltas(source, out); + } + endPointer = out.getFilePointer(); + out.close(); + } + + { + // decode + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + for (int i = 0; i < iterations; ++i) { + long base = 0; + final long[] restored = new long[ForUtil.BLOCK_SIZE]; + forDeltaUtil.decodeAndPrefixSum(in, base, restored); + final long[] expected = new long[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + expected[j] = values[i * ForUtil.BLOCK_SIZE + j]; + if (j > 0) { + expected[j] += expected[j - 1]; + } else { + expected[j] += base; + } + } + assertArrayEquals(Arrays.toString(restored), expected, restored); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + } + + d.close(); + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestForUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestForUtil.java new file mode 100644 index 000000000000..e728cb9e50d3 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestForUtil.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
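// In miniature, the relationship TestForDeltaUtil verifies above, with made-up numbers:
// decodeAndPrefixSum turns a block of deltas back into absolute values, so deltas {3, 1, 4}
// with base 2 must restore to {5, 6, 10}, i.e. restored[j] = base + d[0] + ... + d[j].
long base = 2;
long[] deltas = {3, 1, 4};
long[] restored = new long[deltas.length];
long acc = base;
for (int j = 0; j < deltas.length; j++) {
  acc += deltas[j];
  restored[j] = acc; // -> {5, 6, 10}
}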
+ */ +package org.apache.lucene.backward_codecs.lucene912; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestForUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 1, 31); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv)); + } + } + + final Directory d = new ByteBuffersDirectory(); + final long endPointer; + + { + // encode + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final ForUtil forUtil = new ForUtil(); + + for (int i = 0; i < iterations; ++i) { + long[] source = new long[ForUtil.BLOCK_SIZE]; + long or = 0; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + or |= source[j]; + } + final int bpv = PackedInts.bitsRequired(or); + out.writeByte((byte) bpv); + forUtil.encode(source, bpv, out); + } + endPointer = out.getFilePointer(); + out.close(); + } + + { + // decode + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + ForUtil forUtil = new ForUtil(); + for (int i = 0; i < iterations; ++i) { + final int bitsPerValue = in.readByte(); + final long currentFilePointer = in.getFilePointer(); + final long[] restored = new long[ForUtil.BLOCK_SIZE]; + forUtil.decode(bitsPerValue, in, restored); + int[] ints = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + ints[j] = Math.toIntExact(restored[j]); + } + assertArrayEquals( + Arrays.toString(ints), + ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE), + ints); + assertEquals(ForUtil.numBytes(bitsPerValue), in.getFilePointer() - currentFilePointer); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + } + + d.close(); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestLucene912PostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestLucene912PostingsFormat.java similarity index 96% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestLucene912PostingsFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestLucene912PostingsFormat.java index 1b8d0618c601..adf8aaf9ec76 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestLucene912PostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestLucene912PostingsFormat.java @@ -14,17 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.backward_codecs.lucene912; import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.List; +import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsReader.MutableImpactList; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; import org.apache.lucene.codecs.lucene90.blocktree.Stats; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader.MutableImpactList; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; @@ -45,7 +45,7 @@ public class TestLucene912PostingsFormat extends BasePostingsFormatTestCase { @Override protected Codec getCodec() { - return TestUtil.alwaysPostingsFormat(new Lucene912PostingsFormat()); + return TestUtil.alwaysPostingsFormat(new Lucene912RWPostingsFormat()); } public void testVInt15() throws IOException { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestPForUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestPForUtil.java new file mode 100644 index 000000000000..f3d550dff4d3 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestPForUtil.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
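// The codec-wrapping idiom these tests converge on, sketched with formats that appear in this
// change (the pairing below is illustrative): rather than subclassing a codec to override its
// per-field format, a single format is wrapped so it applies to every field under test.
Codec postingsCodec = TestUtil.alwaysPostingsFormat(new Lucene912RWPostingsFormat());
Codec knnCodec =
    TestUtil.alwaysKnnVectorsFormat(new Lucene99RWHnswScalarQuantizationVectorsFormat());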
+ */ +package org.apache.lucene.backward_codecs.lucene912; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestPForUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = createTestData(iterations, 31); + + final Directory d = new ByteBuffersDirectory(); + final long endPointer = encodeTestData(iterations, values, d); + + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + final PForUtil pforUtil = new PForUtil(); + for (int i = 0; i < iterations; ++i) { + if (random().nextInt(5) == 0) { + PForUtil.skip(in); + continue; + } + final long[] restored = new long[ForUtil.BLOCK_SIZE]; + pforUtil.decode(in, restored); + int[] ints = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + ints[j] = Math.toIntExact(restored[j]); + } + assertArrayEquals( + Arrays.toString(ints), + ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE), + ints); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + + d.close(); + } + + private int[] createTestData(int iterations, int maxBpv) { + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 0, maxBpv); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv)); + if (random().nextInt(100) == 0) { + final int exceptionBpv; + if (random().nextInt(10) == 0) { + exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 9, 16), maxBpv); + } else { + exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 1, 8), maxBpv); + } + values[i * ForUtil.BLOCK_SIZE + j] |= random().nextInt(1 << (exceptionBpv - bpv)) << bpv; + } + } + } + + return values; + } + + private long encodeTestData(int iterations, int[] values, Directory d) throws IOException { + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final PForUtil pforUtil = new PForUtil(); + + for (int i = 0; i < iterations; ++i) { + long[] source = new long[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + } + pforUtil.encode(source, out); + } + final long endPointer = out.getFilePointer(); + out.close(); + + return endPointer; + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestPostingsUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestPostingsUtil.java new file mode 100644 index 000000000000..869345ed4f8e --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestPostingsUtil.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
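// Roughly the patched-frame-of-reference idea that TestPForUtil's createTestData above
// exercises (numbers illustrative, not the codec's exact on-disk layout): most values in a
// block fit in a small bit width bpv, and an occasional larger "exception" keeps only its low
// bpv bits in the packed block while its high bits are recorded separately as a patch.
int bpv = 3;                                // regular bit width for the block
int exception = 9;                          // 0b1001, needs 4 bits
int lowBits = exception & ((1 << bpv) - 1); // 0b001, packed with the other values
int highBits = exception >>> bpv;           // 0b1, carried by the exception patch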
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene912; + +import java.io.IOException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestPostingsUtil extends LuceneTestCase { + + // checks for bug described in https://github.com/apache/lucene/issues/13373 + public void testIntegerOverflow() throws IOException { + final int size = random().nextInt(1, ForUtil.BLOCK_SIZE); + final long[] docDeltaBuffer = new long[size]; + final long[] freqBuffer = new long[size]; + + final int delta = 1 << 30; + docDeltaBuffer[0] = delta; + try (Directory dir = newDirectory()) { + try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) { + // In old implementation, this would cause integer overflow exception. + PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true); + } + long[] restoredDocs = new long[size]; + long[] restoredFreqs = new long[size]; + try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) { + PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true); + } + assertEquals(delta, restoredDocs[0]); + } + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java index caa8fc3da149..bf1c89a536d8 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java @@ -18,7 +18,6 @@ package org.apache.lucene.backward_codecs.lucene92; import static org.apache.lucene.backward_codecs.lucene92.Lucene92RWHnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; import java.nio.ByteBuffer; @@ -33,6 +32,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; @@ -43,7 +43,6 @@ import org.apache.lucene.util.hnsw.HnswGraphBuilder; import org.apache.lucene.util.hnsw.NeighborArray; import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.apache.lucene.util.packed.DirectMonotonicWriter; @@ -190,9 +189,12 @@ private static DocsWithFieldSet 
writeVectorData(IndexOutput output, FloatVectorV DocsWithFieldSet docsWithField = new DocsWithFieldSet(); ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); - for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) { + KnnVectorValues.DocIndexIterator iterator = vectors.iterator(); + for (int docV = iterator.nextDoc(); + docV != DocIdSetIterator.NO_MORE_DOCS; + docV = iterator.nextDoc()) { // write vector - float[] vectorValue = vectors.vectorValue(); + float[] vectorValue = vectors.vectorValue(iterator.index()); binaryVector.asFloatBuffer().put(vectorValue); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); @@ -277,7 +279,7 @@ private void writeMeta( } private OnHeapHnswGraph writeGraph( - RandomAccessVectorValues.Floats vectorValues, VectorSimilarityFunction similarityFunction) + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction) throws IOException { DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); // build graph diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java index 5189791ef17c..192f70a63972 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java @@ -67,4 +67,14 @@ public void testByteVectorScorerIteration() { public void testEmptyByteVectorData() { // unimplemented } + + @Override + public void testMergingWithDifferentByteKnnFields() { + // unimplemented + } + + @Override + public void testMismatchedFields() throws Exception { + // requires byte support + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java index 9726a3b19e87..01698da79893 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java @@ -36,6 +36,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; @@ -52,7 +53,6 @@ import org.apache.lucene.util.hnsw.HnswGraphBuilder; import org.apache.lucene.util.hnsw.NeighborArray; import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.apache.lucene.util.packed.DirectMonotonicWriter; @@ -216,9 +216,7 @@ private void writeSortingField(FieldWriter fieldData, int maxDoc, Sorter.DocM final int[] docIdOffsets = new int[sortMap.size()]; int offset = 1; // 0 means no vector for this (field, document) DocIdSetIterator iterator = fieldData.docsWithField.iterator(); - for (int docID = iterator.nextDoc(); - docID != DocIdSetIterator.NO_MORE_DOCS; - docID = iterator.nextDoc()) { + for (int docID = 
iterator.nextDoc(); docID != NO_MORE_DOCS; docID = iterator.nextDoc()) { int newDocID = sortMap.oldToNew(docID); docIdOffsets[newDocID] = offset++; } @@ -388,10 +386,14 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE // write the vector data to a temporary file DocsWithFieldSet docsWithField = switch (fieldInfo.getVectorEncoding()) { - case BYTE -> writeByteVectorData( - tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); - case FLOAT32 -> writeVectorData( - tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); + case BYTE -> + writeByteVectorData( + tempVectorData, + MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); + case FLOAT32 -> + writeVectorData( + tempVectorData, + MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); }; CodecUtil.writeFooter(tempVectorData); IOUtils.close(tempVectorData); @@ -552,9 +554,7 @@ private void writeMeta( final DirectMonotonicWriter ordToDocWriter = DirectMonotonicWriter.getInstance(meta, vectorData, count, DIRECT_MONOTONIC_BLOCK_SHIFT); DocIdSetIterator iterator = docsWithField.iterator(); - for (int doc = iterator.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = iterator.nextDoc()) { + for (int doc = iterator.nextDoc(); doc != NO_MORE_DOCS; doc = iterator.nextDoc()) { ordToDocWriter.add(doc); } ordToDocWriter.finish(); @@ -586,11 +586,10 @@ private void writeMeta( private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); - for (int docV = byteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = byteVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(); + byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); docsWithField.add(docV); @@ -604,14 +603,13 @@ private static DocsWithFieldSet writeByteVectorData( private static DocsWithFieldSet writeVectorData( IndexOutput output, FloatVectorValues floatVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); ByteBuffer binaryVector = ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); - for (int docV = floatVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = floatVectorValues.nextDoc()) { + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] vectorValue = floatVectorValues.vectorValue(); + float[] vectorValue = floatVectorValues.vectorValue(iter.index()); binaryVector.asFloatBuffer().put(vectorValue); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); @@ -638,18 +636,20 @@ static FieldWriter create(FieldInfo fieldInfo, int M, int beamWidth, InfoStre throws IOException { int dim = fieldInfo.getVectorDimension(); return switch (fieldInfo.getVectorEncoding()) { - case BYTE -> new FieldWriter(fieldInfo, M, beamWidth, infoStream) { - @Override - public byte[] copyValue(byte[] value) { - return ArrayUtil.copyOfSubArray(value, 0, dim); - } - }; - case 
FLOAT32 -> new FieldWriter(fieldInfo, M, beamWidth, infoStream) { - @Override - public float[] copyValue(float[] value) { - return ArrayUtil.copyOfSubArray(value, 0, dim); - } - }; + case BYTE -> + new FieldWriter(fieldInfo, M, beamWidth, infoStream) { + @Override + public byte[] copyValue(byte[] value) { + return ArrayUtil.copyOfSubArray(value, 0, dim); + } + }; + case FLOAT32 -> + new FieldWriter(fieldInfo, M, beamWidth, infoStream) { + @Override + public float[] copyValue(float[] value) { + return ArrayUtil.copyOfSubArray(value, 0, dim); + } + }; }; } @@ -663,12 +663,14 @@ public float[] copyValue(float[] value) { DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); RandomVectorScorerSupplier scorerSupplier = switch (fieldInfo.getVectorEncoding()) { - case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromBytes((List) vectors, dim)); - case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromFloats((List) vectors, dim)); + case BYTE -> + defaultFlatVectorScorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + ByteVectorValues.fromBytes((List) vectors, dim)); + case FLOAT32 -> + defaultFlatVectorScorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + FloatVectorValues.fromFloats((List) vectors, dim)); }; hnswGraphBuilder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed); @@ -693,9 +695,9 @@ public void addValue(int docID, Object value) throws IOException { lastDocID = docID; } - OnHeapHnswGraph getGraph() { + OnHeapHnswGraph getGraph() throws IOException { if (vectors.size() > 0) { - return hnswGraphBuilder.getGraph(); + return hnswGraphBuilder.getCompletedGraph(); } else { return null; } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java index c74d34fb9ad6..c855d8f5e073 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -18,6 +18,7 @@ package org.apache.lucene.backward_codecs.lucene95; import static org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues.hasVectorValues; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; @@ -38,6 +39,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; @@ -55,7 +57,6 @@ import org.apache.lucene.util.hnsw.IncrementalHnswGraphMerger; import org.apache.lucene.util.hnsw.NeighborArray; import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.apache.lucene.util.packed.DirectMonotonicWriter; @@ -220,9 +221,7 @@ private void writeSortingField(FieldWriter fieldData, 
int maxDoc, Sorter.DocM final int[] docIdOffsets = new int[sortMap.size()]; int offset = 1; // 0 means no vector for this (field, document) DocIdSetIterator iterator = fieldData.docsWithField.iterator(); - for (int docID = iterator.nextDoc(); - docID != DocIdSetIterator.NO_MORE_DOCS; - docID = iterator.nextDoc()) { + for (int docID = iterator.nextDoc(); docID != NO_MORE_DOCS; docID = iterator.nextDoc()) { int newDocID = sortMap.oldToNew(docID); docIdOffsets[newDocID] = offset++; } @@ -414,10 +413,14 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE // write the vector data to a temporary file DocsWithFieldSet docsWithField = switch (fieldInfo.getVectorEncoding()) { - case BYTE -> writeByteVectorData( - tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); - case FLOAT32 -> writeVectorData( - tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); + case BYTE -> + writeByteVectorData( + tempVectorData, + MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); + case FLOAT32 -> + writeVectorData( + tempVectorData, + MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); }; CodecUtil.writeFooter(tempVectorData); IOUtils.close(tempVectorData); @@ -472,19 +475,23 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE IncrementalHnswGraphMerger merger = new IncrementalHnswGraphMerger(fieldInfo, scorerSupplier, M, beamWidth); for (int i = 0; i < mergeState.liveDocs.length; i++) { - merger.addReader( - mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]); + if (hasVectorValues(mergeState.fieldInfos[i], fieldInfo.name)) { + merger.addReader( + mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]); + } } - DocIdSetIterator mergedVectorIterator = null; + KnnVectorValues mergedVectorValues = null; switch (fieldInfo.getVectorEncoding()) { - case BYTE -> mergedVectorIterator = - KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); - case FLOAT32 -> mergedVectorIterator = - KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + case BYTE -> + mergedVectorValues = + KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); + case FLOAT32 -> + mergedVectorValues = + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); } graph = merger.merge( - mergedVectorIterator, segmentWriteState.infoStream, docsWithField.cardinality()); + mergedVectorValues, segmentWriteState.infoStream, docsWithField.cardinality()); vectorIndexNodeOffsets = writeGraph(graph); } long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset; @@ -627,14 +634,13 @@ private void writeMeta( private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); - for (int docV = byteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = byteVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(); + byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); - 
docsWithField.add(docV); + docsWithField.add(docId); } return docsWithField; } @@ -648,11 +654,10 @@ private static DocsWithFieldSet writeVectorData( ByteBuffer buffer = ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); - for (int docV = floatVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = floatVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] value = floatVectorValues.vectorValue(); + float[] value = floatVectorValues.vectorValue(iter.index()); buffer.asFloatBuffer().put(value); output.writeBytes(buffer.array(), buffer.limit()); docsWithField.add(docV); @@ -680,18 +685,20 @@ static FieldWriter create(FieldInfo fieldInfo, int M, int beamWidth, InfoStre throws IOException { int dim = fieldInfo.getVectorDimension(); return switch (fieldInfo.getVectorEncoding()) { - case BYTE -> new FieldWriter(fieldInfo, M, beamWidth, infoStream) { - @Override - public byte[] copyValue(byte[] value) { - return ArrayUtil.copyOfSubArray(value, 0, dim); - } - }; - case FLOAT32 -> new FieldWriter(fieldInfo, M, beamWidth, infoStream) { - @Override - public float[] copyValue(float[] value) { - return ArrayUtil.copyOfSubArray(value, 0, dim); - } - }; + case BYTE -> + new FieldWriter(fieldInfo, M, beamWidth, infoStream) { + @Override + public byte[] copyValue(byte[] value) { + return ArrayUtil.copyOfSubArray(value, 0, dim); + } + }; + case FLOAT32 -> + new FieldWriter(fieldInfo, M, beamWidth, infoStream) { + @Override + public float[] copyValue(float[] value) { + return ArrayUtil.copyOfSubArray(value, 0, dim); + } + }; }; } @@ -704,12 +711,14 @@ public float[] copyValue(float[] value) { vectors = new ArrayList<>(); RandomVectorScorerSupplier scorerSupplier = switch (fieldInfo.getVectorEncoding()) { - case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromBytes((List) vectors, dim)); - case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromFloats((List) vectors, dim)); + case BYTE -> + defaultFlatVectorScorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + ByteVectorValues.fromBytes((List) vectors, dim)); + case FLOAT32 -> + defaultFlatVectorScorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + FloatVectorValues.fromFloats((List) vectors, dim)); }; hnswGraphBuilder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed); @@ -732,9 +741,9 @@ public void addValue(int docID, T vectorValue) throws IOException { lastDocID = docID; } - OnHeapHnswGraph getGraph() { + OnHeapHnswGraph getGraph() throws IOException { if (vectors.size() > 0) { - return hnswGraphBuilder.getGraph(); + return hnswGraphBuilder.getCompletedGraph(); } else { return null; } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99HnswScalarQuantizedVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99HnswScalarQuantizedVectorsFormat.java index 01d5b4ab17fc..268d231c4f49 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99HnswScalarQuantizedVectorsFormat.java +++ 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99HnswScalarQuantizedVectorsFormat.java @@ -18,17 +18,12 @@ package org.apache.lucene.backward_codecs.lucene99; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene99Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99RWHnswScalarQuantizationVectorsFormat(); - } - }; + return TestUtil.alwaysKnnVectorsFormat(new Lucene99RWHnswScalarQuantizationVectorsFormat()); } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java index ae5920de3685..edbed96be752 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java @@ -106,8 +106,8 @@ public abstract class BackwardsCompatibilityTestBase extends LuceneTestCase { * This is a base constructor for parameterized BWC tests. The constructor arguments are provided * by {@link com.carrotsearch.randomizedtesting.RandomizedRunner} during test execution. A {@link * com.carrotsearch.randomizedtesting.annotations.ParametersFactory} specified in a subclass - * provides a list lists of arguments for the tests and RandomizedRunner will execute the test for - * each of the argument list. + * provides a list of arguments for the tests and RandomizedRunner will execute the test for each + * of the argument list. 
* * @param version the version this test should run for * @param indexPattern an index pattern in order to open an index of see {@link diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestAncientIndicesCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestAncientIndicesCompatibility.java index 88adfadf1c88..cf7df98345d3 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestAncientIndicesCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestAncientIndicesCompatibility.java @@ -198,7 +198,7 @@ public void testUnsupportedOldIndexes() throws Exception { checker.setInfoStream(new PrintStream(bos, false, UTF_8)); checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS); CheckIndex.Status indexStatus = checker.checkIndex(); - if (version.startsWith("8.")) { + if (version.startsWith("8.") || version.startsWith("9.")) { assertTrue(indexStatus.clean); } else { assertFalse(indexStatus.clean); @@ -219,10 +219,11 @@ public void testUnsupportedOldIndexes() throws Exception { // #12895: test on a carefully crafted 9.8.0 index (from a small contiguous subset // of wikibigall unique terms) that shows the read-time exception of // IntersectTermsEnum (used by WildcardQuery) + @AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/13847") public void testWildcardQueryExceptions990() throws IOException { Path path = createTempDir("12895"); - String name = "index.12895.9.8.0.zip"; + String name = "unsupported.12895.9.8.0.zip"; InputStream resource = TestAncientIndicesCompatibility.class.getResourceAsStream(name); assertNotNull("missing zip file to reproduce #12895", resource); TestUtil.unzip(resource, path); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java index 8d35a1128be9..262567f9f765 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java @@ -17,7 +17,6 @@ package org.apache.lucene.backward_index; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; -import static org.apache.lucene.util.Version.LUCENE_9_0_0; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import java.io.IOException; @@ -52,6 +51,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LogByteSizeMergePolicy; import org.apache.lucene.index.MultiBits; @@ -95,7 +95,7 @@ public class TestBasicBackwardsCompatibility extends BackwardsCompatibilityTestB private static final int DOCS_COUNT = 35; private static final int DELETED_ID = 7; - private static final int KNN_VECTOR_MIN_SUPPORTED_VERSION = LUCENE_9_0_0.major; + private static final int KNN_VECTOR_MIN_SUPPORTED_VERSION = Version.fromBits(9, 0, 0).major; private static final String KNN_VECTOR_FIELD = "knn_field"; private static final FieldType KNN_VECTOR_FIELD_TYPE = KnnFloatVectorField.createFieldType(3, VectorSimilarityFunction.COSINE); @@ -477,10 +477,14 @@ public static void searchIndex( FloatVectorValues values = 
ctx.reader().getFloatVectorValues(KNN_VECTOR_FIELD); if (values != null) { assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), values.dimension()); - for (int doc = values.nextDoc(); doc != NO_MORE_DOCS; doc = values.nextDoc()) { + KnnVectorValues.DocIndexIterator it = values.iterator(); + for (int doc = it.nextDoc(); doc != NO_MORE_DOCS; doc = it.nextDoc()) { float[] expectedVector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * cnt}; assertArrayEquals( - "vectors do not match for doc=" + cnt, expectedVector, values.vectorValue(), 0); + "vectors do not match for doc=" + cnt, + expectedVector, + values.vectorValue(it.index()), + 0); cnt++; } } @@ -828,7 +832,7 @@ public void testAddOldIndexesReader() throws IOException { expectThrows(IllegalArgumentException.class, () -> TestUtil.addIndexesSlowly(w, reader)); assertEquals( e.getMessage(), - "Cannot merge a segment that has been created with major version 9 into this index which has been created by major version 10"); + "Cannot merge a segment that has been created with major version 10 into this index which has been created by major version 11"); w.close(); targetDir2.close(); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestDVUpdateBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestDVUpdateBackwardsCompatibility.java index 332daa621ed5..cfe29028cdb9 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestDVUpdateBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestDVUpdateBackwardsCompatibility.java @@ -58,7 +58,7 @@ public TestDVUpdateBackwardsCompatibility(Version version, String pattern) { public static Iterable testVersionsFactory() { List params = new ArrayList<>(); // TODO - WHY ONLY on the first major version? - params.add(new Object[] {Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)}); + params.add(new Object[] {Version.LUCENE_10_0_0, createPattern(INDEX_NAME, SUFFIX)}); return params; } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestEmptyIndexBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestEmptyIndexBackwardsCompatibility.java index 40fcd4c59bf9..2367e20d6a03 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestEmptyIndexBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestEmptyIndexBackwardsCompatibility.java @@ -53,14 +53,14 @@ protected void createIndex(Directory directory) throws IOException { public static Iterable testVersionsFactory() { List params = new ArrayList<>(); // TODO - WHY ONLY on the first major version? 
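// The per-document vector access pattern this change migrates to, in miniature; "values" is
// assumed to be a FloatVectorValues obtained from a LeafReader, as in searchIndex above.
// Iteration now goes through KnnVectorValues.DocIndexIterator, and the vector is fetched by
// ordinal via iterator.index() instead of the removed nextDoc()/vectorValue() pair on the
// values object itself.
KnnVectorValues.DocIndexIterator it = values.iterator();
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
  float[] vector = values.vectorValue(it.index()); // vector for the current doc
  // ... use doc and vector ...
}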
- params.add(new Object[] {Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)}); + params.add(new Object[] {Version.LUCENE_10_0_0, createPattern(INDEX_NAME, SUFFIX)}); return params; } public void testUpgradeEmptyOldIndex() throws Exception { try (Directory dir = newDirectory(directory)) { TestIndexUpgradeBackwardsCompatibility.newIndexUpgrader(dir).upgrade(); - TestIndexUpgradeBackwardsCompatibility.checkAllSegmentsUpgraded(dir, 9); + TestIndexUpgradeBackwardsCompatibility.checkAllSegmentsUpgraded(dir, 10); } } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java index 936a4c28cf24..6989731ae141 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java @@ -39,7 +39,7 @@ public class TestGenerateBwcIndices extends LuceneTestCase { // To generate backcompat indexes with the current default codec, run the following gradle // command: // gradlew test -Ptests.bwcdir=/path/to/store/indexes -Ptests.codec=default - // -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices + // -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices --max-workers=1 // // Also add testmethod with one of the index creation methods below, for example: // -Ptestmethod=testCreateCFS @@ -82,14 +82,14 @@ public void testCreateSortedIndex() throws IOException { sortedTest.createBWCIndex(); } - public void testCreateInt8HNSWIndices() throws IOException { - TestInt8HnswBackwardsCompatibility int8HnswBackwardsCompatibility = - new TestInt8HnswBackwardsCompatibility( + public void testCreateInt7HNSWIndices() throws IOException { + TestInt7HnswBackwardsCompatibility int7HnswBackwardsCompatibility = + new TestInt7HnswBackwardsCompatibility( Version.LATEST, createPattern( - TestInt8HnswBackwardsCompatibility.INDEX_NAME, - TestInt8HnswBackwardsCompatibility.SUFFIX)); - int8HnswBackwardsCompatibility.createBWCIndex(); + TestInt7HnswBackwardsCompatibility.INDEX_NAME, + TestInt7HnswBackwardsCompatibility.SUFFIX)); + int7HnswBackwardsCompatibility.createBWCIndex(); } private boolean isInitialMajorVersionRelease() { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java index 82de070189cf..ad5432b91ad6 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java @@ -55,7 +55,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT static final String INDEX_NAME = "sorted"; static final String SUFFIX = ""; - private static final Version FIRST_PARENT_DOC_VERSION = Version.LUCENE_9_11_0; + private static final Version FIRST_PARENT_DOC_VERSION = Version.fromBits(9, 11, 0); private static final String PARENT_FIELD_NAME = "___parent"; public TestIndexSortBackwardsCompatibility(Version version, String pattern) { @@ -72,7 +72,7 @@ public void testSortedIndexAddDocBlocks() throws Exception { final Sort sort; try (DirectoryReader reader = DirectoryReader.open(directory)) { assertEquals(1, reader.leaves().size()); - sort = 
reader.leaves().get(0).reader().getMetaData().getSort(); + sort = reader.leaves().get(0).reader().getMetaData().sort(); assertNotNull(sort); searchExampleIndex(reader); } @@ -125,8 +125,8 @@ public void testSortedIndexAddDocBlocks() throws Exception { .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST) .build(), 2); - assertEquals(2, children.totalHits.value); - assertEquals(1, parents.totalHits.value); + assertEquals(2, children.totalHits.value()); + assertEquals(1, parents.totalHits.value()); // make sure it's sorted assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc); assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc); @@ -140,7 +140,7 @@ public void testSortedIndexAddDocBlocks() throws Exception { public void testSortedIndex() throws Exception { try (DirectoryReader reader = DirectoryReader.open(directory)) { assertEquals(1, reader.leaves().size()); - Sort sort = reader.leaves().get(0).reader().getMetaData().getSort(); + Sort sort = reader.leaves().get(0).reader().getMetaData().sort(); assertNotNull(sort); assertEquals("!", sort.toString()); // This will confirm the docs are really sorted @@ -195,28 +195,28 @@ public static void searchExampleIndex(DirectoryReader reader) throws IOException IndexSearcher searcher = newSearcher(reader); TopDocs topDocs = searcher.search(new FieldExistsQuery("titleTokenized"), 10); - assertEquals(50, topDocs.totalHits.value); + assertEquals(50, topDocs.totalHits.value()); topDocs = searcher.search(new FieldExistsQuery("titleDV"), 10); - assertEquals(50, topDocs.totalHits.value); + assertEquals(50, topDocs.totalHits.value()); topDocs = searcher.search( IntPoint.newRangeQuery("docid_int", 42, 44), 10, new Sort(new SortField("docid_intDV", SortField.Type.INT))); - assertEquals(3, topDocs.totalHits.value); + assertEquals(3, topDocs.totalHits.value()); assertEquals(3, topDocs.scoreDocs.length); assertEquals(42, ((FieldDoc) topDocs.scoreDocs[0]).fields[0]); assertEquals(43, ((FieldDoc) topDocs.scoreDocs[1]).fields[0]); assertEquals(44, ((FieldDoc) topDocs.scoreDocs[2]).fields[0]); topDocs = searcher.search(new TermQuery(new Term("body", "the")), 5); - assertTrue(topDocs.totalHits.value > 0); + assertTrue(topDocs.totalHits.value() > 0); topDocs = searcher.search( new MatchAllDocsQuery(), 5, new Sort(new SortField("dateDV", SortField.Type.LONG))); - assertEquals(50, topDocs.totalHits.value); + assertEquals(50, topDocs.totalHits.value()); assertEquals(5, topDocs.scoreDocs.length); long firstDate = (Long) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]; long lastDate = (Long) ((FieldDoc) topDocs.scoreDocs[4]).fields[0]; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt8HnswBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java similarity index 72% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt8HnswBackwardsCompatibility.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java index 8db406df992b..384ce889c5a1 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt8HnswBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java @@ -20,20 +20,23 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import java.io.IOException; -import 
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.StringField; +import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; @@ -41,42 +44,38 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.Version; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; -public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase { +public class TestInt7HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase { - static final String INDEX_NAME = "int8_hnsw"; + static final String INDEX_NAME = "int7_hnsw"; static final String SUFFIX = ""; - private static final Version FIRST_INT8_HNSW_VERSION = Version.LUCENE_9_10_0; + private static final Version FIRST_INT7_HNSW_VERSION = Version.fromBits(9, 10, 0); private static final String KNN_VECTOR_FIELD = "knn_field"; private static final int DOC_COUNT = 30; private static final FieldType KNN_VECTOR_FIELD_TYPE = KnnFloatVectorField.createFieldType(3, VectorSimilarityFunction.COSINE); private static final float[] KNN_VECTOR = {0.2f, -0.1f, 0.1f}; - public TestInt8HnswBackwardsCompatibility(Version version, String pattern) { + public TestInt7HnswBackwardsCompatibility(Version version, String pattern) { super(version, pattern); } - /** Provides all sorted versions to the test-framework */ @ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s") public static Iterable testVersionsFactory() throws IllegalAccessException { return allVersion(INDEX_NAME, SUFFIX); } protected Codec getCodec() { - return new Lucene99Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99HnswScalarQuantizedVectorsFormat( + return TestUtil.alwaysKnnVectorsFormat( + new Lucene99HnswScalarQuantizedVectorsFormat( Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN, - Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH); - } - }; + Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH)); } @Override protected boolean supportsVersion(Version version) { - return version.onOrAfter(FIRST_INT8_HNSW_VERSION); + return version.onOrAfter(FIRST_INT7_HNSW_VERSION); } @Override @@ -84,7 +83,7 @@ void verifyUsesDefaultCodec(Directory dir, String name) throws IOException { // We don't use the default codec } - public void testInt8HnswIndexAndSearch() throws Exception { + public void testInt7HnswIndexAndSearch() throws Exception { IndexWriterConfig indexWriterConfig = newIndexWriterConfig(new MockAnalyzer(random())) 
.setOpenMode(IndexWriterConfig.OpenMode.APPEND) @@ -108,7 +107,6 @@ public void testInt8HnswIndexAndSearch() throws Exception { assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0"); } } - // This will confirm the docs are really sorted TestUtil.checkIndex(directory); } @@ -117,7 +115,7 @@ protected void createIndex(Directory dir) throws IOException { IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random())) .setMaxBufferedDocs(10) - .setCodec(TestUtil.getDefaultCodec()) + .setCodec(getCodec()) .setMergePolicy(NoMergePolicy.INSTANCE); try (IndexWriter writer = new IndexWriter(dir, conf)) { for (int i = 0; i < DOC_COUNT; i++) { @@ -147,4 +145,29 @@ public void testReadOldIndices() throws Exception { assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0"); } } + + // #13880: make sure the BWC index really contains quantized HNSW not float32 + public void testIndexIsReallyQuantized() throws Exception { + try (DirectoryReader reader = DirectoryReader.open(directory)) { + for (LeafReaderContext leafContext : reader.leaves()) { + KnnVectorsReader knnVectorsReader = ((CodecReader) leafContext.reader()).getVectorReader(); + assertTrue( + "expected PerFieldKnnVectorsFormat.FieldsReader but got: " + knnVectorsReader, + knnVectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader); + + KnnVectorsReader forField = + ((PerFieldKnnVectorsFormat.FieldsReader) knnVectorsReader) + .getFieldReader(KNN_VECTOR_FIELD); + + assertTrue(forField instanceof Lucene99HnswVectorsReader); + + QuantizedByteVectorValues quantized = + ((Lucene99HnswVectorsReader) forField).getQuantizedVectorValues(KNN_VECTOR_FIELD); + + assertNotNull( + "KnnVectorsReader should have quantized interface for field " + KNN_VECTOR_FIELD, + quantized); + } + } + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestMoreTermsBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestMoreTermsBackwardsCompatibility.java index 6bacb49dd652..6b33eeb5add9 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestMoreTermsBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestMoreTermsBackwardsCompatibility.java @@ -31,13 +31,15 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.LineFileDocs; +import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; +@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/13847") public class TestMoreTermsBackwardsCompatibility extends BackwardsCompatibilityTestBase { - static final String INDEX_NAME = "moreterms"; + static final String INDEX_NAME = "unsupported.moreterms"; static final String SUFFIX = ""; @@ -48,7 +50,7 @@ public TestMoreTermsBackwardsCompatibility(Version version, String pattern) { @ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s") public static Iterable testVersionsFactory() { List params = new ArrayList<>(); - params.add(new Object[] {Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)}); + params.add(new Object[] {Version.fromBits(9, 0, 0), createPattern(INDEX_NAME, SUFFIX)}); return params; } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.10.0.0.zip new file 
mode 100644 index 000000000000..db5d5260bcc3 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.10.0.0.zip new file mode 100644 index 000000000000..d906538645b7 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-cfs.zip new file mode 100644 index 000000000000..73c79500c855 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-cfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-nocfs.zip new file mode 100644 index 000000000000..d8b8216c6396 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-nocfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.0.0.zip new file mode 100644 index 000000000000..99a28f7631cc Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.10.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.10.0.zip deleted file mode 100644 index 2799f04b65a8..000000000000 Binary files a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.10.0.zip and /dev/null differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.0.zip deleted file mode 100644 index 5fd94783427b..000000000000 Binary files a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.0.zip and /dev/null differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.1.zip deleted file mode 100644 index c4bb86b5f1b6..000000000000 Binary files a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.1.zip and /dev/null differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.10.0.0.zip new file mode 100644 index 000000000000..6ee086756ccf Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.10.0.0.zip new file mode 100644 index 000000000000..e08962568967 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.12895.9.8.0.zip 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.12895.9.8.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.12895.9.8.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.12895.9.8.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-cfs.zip new file mode 100644 index 000000000000..bb3e4f017530 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-cfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-nocfs.zip new file mode 100644 index 000000000000..a19fa717096d Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-nocfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.0.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.0.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.0.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.0.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.0.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.0.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.0.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.0.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.1.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.1.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.1.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.1.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.1.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.1.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.1.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.1.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.10.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.10.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.10.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-nocfs.zip rename to 
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.10.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.1-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.1-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.1-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.1-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.1-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.1-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.1-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.1-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-cfs.zip new file mode 100644 index 000000000000..6fc0118f222b Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-cfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-nocfs.zip new file mode 100644 index 000000000000..56b5c1325c88 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-nocfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.2.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.2.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.2.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.2.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.2.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.2.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.2.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.2.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.3.0-cfs.zip 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.3.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.3.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.3.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.3.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.3.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.3.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.3.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.1-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.1-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.1-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.1-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.1-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.1-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.1-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.1-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.2-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.2-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.2-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.2-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.2-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.2-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.2-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.2-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.5.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.5.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.5.0-cfs.zip rename to 
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.5.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.5.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.5.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.5.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.5.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.6.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.6.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.6.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.6.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.6.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.6.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.6.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.6.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.7.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.7.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.7.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.7.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.7.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.7.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.7.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.7.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.8.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.8.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.8.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.8.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.8.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.8.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.8.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.8.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.0-nocfs.zip 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.1-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.1-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.1-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.1-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.1-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.1-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.1-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.1-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.2-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.2-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.2-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.2-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.2-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.2-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.2-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.2-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.9.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.dvupdates.9.0.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.9.0.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.dvupdates.9.0.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.9.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.empty.9.0.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.9.0.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.empty.9.0.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.10.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.10.0.zip new file mode 100644 index 000000000000..0425b451fa0c Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.10.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.0.zip new file mode 100644 index 000000000000..9dd53d92a993 Binary files /dev/null and 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.1.zip new file mode 100644 index 000000000000..29aef1b909fd Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.1.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.12.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.12.0.zip new file mode 100644 index 000000000000..bfe07de81437 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.12.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.9.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.moreterms.9.0.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.9.0.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.moreterms.9.0.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.8.11.4.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.8.11.4.zip new file mode 100644 index 000000000000..9736c6aca981 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.8.11.4.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.0.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.0.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.0.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.1.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.1.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.1.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.1.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.10.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.10.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.10.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.10.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.11.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.11.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.11.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.11.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.11.1.zip 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.11.1.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.11.1.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.11.1.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.12.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.12.0.zip new file mode 100644 index 000000000000..9ad1590e3e40 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.12.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.2.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.2.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.2.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.2.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.3.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.3.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.3.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.3.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.1.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.1.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.1.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.2.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.2.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.2.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.2.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.5.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.5.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.5.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.5.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.6.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.6.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.6.0.zip rename to 
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.6.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.7.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.7.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.7.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.7.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.8.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.8.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.8.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.8.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.1.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.1.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.1.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.2.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.2.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.2.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.2.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported_versions.txt b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported_versions.txt index 8f298d3ae05a..521f12c28042 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported_versions.txt +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported_versions.txt @@ -119,4 +119,23 @@ 8.11.0 8.11.1 8.11.2 -8.11.3 \ No newline at end of file +8.11.3 +8.11.4 +9.0.0 +9.1.0 +9.2.0 +9.3.0 +9.4.0 +9.4.1 +9.4.2 +9.5.0 +9.6.0 +9.7.0 +9.8.0 +9.9.0 +9.9.1 +9.9.2 +9.10.0 +9.11.0 +9.11.1 +9.12.0 \ No newline at end of file diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt index 4572b6fadfe4..7529186caca2 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt @@ -1,29 +1,3 @@ -8.0.0 -8.1.0 -8.1.1 -8.2.0 -8.3.0 -8.3.1 -8.4.0 -8.4.1 -8.5.0 -8.5.1 -8.5.2 -8.6.0 -8.6.1 -8.6.2 -8.6.3 -8.7.0 -8.8.0 -8.8.1 -8.8.2 -8.9.0 -8.10.0 -8.10.1 -8.11.0 -8.11.1 -8.11.2 -8.11.3 9.0.0 9.1.0 9.2.0 @@ -41,3 +15,5 @@ 9.10.0 9.11.0 9.11.1 +9.12.0 +10.0.0 \ No newline at end of file diff --git 
a/lucene/benchmark-jmh/src/java/module-info.java b/lucene/benchmark-jmh/src/java/module-info.java index d92164cfae1d..019e08abce1b 100644 --- a/lucene/benchmark-jmh/src/java/module-info.java +++ b/lucene/benchmark-jmh/src/java/module-info.java @@ -16,6 +16,9 @@ */ /** Lucene JMH benchmarks. */ + +// jmh.core is not modularized and causes a warning. Suppressing it until it is modularized. +@SuppressWarnings("requires-automatic") module org.apache.lucene.benchmark.jmh { requires jmh.core; requires jdk.unsupported; diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/AdvanceBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/AdvanceBenchmark.java new file mode 100644 index 000000000000..784ace1ae321 --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/AdvanceBenchmark.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.jmh; + +import java.util.Arrays; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.VectorUtil; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.CompilerControl; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 5, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork( + value = 3, + jvmArgsAppend = { + "-Xmx1g", + "-Xms1g", + "-XX:+AlwaysPreTouch", + "--add-modules", + "jdk.incubator.vector" + }) +public class AdvanceBenchmark { + + private final int[] values = new int[129]; + private final int[] startIndexes = new int[1_000]; + private final int[] targets = new int[startIndexes.length]; + + @Setup(Level.Trial) + public void setup() throws Exception { + for (int i = 0; i < 128; ++i) { + values[i] = i; + } + values[128] = DocIdSetIterator.NO_MORE_DOCS; + Random r = new Random(0); + for (int i = 0; i < startIndexes.length; ++i) { + startIndexes[i] = r.nextInt(64); + targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7)); + } + } + + @Benchmark + public void binarySearch() { + for (int i = 0; i < startIndexes.length; ++i) { + binarySearch(values, targets[i], startIndexes[i]); + } + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + 
private static int binarySearch(int[] values, int target, int startIndex) { + // Standard binary search + int i = Arrays.binarySearch(values, startIndex, values.length, target); + if (i < 0) { + i = -1 - i; + } + return i; + } + + @Benchmark + public void inlinedBranchlessBinarySearch() { + for (int i = 0; i < targets.length; ++i) { + inlinedBranchlessBinarySearch(values, targets[i]); + } + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + private static int inlinedBranchlessBinarySearch(int[] values, int target) { + // This compiles to cmov instructions. + int start = 0; + + if (values[63] < target) { + start += 64; + } + if (values[start + 31] < target) { + start += 32; + } + if (values[start + 15] < target) { + start += 16; + } + if (values[start + 7] < target) { + start += 8; + } + if (values[start + 3] < target) { + start += 4; + } + if (values[start + 1] < target) { + start += 2; + } + if (values[start] < target) { + start += 1; + } + + return start; + } + + @Benchmark + public void linearSearch() { + for (int i = 0; i < startIndexes.length; ++i) { + linearSearch(values, targets[i], startIndexes[i]); + } + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + private static int linearSearch(int[] values, long target, int startIndex) { + // Naive linear search. + for (int i = startIndex; i < values.length; ++i) { + if (values[i] >= target) { + return i; + } + } + return values.length; + } + + @Benchmark + public void vectorUtilSearch() { + for (int i = 0; i < startIndexes.length; ++i) { + VectorUtil.findNextGEQ(values, targets[i], startIndexes[i], 128); + } + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + private static int vectorUtilSearch(int[] values, int target, int startIndex) { + return VectorUtil.findNextGEQ(values, target, startIndex, 128); + } + + private static void assertEquals(int expected, int actual) { + if (expected != actual) { + throw new AssertionError("Expected: " + expected + ", got " + actual); + } + } + + public static void main(String[] args) { + // For testing purposes + int[] values = new int[129]; + for (int i = 0; i < 128; ++i) { + values[i] = i; + } + values[128] = DocIdSetIterator.NO_MORE_DOCS; + for (int start = 0; start < 128; ++start) { + for (int targetIndex = start; targetIndex < 128; ++targetIndex) { + int actualIndex = binarySearch(values, values[targetIndex], start); + assertEquals(targetIndex, actualIndex); + actualIndex = inlinedBranchlessBinarySearch(values, values[targetIndex]); + assertEquals(targetIndex, actualIndex); + actualIndex = linearSearch(values, values[targetIndex], start); + assertEquals(targetIndex, actualIndex); + actualIndex = vectorUtilSearch(values, values[targetIndex], start); + assertEquals(targetIndex, actualIndex); + } + } + } +} diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java index 0df0d7ecf504..48b955706944 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java @@ -186,7 +186,7 @@ public void benchMMapDirectoryInputs_readVInt(Blackhole bh) throws IOException { @Benchmark public void benchMMapDirectoryInputs_readGroupVInt(Blackhole bh) throws IOException { byteBufferGVIntIn.seek(0); - byteBufferGVIntIn.readGroupVInts(values, size); + GroupVIntUtil.readGroupVInts(byteBufferGVIntIn, values, size); bh.consume(values); } @@ -209,14 
+209,14 @@ public void benchByteArrayDataInput_readVInt(Blackhole bh) { @Benchmark public void benchByteArrayDataInput_readGroupVInt(Blackhole bh) throws IOException { byteArrayGVIntIn.rewind(); - byteArrayGVIntIn.readGroupVInts(values, size); + GroupVIntUtil.readGroupVInts(byteArrayGVIntIn, values, size); bh.consume(values); } @Benchmark public void benchNIOFSDirectoryInputs_readGroupVInt(Blackhole bh) throws IOException { nioGVIntIn.seek(0); - nioGVIntIn.readGroupVInts(values, size); + GroupVIntUtil.readGroupVInts(nioGVIntIn, values, size); bh.consume(values); } @@ -230,7 +230,7 @@ public void benchNIOFSDirectoryInputs_readGroupVIntBaseline(Blackhole bh) throws @Benchmark public void benchByteBuffersIndexInput_readGroupVInt(Blackhole bh) throws IOException { byteBuffersGVIntIn.seek(0); - byteBuffersGVIntIn.readGroupVInts(values, size); + GroupVIntUtil.readGroupVInts(byteBuffersGVIntIn, values, size); bh.consume(values); } diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java new file mode 100644 index 000000000000..241b289c5f61 --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.jmh; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.codecs.lucene101.ForDeltaUtil; +import org.apache.lucene.codecs.lucene101.ForUtil; +import org.apache.lucene.codecs.lucene101.PostingIndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.util.IOUtils; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 5, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork( + value = 3, + jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"}) +public class PostingIndexInputBenchmark { + + private Path path; + private Directory dir; + private IndexInput in; + private PostingIndexInput postingIn; + private final ForUtil forUtil = new ForUtil(); + private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + private final int[] values = new int[ForUtil.BLOCK_SIZE]; + + @Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"}) + public int bpv; + + @Setup(Level.Trial) + public void setup() throws Exception { + path = Files.createTempDirectory("forUtil"); + dir = MMapDirectory.open(path); + try (IndexOutput out = dir.createOutput("docs", IOContext.DEFAULT)) { + Random r = new Random(0); + // Write enough random data to not reach EOF while decoding + for (int i = 0; i < 100; ++i) { + out.writeLong(r.nextLong()); + } + } + in = dir.openInput("docs", IOContext.DEFAULT); + postingIn = new PostingIndexInput(in, forUtil, forDeltaUtil); + } + + @TearDown(Level.Trial) + public void tearDown() throws Exception { + if (dir != null) { + dir.deleteFile("docs"); + } + IOUtils.close(in, dir); + in = null; + dir = null; + Files.deleteIfExists(path); + } + + @Benchmark + public void decode(Blackhole bh) throws IOException { + in.seek(3); // random unaligned offset + postingIn.decode(bpv, values); + bh.consume(values); + } + + @Benchmark + public void decodeAndPrefixSum(Blackhole bh) throws IOException { + in.seek(3); // random unaligned offset + postingIn.decodeAndPrefixSum(bpv, 100, values); + bh.consume(values); + } +} diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index c4d3040f2835..0a4da1f48867 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -25,6 +25,7 @@ import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; import 
org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -32,7 +33,6 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.openjdk.jmh.annotations.*; @@ -55,7 +55,7 @@ public class VectorScorerBenchmark { Directory dir; IndexInput in; - RandomAccessVectorValues vectorValues; + KnnVectorValues vectorValues; byte[] vec1, vec2; RandomVectorScorer scorer; @@ -95,7 +95,7 @@ public float binaryDotProductMemSeg() throws IOException { return scorer.score(1); } - static RandomAccessVectorValues vectorValues( + static KnnVectorValues vectorValues( int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { return new OffHeapByteVectorValues.DenseOffHeapVectorValues( dims, size, in.slice("test", 0, in.length()), dims, new ThrowingFlatVectorScorer(), sim); @@ -105,23 +105,19 @@ static final class ThrowingFlatVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) { + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) { throw new UnsupportedOperationException(); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) { + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) { throw new UnsupportedOperationException(); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) { + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) { throw new UnsupportedOperationException(); } } diff --git a/lucene/benchmark/conf/collector-small.alg b/lucene/benchmark/conf/collector-small.alg index e57ee8646b11..7d312d4f2434 100644 --- a/lucene/benchmark/conf/collector-small.alg +++ b/lucene/benchmark/conf/collector-small.alg @@ -17,11 +17,10 @@ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
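Commentary on the VectorScorerBenchmark hunk above: the scorer plumbing moves from RandomAccessVectorValues to KnnVectorValues, so every FlatVectorsScorer factory method now accepts KnnVectorValues. The following sketch is illustrative only and not part of the patch; it assumes RandomVectorScorer exposes score(int) and maxOrd() as abstract methods (as in recent Lucene versions), and it returns a constant score purely to show the new signatures.

```java
import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.hnsw.RandomVectorScorer;
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;

// Hypothetical example class, not part of the patch: a trivial scorer written
// against the KnnVectorValues-based signatures shown in the hunk above.
final class ConstantScoreFlatVectorsScorer implements FlatVectorsScorer {

  @Override
  public RandomVectorScorerSupplier getRandomVectorScorerSupplier(
      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) {
    throw new UnsupportedOperationException("suppliers are not needed for this sketch");
  }

  @Override
  public RandomVectorScorer getRandomVectorScorer(
      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) {
    return constantScorer(vectorValues);
  }

  @Override
  public RandomVectorScorer getRandomVectorScorer(
      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) {
    return constantScorer(vectorValues);
  }

  private static RandomVectorScorer constantScorer(KnnVectorValues vectorValues) {
    return new RandomVectorScorer() {
      @Override
      public float score(int node) {
        return 1f; // every candidate ordinal gets the same score
      }

      @Override
      public int maxOrd() {
        return vectorValues.size(); // ordinals are bounded by the number of vectors
      }
    };
  }
}
```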
-# collector.class can be: -# Fully Qualified Class Name of a Collector with a empty constructor -# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs -# topScoreDocUnordered - Like above, but allows out of order -collector.class=coll:topScoreDoc +# collector.manager.class can be: +# Fully Qualified Class Name of a CollectorManager with a empty constructor +# topScoreDoc - Creates a TopScoreDocCollectorManager +collector.manager.class=coll:topScoreDoc analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer directory=FSDirectory diff --git a/lucene/benchmark/conf/collector.alg b/lucene/benchmark/conf/collector.alg index e2843492dcab..96873b9402f9 100644 --- a/lucene/benchmark/conf/collector.alg +++ b/lucene/benchmark/conf/collector.alg @@ -17,11 +17,10 @@ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. -# collector.class can be: -# Fully Qualified Class Name of a Collector with a empty constructor -# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs -# topScoreDocUnordered - Like above, but allows out of order -collector.class=coll:topScoreDoc +# collector.manager.class can be: +# Fully Qualified Class Name of a CollectorManager with a empty constructor +# topScoreDoc - Creates a TopScoreDocCollectorManager +collector.manager.class=coll:topScoreDoc analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer directory=FSDirectory diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java index cd0daa42d4bf..b439a75a934e 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java @@ -238,7 +238,7 @@ public void startElement( time = null; id = null; break; - // intentional fall-through. + // intentional fall-through. 
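Commentary on the two .alg hunks above: collector.class is replaced by collector.manager.class because ReadTask and SearchWithCollectorTask (changed further below) now build a CollectorManager rather than a raw Collector. A minimal sketch of what the topScoreDoc setting resolves to at search time; the helper class and method names here are illustrative, not part of the patch.

```java
import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollectorManager;

// Hypothetical helper, not part of the patch.
final class TopScoreDocManagerExample {
  static TopDocs search(IndexSearcher searcher, Query query, int numHits) throws IOException {
    // The manager creates one Collector per index slice and reduces the
    // partial results into a single TopDocs, which is what the old
    // single-Collector code path produced.
    TopScoreDocCollectorManager manager =
        new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE);
    return searcher.search(query, manager);
  }
}
```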
case BODY: case DATE: case TITLE: diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SpatialDocMaker.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SpatialDocMaker.java index 968f83b56e82..5bd00a5d8658 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SpatialDocMaker.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SpatialDocMaker.java @@ -99,7 +99,7 @@ protected SpatialStrategy makeSpatialStrategy( return makeRPTStrategy(SPATIAL_FIELD, config, configMap, ctx); case "composite": return makeCompositeStrategy(config, configMap, ctx); - // TODO add more as-needed + // TODO add more as-needed default: throw new IllegalStateException("Unknown spatial.strategy: " + strategyName); } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java index 58cf8e79efae..9940196ce479 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java @@ -24,7 +24,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiBits; import org.apache.lucene.index.StoredFields; -import org.apache.lucene.search.Collector; +import org.apache.lucene.search.CollectorManager; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -112,16 +112,13 @@ public int doLogic() throws Exception { // pulling the Weight ourselves: int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1; TopFieldCollectorManager collectorManager = - new TopFieldCollectorManager( - sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1); + new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold); hits = searcher.search(q, collectorManager); } else { hits = searcher.search(q, numHits); } } else { - Collector collector = createCollector(); - - searcher.search(q, collector); + searcher.search(q, createCollectorManager()); // hits = collector.topDocs(); } @@ -184,9 +181,8 @@ protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws return res; } - protected Collector createCollector() throws Exception { - return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1) - .newCollector(); + protected CollectorManager createCollectorManager() throws Exception { + return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? 
Integer.MAX_VALUE : 1); } protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException { diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithCollectorTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithCollectorTask.java index f41994f48d63..c753ccde65a4 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithCollectorTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithCollectorTask.java @@ -19,7 +19,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.search.Collector; +import org.apache.lucene.search.CollectorManager; import org.apache.lucene.search.TopScoreDocCollectorManager; /** Does search w/ a custom collector */ @@ -37,7 +37,11 @@ public void setup() throws Exception { // check to make sure either the doc is being stored PerfRunData runData = getRunData(); Config config = runData.getConfig(); - clnName = config.get("collector.class", ""); + if (config.get("collector.class", null) != null) { + throw new IllegalArgumentException( + "collector.class is no longer supported as a config parameter, use collector.manager.class instead to provide a CollectorManager class name"); + } + clnName = config.get("collector.manager.class", ""); } @Override @@ -46,18 +50,17 @@ public boolean withCollector() { } @Override - protected Collector createCollector() throws Exception { - Collector collector = null; + protected CollectorManager createCollectorManager() throws Exception { + CollectorManager collectorManager; if (clnName.equalsIgnoreCase("topScoreDoc") == true) { - collector = - new TopScoreDocCollectorManager(numHits(), null, Integer.MAX_VALUE, false).newCollector(); + collectorManager = new TopScoreDocCollectorManager(numHits(), Integer.MAX_VALUE); } else if (clnName.length() > 0) { - collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance(); - + collectorManager = + Class.forName(clnName).asSubclass(CollectorManager.class).getConstructor().newInstance(); } else { - collector = super.createCollector(); + collectorManager = super.createCollectorManager(); } - return collector; + return collectorManager; } @Override diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestDocMaker.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestDocMaker.java index b6da2d2f404c..eb8101eeb9bb 100644 --- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestDocMaker.java +++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestDocMaker.java @@ -91,7 +91,7 @@ private void doTestIndexProperties( IndexReader reader = DirectoryReader.open(runData.getDirectory()); IndexSearcher searcher = newSearcher(reader); TopDocs td = searcher.search(new TermQuery(new Term("key", "value")), 10); - assertEquals(numExpectedResults, td.totalHits.value); + assertEquals(numExpectedResults, td.totalHits.value()); reader.close(); } diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestLineDocSource.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestLineDocSource.java index 393d352abf2c..a060db7a08c7 100644 --- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestLineDocSource.java +++ 
b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestLineDocSource.java @@ -160,7 +160,7 @@ private void doIndexAndSearchTestWithRepeats( reader = DirectoryReader.open(runData.getDirectory()); searcher = newSearcher(reader); TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10); - assertEquals(numAdds, td.totalHits.value); + assertEquals(numAdds, td.totalHits.value()); assertNotNull(td.scoreDocs[0]); if (storedField == null) { diff --git a/lucene/classification/src/java/org/apache/lucene/classification/BM25NBClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/BM25NBClassifier.java index 40e773381921..1f56d19a7141 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/BM25NBClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/BM25NBClassifier.java @@ -151,13 +151,13 @@ private ArrayList> normClassificationResults( if (!assignedClasses.isEmpty()) { Collections.sort(assignedClasses); // this is a negative number closest to 0 = a - double smax = assignedClasses.get(0).getScore(); + double smax = assignedClasses.get(0).score(); double sumLog = 0; // log(sum(exp(x_n-a))) for (ClassificationResult cr : assignedClasses) { // getScore-smax <=0 (both negative, smax is the smallest abs() - sumLog += Math.exp(cr.getScore() - smax); + sumLog += Math.exp(cr.score() - smax); } // loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n))) double loga = smax; @@ -165,8 +165,8 @@ private ArrayList> normClassificationResults( // 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum)) for (ClassificationResult cr : assignedClasses) { - double scoreDiff = cr.getScore() - loga; - returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff))); + double scoreDiff = cr.score() - loga; + returnList.add(new ClassificationResult<>(cr.assignedClass(), Math.exp(scoreDiff))); } } return returnList; @@ -216,7 +216,7 @@ private double getTermProbForClass(Term classTerm, String... words) throws IOExc builder.add(query, BooleanClause.Occur.MUST); } TopDocs search = indexSearcher.search(builder.build(), 1); - return search.totalHits.value > 0 ? search.scoreDocs[0].score : 1; + return search.totalHits.value() > 0 ? search.scoreDocs[0].score : 1; } private double calculateLogPrior(Term term) throws IOException { @@ -227,6 +227,6 @@ private double calculateLogPrior(Term term) throws IOException { bq.add(query, BooleanClause.Occur.MUST); } TopDocs topDocs = indexSearcher.search(bq.build(), 1); - return topDocs.totalHits.value > 0 ? Math.log(topDocs.scoreDocs[0].score) : 0; + return topDocs.totalHits.value() > 0 ? 
Math.log(topDocs.scoreDocs[0].score) : 0; } } diff --git a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java index e4c4ccb1a958..d45bcb1f813c 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java @@ -148,7 +148,7 @@ public BooleanPerceptronClassifier( if (textField != null && classField != null) { // assign class to the doc ClassificationResult classificationResult = assignClass(textField.stringValue()); - Boolean assignedClass = classificationResult.getAssignedClass(); + Boolean assignedClass = classificationResult.assignedClass(); Boolean correctClass = Boolean.valueOf(classField.stringValue()); double modifier = Math.signum(correctClass.compareTo(assignedClass)); diff --git a/lucene/classification/src/java/org/apache/lucene/classification/CachingNaiveBayesClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/CachingNaiveBayesClassifier.java index deff4202167a..f251b4d61b03 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/CachingNaiveBayesClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/CachingNaiveBayesClassifier.java @@ -126,7 +126,7 @@ private List> calculateLogLikelihood(String[] tok int removeIdx = -1; int i = 0; for (ClassificationResult cr : ret) { - if (cr.getAssignedClass().equals(cclass)) { + if (cr.assignedClass().equals(cclass)) { removeIdx = i; break; } @@ -137,7 +137,7 @@ private List> calculateLogLikelihood(String[] tok ClassificationResult toRemove = ret.get(removeIdx); ret.add( new ClassificationResult<>( - toRemove.getAssignedClass(), toRemove.getScore() + Math.log(wordProbability))); + toRemove.assignedClass(), toRemove.score() + Math.log(wordProbability))); ret.remove(removeIdx); } } diff --git a/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java b/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java index 87836d72dbdc..c6acf47d7a7f 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java @@ -20,44 +20,15 @@ * The result of a call to {@link Classifier#assignClass(String)} holding an assigned class of type * T and a score. 
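A quick illustration of the accessor rename that the classification changes above rely on: ClassificationResult becomes a record, so call sites use assignedClass()/score() instead of getAssignedClass()/getScore(), while compareTo() still orders results with the highest score first. The class name and values below are made up for the sketch:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.classification.ClassificationResult;

class ClassificationResultSketch {
  static double bestScore() {
    // arbitrary sample scores
    ClassificationResult<Boolean> yes = new ClassificationResult<>(Boolean.TRUE, -0.1);
    ClassificationResult<Boolean> no = new ClassificationResult<>(Boolean.FALSE, -0.7);
    List<ClassificationResult<Boolean>> results = new ArrayList<>(List.of(no, yes));
    Collections.sort(results);                      // compareTo() still sorts best score first
    Boolean best = results.get(0).assignedClass();  // was getAssignedClass()
    assert Boolean.TRUE.equals(best);
    return results.get(0).score();                  // was getScore()
  }
}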
* + * @param assignedClass the class T assigned by a {@link Classifier} + * @param score score the score for the assignedClass as a double * @lucene.experimental */ -public class ClassificationResult implements Comparable> { - - private final T assignedClass; - private final double score; - - /** - * Constructor - * - * @param assignedClass the class T assigned by a {@link Classifier} - * @param score the score for the assignedClass as a double - */ - public ClassificationResult(T assignedClass, double score) { - this.assignedClass = assignedClass; - this.score = score; - } - - /** - * retrieve the result class - * - * @return a T representing an assigned class - */ - public T getAssignedClass() { - return assignedClass; - } - - /** - * retrieve the result score - * - * @return a double representing a result score - */ - public double getScore() { - return score; - } +public record ClassificationResult(T assignedClass, double score) + implements Comparable> { @Override public int compareTo(ClassificationResult o) { - return Double.compare(o.getScore(), this.getScore()); + return Double.compare(o.score(), this.score()); } } diff --git a/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java index 235ed2c0d6b1..ae3e5aa2b976 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java @@ -108,9 +108,9 @@ public ClassificationResult assignClass(String text) throws IOExceptio ClassificationResult assignedClass = null; double maxscore = -Double.MAX_VALUE; for (ClassificationResult cl : assignedClasses) { - if (cl.getScore() > maxscore) { + if (cl.score() > maxscore) { assignedClass = cl; - maxscore = cl.getScore(); + maxscore = cl.score(); } } return assignedClass; @@ -159,7 +159,7 @@ private List> buildListFromTopDocs(TopDocs topDoc Map classCounts = new HashMap<>(); Map classBoosts = new HashMap<>(); // this is a boost based on class ranking positions in topDocs - float maxScore = topDocs.totalHits.value == 0 ? Float.NaN : topDocs.scoreDocs[0].score; + float maxScore = topDocs.totalHits.value() == 0 ? 
Float.NaN : topDocs.scoreDocs[0].score; StoredFields storedFields = indexSearcher.storedFields(); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { IndexableField storableField = storedFields.document(scoreDoc.doc).getField(classFieldName); @@ -193,7 +193,7 @@ private List> buildListFromTopDocs(TopDocs topDoc if (sumdoc < k) { for (ClassificationResult cr : temporaryList) { returnList.add( - new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc)); + new ClassificationResult<>(cr.assignedClass(), cr.score() * k / (double) sumdoc)); } } else { returnList = temporaryList; diff --git a/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java index 48424d4f0edd..2dba8bfd6323 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java @@ -129,9 +129,9 @@ protected ClassificationResult classifyFromTopDocs(TopDocs knnResults) ClassificationResult assignedClass = null; double maxscore = -Double.MAX_VALUE; for (ClassificationResult cl : assignedClasses) { - if (cl.getScore() > maxscore) { + if (cl.score() > maxscore) { assignedClass = cl; - maxscore = cl.getScore(); + maxscore = cl.score(); } } return assignedClass; @@ -192,7 +192,7 @@ protected List> buildListFromTopDocs(TopDocs topD Map classCounts = new HashMap<>(); Map classBoosts = new HashMap<>(); // this is a boost based on class ranking positions in topDocs - float maxScore = topDocs.totalHits.value == 0 ? Float.NaN : topDocs.scoreDocs[0].score; + float maxScore = topDocs.totalHits.value() == 0 ? 
Float.NaN : topDocs.scoreDocs[0].score; StoredFields storedFields = indexSearcher.storedFields(); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { IndexableField[] storableFields = @@ -229,7 +229,7 @@ protected List> buildListFromTopDocs(TopDocs topD if (sumdoc < k) { for (ClassificationResult cr : temporaryList) { returnList.add( - new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc)); + new ClassificationResult<>(cr.assignedClass(), cr.score() * k / (double) sumdoc)); } } else { returnList = temporaryList; diff --git a/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java index 668c6499f413..9897f95c6a94 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java @@ -105,9 +105,9 @@ public ClassificationResult assignClass(String inputDocument) throws I ClassificationResult assignedClass = null; double maxscore = -Double.MAX_VALUE; for (ClassificationResult c : assignedClasses) { - if (c.getScore() > maxscore) { + if (c.score() > maxscore) { assignedClass = c; - maxscore = c.getScore(); + maxscore = c.score(); } } return assignedClass; @@ -297,13 +297,13 @@ protected ArrayList> normClassificationResults( if (!assignedClasses.isEmpty()) { Collections.sort(assignedClasses); // this is a negative number closest to 0 = a - double smax = assignedClasses.get(0).getScore(); + double smax = assignedClasses.get(0).score(); double sumLog = 0; // log(sum(exp(x_n-a))) for (ClassificationResult cr : assignedClasses) { // getScore-smax <=0 (both negative, smax is the smallest abs() - sumLog += Math.exp(cr.getScore() - smax); + sumLog += Math.exp(cr.score() - smax); } // loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n))) double loga = smax; @@ -311,8 +311,8 @@ protected ArrayList> normClassificationResults( // 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum)) for (ClassificationResult cr : assignedClasses) { - double scoreDiff = cr.getScore() - loga; - returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff))); + double scoreDiff = cr.score() - loga; + returnList.add(new ClassificationResult<>(cr.assignedClass(), Math.exp(scoreDiff))); } } return returnList; diff --git a/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java index d8ff07d8fd72..1c34d870935f 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java @@ -80,9 +80,9 @@ public ClassificationResult assignClass(Document document) throws IOEx ClassificationResult assignedClass = null; double maxscore = -Double.MAX_VALUE; for (ClassificationResult c : assignedClasses) { - if (c.getScore() > maxscore) { + if (c.score() > maxscore) { assignedClass = c; - maxscore = c.getScore(); + maxscore = c.score(); } } return assignedClass; diff --git a/lucene/classification/src/java/org/apache/lucene/classification/utils/ConfusionMatrixGenerator.java b/lucene/classification/src/java/org/apache/lucene/classification/utils/ConfusionMatrixGenerator.java index 
4994115b0a34..fb9ff922296a 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/utils/ConfusionMatrixGenerator.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/utils/ConfusionMatrixGenerator.java @@ -107,7 +107,7 @@ public static ConfusionMatrix getConfusionMatrix( time += end - start; if (result != null) { - T assignedClass = result.getAssignedClass(); + T assignedClass = result.assignedClass(); if (assignedClass != null) { counter++; String classified = diff --git a/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java b/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java index ecc2d3cbd02c..2e6ca311140b 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java @@ -138,13 +138,13 @@ public void split( // iterate over existing documents StoredFields storedFields = originalIndex.storedFields(); for (GroupDocs group : topGroups.groups) { - assert group.totalHits.relation == TotalHits.Relation.EQUAL_TO; - long totalHits = group.totalHits.value; + assert group.totalHits().relation() == TotalHits.Relation.EQUAL_TO; + long totalHits = group.totalHits().value(); double testSize = totalHits * testRatio; int tc = 0; double cvSize = totalHits * crossValidationRatio; int cvc = 0; - for (ScoreDoc scoreDoc : group.scoreDocs) { + for (ScoreDoc scoreDoc : group.scoreDocs()) { // create a new document for indexing Document doc = createNewDoc(storedFields, ft, scoreDoc, fieldNames); diff --git a/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java b/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java index 988a6eada9f5..410ef8874c96 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java @@ -91,7 +91,7 @@ protected ClassificationResult checkCorrectClassification( Classifier classifier, String inputDoc, T expectedResult) throws Exception { ClassificationResult classificationResult = classifier.assignClass(inputDoc); assertNotNull(classificationResult); - T assignedClass = classificationResult.getAssignedClass(); + T assignedClass = classificationResult.assignedClass(); assertNotNull(assignedClass); assertEquals( "got an assigned class of " + assignedClass, @@ -101,7 +101,7 @@ protected ClassificationResult checkCorrectClassification( assignedClass instanceof BytesRef ? 
((BytesRef) assignedClass).utf8ToString() : assignedClass); - double score = classificationResult.getScore(); + double score = classificationResult.score(); assertTrue("score should be between 0 and 1, got:" + score, score <= 1 && score >= 0); return classificationResult; } @@ -130,18 +130,17 @@ protected void checkOnlineClassification( getSampleIndex(analyzer); ClassificationResult classificationResult = classifier.assignClass(inputDoc); - assertNotNull(classificationResult.getAssignedClass()); + assertNotNull(classificationResult.assignedClass()); assertEquals( - "got an assigned class of " + classificationResult.getAssignedClass(), + "got an assigned class of " + classificationResult.assignedClass(), expectedResult, - classificationResult.getAssignedClass()); - double score = classificationResult.getScore(); + classificationResult.assignedClass()); + double score = classificationResult.score(); assertTrue("score should be between 0 and 1, got: " + score, score <= 1 && score >= 0); updateSampleIndex(); ClassificationResult secondClassificationResult = classifier.assignClass(inputDoc); - assertEquals( - classificationResult.getAssignedClass(), secondClassificationResult.getAssignedClass()); - assertEquals(Double.valueOf(score), Double.valueOf(secondClassificationResult.getScore())); + assertEquals(classificationResult.assignedClass(), secondClassificationResult.assignedClass()); + assertEquals(Double.valueOf(score), Double.valueOf(secondClassificationResult.score())); } protected LeafReader getSampleIndex(Analyzer analyzer) throws IOException { diff --git a/lucene/classification/src/test/org/apache/lucene/classification/TestKNearestNeighborClassifier.java b/lucene/classification/src/test/org/apache/lucene/classification/TestKNearestNeighborClassifier.java index f437dc575565..073703ae61e9 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/TestKNearestNeighborClassifier.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/TestKNearestNeighborClassifier.java @@ -88,7 +88,7 @@ public void testBasicUsage() throws Exception { textFieldName), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT); - assertTrue(resultDS.getScore() != resultLMS.getScore()); + assertTrue(resultDS.score() != resultLMS.score()); } finally { IOUtils.close(leafReader); } @@ -113,7 +113,7 @@ public void testRankedClasses() throws Exception { leafReader, null, analyzer, null, 6, 1, 1, categoryFieldName, textFieldName); List> classes = knnClassifier.getClasses(STRONG_TECHNOLOGY_INPUT); - assertTrue(classes.get(0).getScore() > classes.get(1).getScore()); + assertTrue(classes.get(0).score() > classes.get(1).score()); checkCorrectClassification(knnClassifier, STRONG_TECHNOLOGY_INPUT, TECHNOLOGY_RESULT); } finally { IOUtils.close(leafReader); @@ -139,7 +139,7 @@ public void testUnbalancedClasses() throws Exception { leafReader, null, analyzer, null, 3, 1, 1, categoryFieldName, textFieldName); List> classes = knnClassifier.getClasses(SUPER_STRONG_TECHNOLOGY_INPUT); - assertTrue(classes.get(0).getScore() > classes.get(1).getScore()); + assertTrue(classes.get(0).score() > classes.get(1).score()); checkCorrectClassification(knnClassifier, SUPER_STRONG_TECHNOLOGY_INPUT, TECHNOLOGY_RESULT); } finally { IOUtils.close(leafReader); diff --git a/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java b/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java index 7e9fa45282d4..5e944af3454d 100644 --- 
a/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java @@ -58,12 +58,12 @@ public void init() throws IOException { protected double checkCorrectDocumentClassification( DocumentClassifier classifier, Document inputDoc, T expectedResult) throws Exception { ClassificationResult classificationResult = classifier.assignClass(inputDoc); - assertNotNull(classificationResult.getAssignedClass()); + assertNotNull(classificationResult.assignedClass()); assertEquals( - "got an assigned class of " + classificationResult.getAssignedClass(), + "got an assigned class of " + classificationResult.assignedClass(), expectedResult, - classificationResult.getAssignedClass()); - double score = classificationResult.getScore(); + classificationResult.assignedClass()); + double score = classificationResult.score(); assertTrue("score should be between 0 and 1, got:" + score, score <= 1 && score >= 0); return score; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java index b8ff37c2654a..8ffcc1c8d50e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java @@ -19,10 +19,11 @@ import java.io.IOException; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.Bits; import org.apache.lucene.util.VectorUtil; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -30,45 +31,39 @@ public class FlatBitVectorsScorer implements FlatVectorsScorer { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException { - assert vectorValues instanceof RandomAccessVectorValues.Bytes; - if (vectorValues instanceof RandomAccessVectorValues.Bytes byteVectorValues) { + assert vectorValues instanceof ByteVectorValues; + if (vectorValues instanceof ByteVectorValues byteVectorValues) { return new BitRandomVectorScorerSupplier(byteVectorValues); } - throw new IllegalArgumentException( - "vectorValues must be an instance of RandomAccessVectorValues.Bytes"); + throw new IllegalArgumentException("vectorValues must be an instance of ByteVectorValues"); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException { throw new IllegalArgumentException("bit vectors do not support float[] targets"); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException { - assert vectorValues instanceof 
RandomAccessVectorValues.Bytes; - if (vectorValues instanceof RandomAccessVectorValues.Bytes byteVectorValues) { + assert vectorValues instanceof ByteVectorValues; + if (vectorValues instanceof ByteVectorValues byteVectorValues) { return new BitRandomVectorScorer(byteVectorValues, target); } - throw new IllegalArgumentException( - "vectorValues must be an instance of RandomAccessVectorValues.Bytes"); + throw new IllegalArgumentException("vectorValues must be an instance of ByteVectorValues"); } static class BitRandomVectorScorer implements RandomVectorScorer { - private final RandomAccessVectorValues.Bytes vectorValues; + private final ByteVectorValues vectorValues; private final int bitDimensions; private final byte[] query; - BitRandomVectorScorer(RandomAccessVectorValues.Bytes vectorValues, byte[] query) { + BitRandomVectorScorer(ByteVectorValues vectorValues, byte[] query) { this.query = query; this.bitDimensions = vectorValues.dimension() * Byte.SIZE; this.vectorValues = vectorValues; @@ -97,12 +92,11 @@ public Bits getAcceptOrds(Bits acceptDocs) { } static class BitRandomVectorScorerSupplier implements RandomVectorScorerSupplier { - protected final RandomAccessVectorValues.Bytes vectorValues; - protected final RandomAccessVectorValues.Bytes vectorValues1; - protected final RandomAccessVectorValues.Bytes vectorValues2; + protected final ByteVectorValues vectorValues; + protected final ByteVectorValues vectorValues1; + protected final ByteVectorValues vectorValues2; - public BitRandomVectorScorerSupplier(RandomAccessVectorValues.Bytes vectorValues) - throws IOException { + public BitRandomVectorScorerSupplier(ByteVectorValues vectorValues) throws IOException { this.vectorValues = vectorValues; this.vectorValues1 = vectorValues.copy(); this.vectorValues2 = vectorValues.copy(); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java index 3784f4113b68..9fbde960083c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java @@ -69,28 +69,15 @@ public class BlockTermsWriter extends FieldsConsumer { private final TermsIndexWriterBase termsIndexWriter; private final int maxDoc; - private static class FieldMetaData { - public final FieldInfo fieldInfo; - public final long numTerms; - public final long termsStartPointer; - public final long sumTotalTermFreq; - public final long sumDocFreq; - public final int docCount; - - public FieldMetaData( - FieldInfo fieldInfo, - long numTerms, - long termsStartPointer, - long sumTotalTermFreq, - long sumDocFreq, - int docCount) { + private record FieldMetaData( + FieldInfo fieldInfo, + long numTerms, + long termsStartPointer, + long sumTotalTermFreq, + long sumDocFreq, + int docCount) { + private FieldMetaData { assert numTerms > 0; - this.fieldInfo = fieldInfo; - this.termsStartPointer = termsStartPointer; - this.numTerms = numTerms; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - this.docCount = docCount; } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java index 97a58a9ff977..ab0ad75ca8a2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java +++ 
b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java @@ -127,7 +127,7 @@ public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) { @Override public boolean isIndexTerm(BytesRef term, TermStats stats) { - if (stats.docFreq >= docFreqThresh || count >= interval) { + if (stats.docFreq() >= docFreqThresh || count >= interval) { count = 1; return true; } else { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java index db7041485739..29854aa500f9 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java @@ -22,14 +22,14 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; -/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */ +/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene101PostingsWriter}. */ public class BlockTreeOrdsPostingsFormat extends PostingsFormat { private final int minTermBlockSize; @@ -67,7 +67,7 @@ public String toString() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); boolean success = false; try { @@ -84,7 +84,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene912PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene101PostingsReader(state); boolean success = false; try { FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java index 9e83f261b042..286db291b0a4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java @@ -34,19 +34,14 @@ final class FSTOrdsOutputs extends Outputs { private static final BytesRef NO_BYTES = new BytesRef(); - public static final class Output { - public final BytesRef bytes; - // Inclusive: - public final long startOrd; - // Inclusive: - public final long endOrd; - - public Output(BytesRef bytes, long startOrd, long endOrd) { + /** + * @param startOrd Inclusive: + * @param endOrd Inclusive: + */ + public record Output(BytesRef bytes, long startOrd, long endOrd) { + public Output { assert startOrd >= 0 : "startOrd=" + startOrd; assert endOrd >= 0 : "endOrd=" + endOrd; - this.bytes = 
bytes; - this.startOrd = startOrd; - this.endOrd = endOrd; } @Override @@ -60,24 +55,6 @@ public String toString() { } return startOrd + " to " + x; } - - @Override - public int hashCode() { - int hash = bytes.hashCode(); - hash = (int) (hash ^ startOrd); - hash = (int) (hash ^ endOrd); - return hash; - } - - @Override - public boolean equals(Object _other) { - if (_other instanceof Output) { - Output other = (Output) _other; - return bytes.equals(other.bytes) && startOrd == other.startOrd && endOrd == other.endOrd; - } else { - return false; - } - } } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java index fb6b0de56b7d..852c06ae5de6 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java @@ -139,38 +139,19 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { final PostingsWriterBase postingsWriter; final FieldInfos fieldInfos; - private static class FieldMetaData { - public final FieldInfo fieldInfo; - public final Output rootCode; - public final long numTerms; - public final long indexStartFP; - public final long sumTotalTermFreq; - public final long sumDocFreq; - public final int docCount; - public final BytesRef minTerm; - public final BytesRef maxTerm; - - public FieldMetaData( - FieldInfo fieldInfo, - Output rootCode, - long numTerms, - long indexStartFP, - long sumTotalTermFreq, - long sumDocFreq, - int docCount, - BytesRef minTerm, - BytesRef maxTerm) { + private record FieldMetaData( + FieldInfo fieldInfo, + Output rootCode, + long numTerms, + long indexStartFP, + long sumTotalTermFreq, + long sumDocFreq, + int docCount, + BytesRef minTerm, + BytesRef maxTerm) { + private FieldMetaData { assert numTerms > 0; - this.fieldInfo = fieldInfo; assert rootCode != null : "field=" + fieldInfo.name + " numTerms=" + numTerms; - this.rootCode = rootCode; - this.indexStartFP = indexStartFP; - this.numTerms = numTerms; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - this.docCount = docCount; - this.minTerm = minTerm; - this.maxTerm = maxTerm; } } @@ -293,15 +274,7 @@ public String toString() { } } - private static final class SubIndex { - public final FST index; - public final long termOrdStart; - - public SubIndex(FST index, long termOrdStart) { - this.index = index; - this.termOrdStart = termOrdStart; - } - } + private record SubIndex(FST index, long termOrdStart) {} private static final class PendingBlock extends PendingEntry { public final BytesRef prefix; @@ -438,7 +411,7 @@ private void append( // long blockTermCount = output.endOrd - output.startOrd + 1; Output newOutput = FST_OUTPUTS.newOutput( - output.bytes, termOrdOffset + output.startOrd, output.endOrd - termOrdOffset); + output.bytes(), termOrdOffset + output.startOrd(), output.endOrd() - termOrdOffset); // System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + // " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" // + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd)); @@ -969,9 +942,11 @@ public void close() throws IOException { out.writeVInt(field.fieldInfo.number); assert field.numTerms > 0; out.writeVLong(field.numTerms); - out.writeVInt(field.rootCode.bytes.length); + 
out.writeVInt(field.rootCode.bytes().length); out.writeBytes( - field.rootCode.bytes.bytes, field.rootCode.bytes.offset, field.rootCode.bytes.length); + field.rootCode.bytes().bytes, + field.rootCode.bytes().offset, + field.rootCode.bytes().length); if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) { out.writeVLong(field.sumTotalTermFreq); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java index c3e2b0362f4d..5f83f9e12a59 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java @@ -79,7 +79,8 @@ final class OrdsFieldReader extends Terms { // } rootBlockFP = - (new ByteArrayDataInput(rootCode.bytes.bytes, rootCode.bytes.offset, rootCode.bytes.length)) + (new ByteArrayDataInput( + rootCode.bytes().bytes, rootCode.bytes().offset, rootCode.bytes().length)) .readVLong() >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java index 88e553625918..c5a1bfa58f58 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java @@ -142,8 +142,8 @@ void load(Output output) throws IOException { // + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" // + state)); - if (output != null && output.bytes != null && transitionCount != 0) { - BytesRef frameIndexData = output.bytes; + if (output != null && output.bytes() != null && transitionCount != 0) { + BytesRef frameIndexData = output.bytes(); // Floor frame if (floorData.length < frameIndexData.length) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java index 5b6771e7856c..e6c297befc0c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java @@ -149,7 +149,8 @@ private FST.Arc getArc(int ord) { // Pushes a frame we seek'd to OrdsSegmentTermsEnumFrame pushFrame(FST.Arc arc, Output frameData, int length) throws IOException { - scratchReader.reset(frameData.bytes.bytes, frameData.bytes.offset, frameData.bytes.length); + scratchReader.reset( + frameData.bytes().bytes, frameData.bytes().offset, frameData.bytes().length); final long code = scratchReader.readVLong(); final long fpSeek = code >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; // System.out.println(" fpSeek=" + fpSeek); @@ -160,11 +161,11 @@ OrdsSegmentTermsEnumFrame pushFrame(FST.Arc arc, Output frameData, int l // Must setFloorData before pushFrame in case pushFrame tries to rewind: if (f.isFloor) { - f.termOrdOrig = frameData.startOrd; - f.setFloorData(scratchReader, frameData.bytes); + f.termOrdOrig = frameData.startOrd(); + f.setFloorData(scratchReader, frameData.bytes()); } - pushFrame(arc, fpSeek, length, frameData.startOrd); + pushFrame(arc, fpSeek, length, frameData.startOrd()); return f; } @@ -891,7 +892,7 @@ private void printSeekState(PrintStream out) throws 
IOException { } else if (isSeekFrame && !f.isFloor) { final ByteArrayDataInput reader = new ByteArrayDataInput( - output.bytes.bytes, output.bytes.offset, output.bytes.length); + output.bytes().bytes, output.bytes().offset, output.bytes().length); final long codeOrig = reader.readVLong(); final long code = (f.fp << OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) @@ -1210,7 +1211,8 @@ private InputOutput getByOutput(long targetOrd) throws IOException { OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput()); // System.out.println(" isFinal: " + finalOutput.startOrd + "-" + // (Long.MAX_VALUE-finalOutput.endOrd)); - if (targetOrd >= finalOutput.startOrd && targetOrd <= Long.MAX_VALUE - finalOutput.endOrd) { + if (targetOrd >= finalOutput.startOrd() + && targetOrd <= Long.MAX_VALUE - finalOutput.endOrd()) { // Only one range should match across all arc leaving this node // assert bestOutput == null; bestOutput = finalOutput; @@ -1247,9 +1249,9 @@ private InputOutput getByOutput(long targetOrd) throws IOException { } // System.out.println(" cycle mid=" + mid + " targetOrd=" + targetOrd + " output=" + // minArcOutput.startOrd + "-" + (Long.MAX_VALUE-minArcOutput.endOrd)); - if (targetOrd > Long.MAX_VALUE - minArcOutput.endOrd) { + if (targetOrd > Long.MAX_VALUE - minArcOutput.endOrd()) { low = mid + 1; - } else if (targetOrd < minArcOutput.startOrd) { + } else if (targetOrd < minArcOutput.startOrd()) { high = mid - 1; } else { // System.out.println(" found!!"); @@ -1282,10 +1284,10 @@ private InputOutput getByOutput(long targetOrd) throws IOException { // this arc: final Output minArcOutput = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output()); - long endOrd = Long.MAX_VALUE - minArcOutput.endOrd; + long endOrd = Long.MAX_VALUE - minArcOutput.endOrd(); // System.out.println(" endOrd=" + endOrd + " targetOrd=" + targetOrd); - if (targetOrd >= minArcOutput.startOrd && targetOrd <= endOrd) { + if (targetOrd >= minArcOutput.startOrd() && targetOrd <= endOrd) { // Recurse on this arc: output = minArcOutput; result.setIntAt(upto++, arc.label()); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java index 1daa1761fd8b..2a0472fa028b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java @@ -54,8 +54,9 @@ * *
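Before the Bloom filter changes below, a short note on the pattern used repeatedly above (BlockTermsWriter.FieldMetaData, OrdsBlockTreeTermsWriter.FieldMetaData, SubIndex, FSTOrdsOutputs.Output): hand-written holder classes become records, invariant checks move into a compact constructor, and call sites switch from field access to accessor calls such as output.bytes() or stats.docFreq(). A generic sketch of that shape, with made-up names rather than the actual Lucene types:

// Hypothetical example of the record-with-compact-constructor pattern used in the diffs above.
record TermRange(long startOrd, long endOrd) {
  TermRange {
    // invariant checks that previously lived in an explicit constructor body
    assert startOrd >= 0 : "startOrd=" + startOrd;
    assert endOrd >= startOrd : "endOrd=" + endOrd;
  }

  long size() {
    return endOrd - startOrd + 1; // callers now read components via startOrd()/endOrd()
  }
}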
<p>
A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter settings on a * per-field basis. The default configuration is {@link DefaultBloomFilterFactory} which allocates a - * ~8mb bitset and hashes values using {@link MurmurHash64}. This should be suitable for most - * purposes. + * ~8mb bitset and hashes values using {@link + * org.apache.lucene.util.StringHelper#murmurhash3_x64_128(BytesRef)}. This should be suitable for + * most purposes. * *
<p>
The format of the blm file is as follows: * diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java index f1d2dee65c71..7d6fd1b64b52 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java @@ -24,6 +24,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.StringHelper; /** * A class used to represent a set of many, potentially large, values (e.g. many long strings such @@ -53,7 +54,6 @@ public enum ContainsResult { NO }; - private HashFunction hashFunction; private FixedBitSet filter; private int bloomSize; private final int hashCount; @@ -138,7 +138,6 @@ private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) { super(); this.filter = filter; this.bloomSize = bloomSize; - this.hashFunction = MurmurHash64.INSTANCE; this.hashCount = hashCount; } @@ -150,11 +149,12 @@ private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) { * @return NO or MAYBE */ public ContainsResult contains(BytesRef value) { - long hash = hashFunction.hash(value); - int msb = (int) (hash >>> Integer.SIZE); - int lsb = (int) hash; + long[] hash = StringHelper.murmurhash3_x64_128(value); + + long msb = hash[0]; + long lsb = hash[1]; for (int i = 0; i < hashCount; i++) { - int bloomPos = (lsb + i * msb); + int bloomPos = ((int) (lsb + i * msb)) & bloomSize; if (!mayContainValue(bloomPos)) { return ContainsResult.NO; } @@ -216,15 +216,14 @@ private boolean mayContainValue(int aHash) { * is modulo n'd where n is the chosen size of the internal bitset. * * @param value the key value to be hashed - * @throws IOException If there is a low-level I/O error */ - public void addValue(BytesRef value) throws IOException { - long hash = hashFunction.hash(value); - int msb = (int) (hash >>> Integer.SIZE); - int lsb = (int) hash; + public void addValue(BytesRef value) { + long[] hash = StringHelper.murmurhash3_x64_128(value); + long msb = hash[0]; + long lsb = hash[1]; for (int i = 0; i < hashCount; i++) { // Bitmasking using bloomSize is effectively a modulo operation. - int bloomPos = (lsb + i * msb) & bloomSize; + int bloomPos = ((int) (lsb + i * msb)) & bloomSize; filter.set(bloomPos); } } @@ -302,9 +301,7 @@ public long ramBytesUsed() { @Override public String toString() { return getClass().getSimpleName() - + "(hash=" - + hashFunction - + ", k=" + + "(k=" + hashCount + ", bits=" + filter.cardinality() diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java deleted file mode 100644 index 1d1897731434..000000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
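Following up on the FuzzySet change above: the codec-private MurmurHash64 helper goes away, and both contains() and addValue() derive probe positions from the two 64-bit halves returned by StringHelper.murmurhash3_x64_128, masking with bloomSize (per the in-code comment, bitmasking with bloomSize acts as a modulo, i.e. it is an all-ones mask). A rough sketch of that probe loop; the bloomSize and hashCount values here are illustrative only, not what FuzzySet actually configures:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

class BloomProbeSketch {
  static int[] probePositions(BytesRef value) {
    int bloomSize = (1 << 20) - 1; // illustrative all-ones mask; the real size comes from BloomFilterFactory
    int hashCount = 7;             // illustrative
    long[] hash = StringHelper.murmurhash3_x64_128(value);
    long msb = hash[0];
    long lsb = hash[1];
    int[] positions = new int[hashCount];
    for (int i = 0; i < hashCount; i++) {
      // same double-hashing step as FuzzySet.addValue/contains: the mask replaces a modulo
      positions[i] = ((int) (lsb + i * msb)) & bloomSize;
    }
    return positions;
  }
}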
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.bloom; - -import org.apache.lucene.util.BitUtil; -import org.apache.lucene.util.BytesRef; - -/** - * This is a very fast, non-cryptographic hash suitable for general hash-based lookup. See - * http://murmurhash.googlepages.com/ for more details. - * - *
<p>
The code from Apache Commons was adapted in the form here to work with BytesRefs with offsets - * and lengths rather than raw byte arrays. - */ -public class MurmurHash64 extends HashFunction { - private static final long M64 = 0xc6a4a7935bd1e995L; - private static final int R64 = 47; - public static final HashFunction INSTANCE = new MurmurHash64(); - - /** - * Generates a 64-bit hash from byte array of the given length and seed. - * - * @param data The input byte array - * @param seed The initial seed value - * @param length The length of the array - * @return The 64-bit hash of the given array - */ - public static long hash64(byte[] data, int seed, int offset, int length) { - long h = (seed & 0xffffffffL) ^ (length * M64); - - final int nblocks = length >> 3; - - // body - for (int i = 0; i < nblocks; i++) { - - long k = (long) BitUtil.VH_LE_LONG.get(data, offset); - k *= M64; - k ^= k >>> R64; - k *= M64; - - h ^= k; - h *= M64; - - offset += Long.BYTES; - } - - int remaining = length & 0x07; - if (0 < remaining) { - for (int i = 0; i < remaining; i++) { - h ^= ((long) data[offset + i] & 0xff) << (Byte.SIZE * i); - } - h *= M64; - } - - h ^= h >>> R64; - h *= M64; - h ^= h >>> R64; - - return h; - } - - @Override - public final long hash(BytesRef br) { - return hash64(br.bytes, 0xe17a1465, br.offset, br.length); - } - - @Override - public String toString() { - return getClass().getSimpleName(); - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java index 8e17edb9e2cc..5af23fb49455 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java @@ -24,7 +24,7 @@ import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; @@ -54,7 +54,7 @@ // - or: longer dense skip lists than just next byte? /** - * Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and + * Wraps {@link Lucene101PostingsFormat} format for on-disk storage, but then at read time loads and * stores all terms and postings directly in RAM as byte[], int[]. * *
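From the perspective of the wrapping formats touched here (DirectPostingsFormat above, FSTPostingsFormat below), the Lucene912 to Lucene101 swap is a delegate rename: on-disk storage is still handed to the current default postings implementation, resolved by its SPI name. A minimal sketch of that lookup, using the name that appears in these diffs:

import org.apache.lucene.codecs.PostingsFormat;

class PostingsDelegationSketch {
  // The wrappers resolve the default postings implementation by SPI name,
  // which moves from "Lucene912" to "Lucene101" in these diffs.
  static PostingsFormat defaultPostings() {
    return PostingsFormat.forName("Lucene101");
  }
}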
<p>
WARNING: This is exceptionally RAM intensive: it makes no effort to compress the @@ -97,12 +97,12 @@ public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return PostingsFormat.forName("Lucene912").fieldsConsumer(state); + return PostingsFormat.forName("Lucene101").fieldsConsumer(state); } @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state); + FieldsProducer postings = PostingsFormat.forName("Lucene101").fieldsProducer(state); if (state.context.context() != IOContext.Context.MERGE) { FieldsProducer loadedPostings; try { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java index b184f85b176d..4893ee8ad265 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java @@ -22,8 +22,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; @@ -41,7 +41,7 @@ public String toString() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); boolean success = false; try { @@ -57,7 +57,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene912PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene101PostingsReader(state); boolean success = false; try { FieldsProducer ret = new FSTTermsReader(state, postingsReader); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java index 6b989497cb39..3c574375443e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java @@ -71,8 +71,8 @@ public SimpleTextBKDReader( this.pointCount = pointCount; this.docCount = docCount; this.version = SimpleTextBKDWriter.VERSION_CURRENT; - assert minPackedValue.length == config.packedIndexBytesLength; - assert maxPackedValue.length == config.packedIndexBytesLength; + assert minPackedValue.length == config.packedIndexBytesLength(); + assert maxPackedValue.length == config.packedIndexBytesLength(); } @Override @@ -99,8 +99,8 @@ private class SimpleTextPointTree implements PointTree { private SimpleTextPointTree( IndexInput in, int nodeID, int level, byte[] minPackedValue, byte[] maxPackedValue) { this.in = in; - this.scratchDocIDs = new 
int[config.maxPointsInLeafNode]; - this.scratchPackedValue = new byte[config.packedBytesLength]; + this.scratchDocIDs = new int[config.maxPointsInLeafNode()]; + this.scratchPackedValue = new byte[config.packedBytesLength()]; this.nodeID = nodeID; this.rootNode = nodeID; this.level = level; @@ -145,38 +145,39 @@ public boolean moveToChild() { private void pushLeft() { int address = nodeID * bytesPerIndexEntry; // final int splitDimPos; - if (config.numIndexDims == 1) { + if (config.numIndexDims() == 1) { splitDims[level] = 0; } else { splitDims[level] = (splitPackedValues[address++] & 0xff); } - final int splitDimPos = splitDims[level] * config.bytesPerDim; + final int splitDimPos = splitDims[level] * config.bytesPerDim(); if (splitDimValueStack[level] == null) { - splitDimValueStack[level] = new byte[config.bytesPerDim]; + splitDimValueStack[level] = new byte[config.bytesPerDim()]; } // save the dimension we are going to change System.arraycopy( - maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim); + maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim()); assert Arrays.compareUnsigned( maxPackedValue, splitDimPos, - splitDimPos + config.bytesPerDim, + splitDimPos + config.bytesPerDim(), splitPackedValues, address, - address + config.bytesPerDim) + address + config.bytesPerDim()) >= 0 - : "config.bytesPerDim=" - + config.bytesPerDim + : "config.bytesPerDim()=" + + config.bytesPerDim() + " splitDim=" + splitDims[level] - + " config.numIndexDims=" - + config.numIndexDims + + " config.numIndexDims()=" + + config.numIndexDims() + " config.numDims=" - + config.numDims; + + config.numDims(); nodeID *= 2; level++; // add the split dim value: - System.arraycopy(splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim); + System.arraycopy( + splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim()); } @Override @@ -191,37 +192,38 @@ public boolean moveToSibling() { private void pushRight() { int address = nodeID * bytesPerIndexEntry; - if (config.numIndexDims == 1) { + if (config.numIndexDims() == 1) { splitDims[level] = 0; } else { splitDims[level] = (splitPackedValues[address++] & 0xff); } - final int splitDimPos = splitDims[level] * config.bytesPerDim; + final int splitDimPos = splitDims[level] * config.bytesPerDim(); // we should have already visit the left node assert splitDimValueStack[level] != null; // save the dimension we are going to change System.arraycopy( - minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim); + minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim()); assert Arrays.compareUnsigned( minPackedValue, splitDimPos, - splitDimPos + config.bytesPerDim, + splitDimPos + config.bytesPerDim(), splitPackedValues, address, - address + config.bytesPerDim) + address + config.bytesPerDim()) <= 0 - : "config.bytesPerDim=" - + config.bytesPerDim + : "config.bytesPerDim()=" + + config.bytesPerDim() + " splitDim=" + splitDims[level] - + " config.numIndexDims=" - + config.numIndexDims + + " config.numIndexDims()=" + + config.numIndexDims() + " config.numDims=" - + config.numDims; + + config.numDims(); nodeID = 2 * nodeID + 1; level++; // add the split dim value: - System.arraycopy(splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim); + System.arraycopy( + splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim()); } @Override @@ -242,16 +244,16 @@ private void pop(boolean isLeft) { 
splitDimValueStack[level], 0, maxPackedValue, - splitDims[level] * config.bytesPerDim, - config.bytesPerDim); + splitDims[level] * config.bytesPerDim(), + config.bytesPerDim()); } else { System.arraycopy( splitDimValueStack[level], 0, minPackedValue, - splitDims[level] * config.bytesPerDim, - config.bytesPerDim); + splitDims[level] * config.bytesPerDim(), + config.bytesPerDim()); } } @@ -290,7 +292,7 @@ public long size() { private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) { // number of points that need to be distributed between leaves, one per leaf final int extraPoints = - Math.toIntExact(((long) config.maxPointsInLeafNode * leafNodeOffset) - pointCount); + Math.toIntExact(((long) config.maxPointsInLeafNode() * leafNodeOffset) - pointCount); assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset"; // offset where we stop adding one point to the leaves final int nodeOffset = leafNodeOffset - extraPoints; @@ -298,9 +300,9 @@ private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) { for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) { // offsetPosition provides which extra point will be added to this node if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) { - count += config.maxPointsInLeafNode; + count += config.maxPointsInLeafNode(); } else { - count += config.maxPointsInLeafNode - 1; + count += config.maxPointsInLeafNode() - 1; } } return count; @@ -376,14 +378,14 @@ public void visitDocValues(PointValues.IntersectVisitor visitor) throws IOExcept // Again, this time reading values and checking with the visitor visitor.grow(count); // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths - assert scratchPackedValue.length == config.packedBytesLength; + assert scratchPackedValue.length == config.packedBytesLength(); BytesRefBuilder scratch = new BytesRefBuilder(); for (int i = 0; i < count; i++) { readLine(in, scratch); assert startsWith(scratch, BLOCK_VALUE); BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE)); - assert br.length == config.packedBytesLength; - System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength); + assert br.length == config.packedBytesLength(); + System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength()); visitor.visit(scratchDocIDs[i], scratchPackedValue); } } else { @@ -443,17 +445,17 @@ public byte[] getMaxPackedValue() { @Override public int getNumDimensions() throws IOException { - return config.numDims; + return config.numDims(); } @Override public int getNumIndexDimensions() throws IOException { - return config.numIndexDims; + return config.numIndexDims(); } @Override public int getBytesPerDimension() throws IOException { - return config.bytesPerDim; + return config.bytesPerDim(); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java index a514f62d04e9..042d2b77116f 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java @@ -144,28 +144,28 @@ public SimpleTextBKDWriter( this.maxDoc = maxDoc; docsSeen = new FixedBitSet(maxDoc); - scratchDiff = new byte[config.bytesPerDim]; - scratch1 = new byte[config.packedBytesLength]; - scratch2 = new 
byte[config.packedBytesLength]; - commonPrefixLengths = new int[config.numDims]; + scratchDiff = new byte[config.bytesPerDim()]; + scratch1 = new byte[config.packedBytesLength()]; + scratch2 = new byte[config.packedBytesLength()]; + commonPrefixLengths = new int[config.numDims()]; - minPackedValue = new byte[config.packedIndexBytesLength]; - maxPackedValue = new byte[config.packedIndexBytesLength]; + minPackedValue = new byte[config.packedIndexBytesLength()]; + maxPackedValue = new byte[config.packedIndexBytesLength()]; // Maximum number of points we hold in memory at any time maxPointsSortInHeap = - (int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc * config.numDims)); + (int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc() * config.numDims())); // Finally, we must be able to hold at least the leaf node in heap during build: - if (maxPointsSortInHeap < config.maxPointsInLeafNode) { + if (maxPointsSortInHeap < config.maxPointsInLeafNode()) { throw new IllegalArgumentException( "maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap - + ", but this is less than config.maxPointsInLeafNode=" - + config.maxPointsInLeafNode - + "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode"); + + ", but this is less than config.maxPointsInLeafNode()=" + + config.maxPointsInLeafNode() + + "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode()"); } this.maxMBSortInHeap = maxMBSortInHeap; @@ -183,10 +183,10 @@ public static void verifyParams(double maxMBSortInHeap, long totalPointCount) { } public void add(byte[] packedValue, int docID) throws IOException { - if (packedValue.length != config.packedBytesLength) { + if (packedValue.length != config.packedBytesLength()) { throw new IllegalArgumentException( "packedValue should be length=" - + config.packedBytesLength + + config.packedBytesLength() + " (got: " + packedValue.length + ")"); @@ -209,30 +209,30 @@ public void add(byte[] packedValue, int docID) throws IOException { } else { pointWriter = new HeapPointWriter(config, Math.toIntExact(totalPointCount)); } - System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength); - System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength); + System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength()); + System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength()); } else { - for (int dim = 0; dim < config.numIndexDims; dim++) { - int offset = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numIndexDims(); dim++) { + int offset = dim * config.bytesPerDim(); if (Arrays.compareUnsigned( packedValue, offset, - offset + config.bytesPerDim, + offset + config.bytesPerDim(), minPackedValue, offset, - offset + config.bytesPerDim) + offset + config.bytesPerDim()) < 0) { - System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim); + System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim()); } if (Arrays.compareUnsigned( packedValue, offset, - offset + config.bytesPerDim, + offset + config.bytesPerDim(), maxPackedValue, offset, - offset + config.bytesPerDim) + offset + config.bytesPerDim()) > 0) { - System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim); + System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim()); } } } @@ -254,7 +254,7 @@ public long getPointCount() { */ public long writeField(IndexOutput out, 
String fieldName, MutablePointTree reader) throws IOException { - if (config.numIndexDims == 1) { + if (config.numIndexDims() == 1) { return writeField1Dim(out, fieldName, reader); } else { return writeFieldNDims(out, fieldName, reader); @@ -280,7 +280,7 @@ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointTree long countPerLeaf = pointCount = values.size(); long innerNodeCount = 1; - while (countPerLeaf > config.maxPointsInLeafNode) { + while (countPerLeaf > config.maxPointsInLeafNode()) { countPerLeaf = (countPerLeaf + 1) / 2; innerNodeCount *= 2; } @@ -289,7 +289,7 @@ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointTree checkMaxLeafNodeCount(numLeaves); - final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim + 1)]; + final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim() + 1)]; final long[] leafBlockFPs = new long[numLeaves]; // compute the min/max for this slice @@ -297,37 +297,37 @@ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointTree Arrays.fill(maxPackedValue, (byte) 0); for (int i = 0; i < Math.toIntExact(pointCount); ++i) { values.getValue(i, scratchBytesRef1); - for (int dim = 0; dim < config.numIndexDims; dim++) { - int offset = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numIndexDims(); dim++) { + int offset = dim * config.bytesPerDim(); if (Arrays.compareUnsigned( scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, - scratchBytesRef1.offset + offset + config.bytesPerDim, + scratchBytesRef1.offset + offset + config.bytesPerDim(), minPackedValue, offset, - offset + config.bytesPerDim) + offset + config.bytesPerDim()) < 0) { System.arraycopy( scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset, - config.bytesPerDim); + config.bytesPerDim()); } if (Arrays.compareUnsigned( scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, - scratchBytesRef1.offset + offset + config.bytesPerDim, + scratchBytesRef1.offset + offset + config.bytesPerDim(), maxPackedValue, offset, - offset + config.bytesPerDim) + offset + config.bytesPerDim()) > 0) { System.arraycopy( scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, - config.bytesPerDim); + config.bytesPerDim()); } } @@ -345,7 +345,7 @@ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointTree maxPackedValue, splitPackedValues, leafBlockFPs, - new int[config.maxPointsInLeafNode]); + new int[config.maxPointsInLeafNode()]); long indexFP = out.getFilePointer(); writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf)); @@ -387,15 +387,15 @@ private class OneDimensionBKDWriter { final IndexOutput out; final List leafBlockFPs = new ArrayList<>(); final List leafBlockStartValues = new ArrayList<>(); - final byte[] leafValues = new byte[config.maxPointsInLeafNode * config.packedBytesLength]; - final int[] leafDocs = new int[config.maxPointsInLeafNode]; + final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()]; + final int[] leafDocs = new int[config.maxPointsInLeafNode()]; long valueCount; int leafCount; OneDimensionBKDWriter(IndexOutput out) { - if (config.numIndexDims != 1) { + if (config.numIndexDims() != 1) { throw new UnsupportedOperationException( - "config.numIndexDims must be 1 but got " + config.numIndexDims); + "config.numIndexDims() must be 1 but got " + config.numIndexDims()); } if (pointCount != 0) { throw new IllegalStateException("cannot mix add and 
merge"); @@ -411,7 +411,7 @@ private class OneDimensionBKDWriter { this.out = out; - lastPackedValue = new byte[config.packedBytesLength]; + lastPackedValue = new byte[config.packedBytesLength()]; } // for asserts @@ -426,8 +426,8 @@ assert valueInOrder( packedValue, 0, leafValues, - leafCount * config.packedBytesLength, - config.packedBytesLength); + leafCount * config.packedBytesLength(), + config.packedBytesLength()); leafDocs[leafCount] = docID; docsSeen.set(docID); leafCount++; @@ -441,7 +441,7 @@ assert valueInOrder( + " values"); } - if (leafCount == config.maxPointsInLeafNode) { + if (leafCount == config.maxPointsInLeafNode()) { // We write a block once we hit exactly the max count ... this is different from // when we flush a new segment, where we write between max/2 and max per leaf block, // so merged segments will behave differently from newly flushed segments: @@ -471,43 +471,44 @@ public long finish() throws IOException { // System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts=" // + leafBlockStartValues.size()); - byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim)]; + byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim())]; rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues); long[] arr = new long[leafBlockFPs.size()]; for (int i = 0; i < leafBlockFPs.size(); i++) { arr[i] = leafBlockFPs.get(i); } - writeIndex(out, arr, index, config.maxPointsInLeafNode); + writeIndex(out, arr, index, config.maxPointsInLeafNode()); return indexFP; } private void writeLeafBlock() throws IOException { assert leafCount != 0; if (valueCount == 0) { - System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength); + System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength()); } System.arraycopy( leafValues, - (leafCount - 1) * config.packedBytesLength, + (leafCount - 1) * config.packedBytesLength(), maxPackedValue, 0, - config.packedIndexBytesLength); + config.packedIndexBytesLength()); valueCount += leafCount; if (leafBlockFPs.size() > 0) { // Save the first (minimum) value in each leaf block except the first, to build the split // value index in the end: - leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength)); + leafBlockStartValues.add( + ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength())); } leafBlockFPs.add(out.getFilePointer()); checkMaxLeafNodeCount(leafBlockFPs.size()); - Arrays.fill(commonPrefixLengths, config.bytesPerDim); + Arrays.fill(commonPrefixLengths, config.bytesPerDim()); // Find per-dim common prefix: - for (int dim = 0; dim < config.numDims; dim++) { - int offset1 = dim * config.bytesPerDim; - int offset2 = (leafCount - 1) * config.packedBytesLength + offset1; + for (int dim = 0; dim < config.numDims(); dim++) { + int offset1 = dim * config.bytesPerDim(); + int offset2 = (leafCount - 1) * config.packedBytesLength() + offset1; for (int j = 0; j < commonPrefixLengths[dim]; j++) { if (leafValues[offset1 + j] != leafValues[offset2 + j]) { commonPrefixLengths[dim] = j; @@ -523,24 +524,24 @@ private void writeLeafBlock() throws IOException { final BytesRef scratch = new BytesRef(); { - scratch.length = config.packedBytesLength; + scratch.length = config.packedBytesLength(); scratch.bytes = leafValues; } @Override public BytesRef apply(int i) { - scratch.offset = config.packedBytesLength * i; + scratch.offset = config.packedBytesLength() * i; return scratch; } }; assert valuesInOrderAndBounds( leafCount, 
0, - ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength), + ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()), ArrayUtil.copyOfSubArray( leafValues, - (leafCount - 1) * config.packedBytesLength, - leafCount * config.packedBytesLength), + (leafCount - 1) * config.packedBytesLength(), + leafCount * config.packedBytesLength()), packedValues, leafDocs, 0); @@ -552,7 +553,7 @@ assert valuesInOrderAndBounds( private void rotateToTree( int nodeID, int offset, int count, byte[] index, List leafBlockStartValues) { // System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + " - // bpd=" + config.bytesPerDim + " index.length=" + index.length); + // bpd=" + config.bytesPerDim() + " index.length=" + index.length); if (count == 1) { // Leaf index node // System.out.println(" leaf index node"); @@ -561,8 +562,8 @@ private void rotateToTree( leafBlockStartValues.get(offset), 0, index, - nodeID * (1 + config.bytesPerDim) + 1, - config.bytesPerDim); + nodeID * (1 + config.bytesPerDim()) + 1, + config.bytesPerDim()); } else if (count > 1) { // Internal index node: binary partition of count int countAtLevel = 1; @@ -587,8 +588,8 @@ private void rotateToTree( leafBlockStartValues.get(rootOffset), 0, index, - nodeID * (1 + config.bytesPerDim) + 1, - config.bytesPerDim); + nodeID * (1 + config.bytesPerDim()) + 1, + config.bytesPerDim()); // System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]"); // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree @@ -611,10 +612,10 @@ private void rotateToTree( } private void checkMaxLeafNodeCount(int numLeaves) { - if ((1 + config.bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) { + if ((1 + config.bytesPerDim()) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) { throw new IllegalStateException( - "too many nodes; increase config.maxPointsInLeafNode (currently " - + config.maxPointsInLeafNode + "too many nodes; increase config.maxPointsInLeafNode() (currently " + + config.maxPointsInLeafNode() + ") and reindex"); } } @@ -652,7 +653,7 @@ public long finish(IndexOutput out) throws IOException { long countPerLeaf = pointCount; long innerNodeCount = 1; - while (countPerLeaf > config.maxPointsInLeafNode) { + while (countPerLeaf > config.maxPointsInLeafNode()) { countPerLeaf = (countPerLeaf + 1) / 2; innerNodeCount *= 2; } @@ -667,20 +668,20 @@ public long finish(IndexOutput out) throws IOException { // Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each // recursion says which dim we split on. - byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim)]; + byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim())]; // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. // 7) long[] leafBlockFPs = new long[numLeaves]; // Make sure the math above "worked": - assert pointCount / numLeaves <= config.maxPointsInLeafNode + assert pointCount / numLeaves <= config.maxPointsInLeafNode() : "pointCount=" + pointCount + " numLeaves=" + numLeaves - + " config.maxPointsInLeafNode=" - + config.maxPointsInLeafNode; + + " config.maxPointsInLeafNode()=" + + config.maxPointsInLeafNode(); // We re-use the selector so we do not need to create an object every time. 
BKDRadixSelector radixSelector = @@ -699,7 +700,7 @@ public long finish(IndexOutput out) throws IOException { maxPackedValue, splitPackedValues, leafBlockFPs, - new int[config.maxPointsInLeafNode]); + new int[config.maxPointsInLeafNode()]); // If no exception, we should have cleaned everything up: assert tempDir.getCreatedFiles().isEmpty(); @@ -724,15 +725,15 @@ private void writeIndex( IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode) throws IOException { write(out, NUM_DATA_DIMS); - writeInt(out, config.numDims); + writeInt(out, config.numDims()); newline(out); write(out, NUM_INDEX_DIMS); - writeInt(out, config.numIndexDims); + writeInt(out, config.numIndexDims()); newline(out); write(out, BYTES_PER_DIM); - writeInt(out, config.bytesPerDim); + writeInt(out, config.bytesPerDim()); newline(out); write(out, MAX_LEAF_POINTS); @@ -767,8 +768,8 @@ private void writeIndex( newline(out); } - assert (splitPackedValues.length % (1 + config.bytesPerDim)) == 0; - int count = splitPackedValues.length / (1 + config.bytesPerDim); + assert (splitPackedValues.length % (1 + config.bytesPerDim())) == 0; + int count = splitPackedValues.length / (1 + config.bytesPerDim()); assert count == leafBlockFPs.length; write(out, SPLIT_COUNT); @@ -777,10 +778,12 @@ private void writeIndex( for (int i = 0; i < count; i++) { write(out, SPLIT_DIM); - writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim)] & 0xff); + writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim())] & 0xff); newline(out); write(out, SPLIT_VALUE); - br = new BytesRef(splitPackedValues, 1 + (i * (1 + config.bytesPerDim)), config.bytesPerDim); + br = + new BytesRef( + splitPackedValues, 1 + (i * (1 + config.bytesPerDim())), config.bytesPerDim()); write(out, br.toString()); newline(out); } @@ -852,25 +855,25 @@ private Error verifyChecksum(Throwable priorException, PointWriter writer) throw /** Called only in assert */ private boolean valueInBounds( BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) { - for (int dim = 0; dim < config.numIndexDims; dim++) { - int offset = config.bytesPerDim * dim; + for (int dim = 0; dim < config.numIndexDims(); dim++) { + int offset = config.bytesPerDim() * dim; if (Arrays.compareUnsigned( packedValue.bytes, packedValue.offset + offset, - packedValue.offset + offset + config.bytesPerDim, + packedValue.offset + offset + config.bytesPerDim(), minPackedValue, offset, - offset + config.bytesPerDim) + offset + config.bytesPerDim()) < 0) { return false; } if (Arrays.compareUnsigned( packedValue.bytes, packedValue.offset + offset, - packedValue.offset + offset + config.bytesPerDim, + packedValue.offset + offset + config.bytesPerDim(), maxPackedValue, offset, - offset + config.bytesPerDim) + offset + config.bytesPerDim()) > 0) { return false; } @@ -882,13 +885,13 @@ private boolean valueInBounds( protected int split(byte[] minPackedValue, byte[] maxPackedValue) { // Find which dim has the largest span so we can split on it: int splitDim = -1; - for (int dim = 0; dim < config.numIndexDims; dim++) { - NumericUtils.subtract(config.bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff); + for (int dim = 0; dim < config.numIndexDims(); dim++) { + NumericUtils.subtract(config.bytesPerDim(), dim, maxPackedValue, minPackedValue, scratchDiff); if (splitDim == -1 || Arrays.compareUnsigned( - scratchDiff, 0, config.bytesPerDim, scratch1, 0, config.bytesPerDim) + scratchDiff, 0, config.bytesPerDim(), scratch1, 0, config.bytesPerDim()) > 0) { - 
System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim); + System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim()); splitDim = dim; } } @@ -931,15 +934,15 @@ private void build( if (nodeID >= leafNodeOffset) { // leaf node final int count = to - from; - assert count <= config.maxPointsInLeafNode; + assert count <= config.maxPointsInLeafNode(); // Compute common prefixes - Arrays.fill(commonPrefixLengths, config.bytesPerDim); + Arrays.fill(commonPrefixLengths, config.bytesPerDim()); reader.getValue(from, scratchBytesRef1); for (int i = from + 1; i < to; ++i) { reader.getValue(i, scratchBytesRef2); - for (int dim = 0; dim < config.numDims; dim++) { - final int offset = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numDims(); dim++) { + final int offset = dim * config.bytesPerDim(); for (int j = 0; j < commonPrefixLengths[dim]; j++) { if (scratchBytesRef1.bytes[scratchBytesRef1.offset + offset + j] != scratchBytesRef2.bytes[scratchBytesRef2.offset + offset + j]) { @@ -951,23 +954,23 @@ private void build( } // Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim] - FixedBitSet[] usedBytes = new FixedBitSet[config.numDims]; - for (int dim = 0; dim < config.numDims; ++dim) { - if (commonPrefixLengths[dim] < config.bytesPerDim) { + FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()]; + for (int dim = 0; dim < config.numDims(); ++dim) { + if (commonPrefixLengths[dim] < config.bytesPerDim()) { usedBytes[dim] = new FixedBitSet(256); } } for (int i = from + 1; i < to; ++i) { - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { if (usedBytes[dim] != null) { - byte b = reader.getByteAt(i, dim * config.bytesPerDim + commonPrefixLengths[dim]); + byte b = reader.getByteAt(i, dim * config.bytesPerDim() + commonPrefixLengths[dim]); usedBytes[dim].set(Byte.toUnsignedInt(b)); } } } int sortedDim = 0; int sortedDimCardinality = Integer.MAX_VALUE; - for (int dim = 0; dim < config.numDims; ++dim) { + for (int dim = 0; dim < config.numDims(); ++dim) { if (usedBytes[dim] != null) { final int cardinality = usedBytes[dim].cardinality(); if (cardinality < sortedDimCardinality) { @@ -1001,7 +1004,7 @@ private void build( // Write the common prefixes: reader.getValue(from, scratchBytesRef1); System.arraycopy( - scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength); + scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength()); // Write the full values: IntFunction packedValues = @@ -1023,10 +1026,10 @@ assert valuesInOrderAndBounds( final int splitDim = split(minPackedValue, maxPackedValue); final int mid = (from + to + 1) >>> 1; - int commonPrefixLen = config.bytesPerDim; - for (int i = 0; i < config.bytesPerDim; ++i) { - if (minPackedValue[splitDim * config.bytesPerDim + i] - != maxPackedValue[splitDim * config.bytesPerDim + i]) { + int commonPrefixLen = config.bytesPerDim(); + for (int i = 0; i < config.bytesPerDim(); ++i) { + if (minPackedValue[splitDim * config.bytesPerDim() + i] + != maxPackedValue[splitDim * config.bytesPerDim() + i]) { commonPrefixLen = i; break; } @@ -1044,32 +1047,32 @@ assert valuesInOrderAndBounds( scratchBytesRef2); // set the split value - final int address = nodeID * (1 + config.bytesPerDim); + final int address = nodeID * (1 + config.bytesPerDim()); splitPackedValues[address] = (byte) splitDim; reader.getValue(mid, scratchBytesRef1); System.arraycopy( scratchBytesRef1.bytes, - 
scratchBytesRef1.offset + splitDim * config.bytesPerDim, + scratchBytesRef1.offset + splitDim * config.bytesPerDim(), splitPackedValues, address + 1, - config.bytesPerDim); + config.bytesPerDim()); byte[] minSplitPackedValue = - ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength); + ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength()); byte[] maxSplitPackedValue = - ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength); + ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength()); System.arraycopy( scratchBytesRef1.bytes, - scratchBytesRef1.offset + splitDim * config.bytesPerDim, + scratchBytesRef1.offset + splitDim * config.bytesPerDim(), minSplitPackedValue, - splitDim * config.bytesPerDim, - config.bytesPerDim); + splitDim * config.bytesPerDim(), + config.bytesPerDim()); System.arraycopy( scratchBytesRef1.bytes, - scratchBytesRef1.offset + splitDim * config.bytesPerDim, + scratchBytesRef1.offset + splitDim * config.bytesPerDim(), maxSplitPackedValue, - splitDim * config.bytesPerDim, - config.bytesPerDim); + splitDim * config.bytesPerDim(), + config.bytesPerDim()); // recurse build( @@ -1121,33 +1124,33 @@ private void build( // least number of unique bytes at commonPrefixLengths[dim], which makes compression more // efficient HeapPointWriter heapSource; - if (points.writer instanceof HeapPointWriter == false) { + if (points.writer() instanceof HeapPointWriter == false) { // Adversarial cases can cause this, e.g. merging big segments with most of the points // deleted - heapSource = switchToHeap(points.writer); + heapSource = switchToHeap(points.writer()); } else { - heapSource = (HeapPointWriter) points.writer; + heapSource = (HeapPointWriter) points.writer(); } - int from = Math.toIntExact(points.start); - int to = Math.toIntExact(points.start + points.count); + int from = Math.toIntExact(points.start()); + int to = Math.toIntExact(points.start() + points.count()); // we store common prefix on scratch1 computeCommonPrefixLength(heapSource, scratch1); int sortedDim = 0; int sortedDimCardinality = Integer.MAX_VALUE; - FixedBitSet[] usedBytes = new FixedBitSet[config.numDims]; - for (int dim = 0; dim < config.numDims; ++dim) { - if (commonPrefixLengths[dim] < config.bytesPerDim) { + FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()]; + for (int dim = 0; dim < config.numDims(); ++dim) { + if (commonPrefixLengths[dim] < config.bytesPerDim()) { usedBytes[dim] = new FixedBitSet(256); } } // Find the dimension to compress - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { int prefix = commonPrefixLengths[dim]; - if (prefix < config.bytesPerDim) { - int offset = dim * config.bytesPerDim; + if (prefix < config.bytesPerDim()) { + int offset = dim * config.bytesPerDim(); for (int i = 0; i < heapSource.count(); ++i) { PointValue value = heapSource.getPackedValueSlice(i); BytesRef packedValue = value.packedValue(); @@ -1190,7 +1193,7 @@ private void build( final BytesRef scratch = new BytesRef(); { - scratch.length = config.packedBytesLength; + scratch.length = config.packedBytesLength(); } @Override @@ -1207,7 +1210,7 @@ assert valuesInOrderAndBounds( // Inner node: partition/recurse int splitDim; - if (config.numIndexDims > 1) { + if (config.numIndexDims() > 1) { splitDim = split(minPackedValue, maxPackedValue); } else { splitDim = 0; @@ -1217,19 +1220,19 @@ assert valuesInOrderAndBounds( : "nodeID=" + nodeID + " splitValues.length=" + 
splitPackedValues.length; // How many points will be in the left tree: - long rightCount = points.count / 2; - long leftCount = points.count - rightCount; + long rightCount = points.count() / 2; + long leftCount = points.count() - rightCount; int commonPrefixLen = Arrays.mismatch( minPackedValue, - splitDim * config.bytesPerDim, - splitDim * config.bytesPerDim + config.bytesPerDim, + splitDim * config.bytesPerDim(), + splitDim * config.bytesPerDim() + config.bytesPerDim(), maxPackedValue, - splitDim * config.bytesPerDim, - splitDim * config.bytesPerDim + config.bytesPerDim); + splitDim * config.bytesPerDim(), + splitDim * config.bytesPerDim() + config.bytesPerDim()); if (commonPrefixLen == -1) { - commonPrefixLen = config.bytesPerDim; + commonPrefixLen = config.bytesPerDim(); } BKDRadixSelector.PathSlice[] pathSlices = new BKDRadixSelector.PathSlice[2]; @@ -1238,26 +1241,34 @@ assert valuesInOrderAndBounds( radixSelector.select( points, pathSlices, - points.start, - points.start + points.count, - points.start + leftCount, + points.start(), + points.start() + points.count(), + points.start() + leftCount, splitDim, commonPrefixLen); - int address = nodeID * (1 + config.bytesPerDim); + int address = nodeID * (1 + config.bytesPerDim()); splitPackedValues[address] = (byte) splitDim; - System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim); + System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim()); - byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength]; - System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength); + byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()]; + System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength()); - byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength]; - System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength); + byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()]; + System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength()); System.arraycopy( - splitValue, 0, minSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim); + splitValue, + 0, + minSplitPackedValue, + splitDim * config.bytesPerDim(), + config.bytesPerDim()); System.arraycopy( - splitValue, 0, maxSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim); + splitValue, + 0, + maxSplitPackedValue, + splitDim * config.bytesPerDim(), + config.bytesPerDim()); // Recurse on left tree: build( @@ -1289,30 +1300,30 @@ assert valuesInOrderAndBounds( } private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix) { - Arrays.fill(commonPrefixLengths, config.bytesPerDim); + Arrays.fill(commonPrefixLengths, config.bytesPerDim()); PointValue value = heapPointWriter.getPackedValueSlice(0); BytesRef packedValue = value.packedValue(); - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { System.arraycopy( packedValue.bytes, - packedValue.offset + dim * config.bytesPerDim, + packedValue.offset + dim * config.bytesPerDim(), commonPrefix, - dim * config.bytesPerDim, - config.bytesPerDim); + dim * config.bytesPerDim(), + config.bytesPerDim()); } for (int i = 1; i < heapPointWriter.count(); i++) { value = heapPointWriter.getPackedValueSlice(i); packedValue = value.packedValue(); - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < 
config.numDims(); dim++) { if (commonPrefixLengths[dim] != 0) { int j = Arrays.mismatch( commonPrefix, - dim * config.bytesPerDim, - dim * config.bytesPerDim + commonPrefixLengths[dim], + dim * config.bytesPerDim(), + dim * config.bytesPerDim() + commonPrefixLengths[dim], packedValue.bytes, - packedValue.offset + dim * config.bytesPerDim, - packedValue.offset + dim * config.bytesPerDim + commonPrefixLengths[dim]); + packedValue.offset + dim * config.bytesPerDim(), + packedValue.offset + dim * config.bytesPerDim() + commonPrefixLengths[dim]); if (j != -1) { commonPrefixLengths[dim] = j; } @@ -1331,11 +1342,11 @@ private boolean valuesInOrderAndBounds( int[] docs, int docsOffset) throws IOException { - byte[] lastPackedValue = new byte[config.packedBytesLength]; + byte[] lastPackedValue = new byte[config.packedBytesLength()]; int lastDoc = -1; for (int i = 0; i < count; i++) { BytesRef packedValue = values.apply(i); - assert packedValue.length == config.packedBytesLength; + assert packedValue.length == config.packedBytesLength(); assert valueInOrder( i, sortedDim, @@ -1361,43 +1372,43 @@ private boolean valueInOrder( int packedValueOffset, int doc, int lastDoc) { - int dimOffset = sortedDim * config.bytesPerDim; + int dimOffset = sortedDim * config.bytesPerDim(); if (ord > 0) { int cmp = Arrays.compareUnsigned( lastPackedValue, dimOffset, - dimOffset + config.bytesPerDim, + dimOffset + config.bytesPerDim(), packedValue, packedValueOffset + dimOffset, - packedValueOffset + dimOffset + config.bytesPerDim); + packedValueOffset + dimOffset + config.bytesPerDim()); if (cmp > 0) { throw new AssertionError( "values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" - + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength) + + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength()) + " ord=" + ord + " sortedDim=" + sortedDim); } - if (cmp == 0 && config.numDims > config.numIndexDims) { - int dataOffset = config.numIndexDims * config.bytesPerDim; + if (cmp == 0 && config.numDims() > config.numIndexDims()) { + int dataOffset = config.numIndexDims() * config.bytesPerDim(); cmp = Arrays.compareUnsigned( lastPackedValue, dataOffset, - config.packedBytesLength, + config.packedBytesLength(), packedValue, packedValueOffset + dataOffset, - packedValueOffset + config.packedBytesLength); + packedValueOffset + config.packedBytesLength()); if (cmp > 0) { throw new AssertionError( "data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" - + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength) + + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength()) + " ord=" + ord); } @@ -1414,7 +1425,8 @@ private boolean valueInOrder( + sortedDim); } } - System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength); + System.arraycopy( + packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength()); return true; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCompoundFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCompoundFormat.java index bfb5888a56b0..8cb48e369199 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCompoundFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCompoundFormat.java @@ -35,6 +35,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import 
org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.StringHelper; @@ -52,10 +53,10 @@ public class SimpleTextCompoundFormat extends CompoundFormat { public SimpleTextCompoundFormat() {} @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION); - final IndexInput in = dir.openInput(dataFile, context); + final IndexInput in = + dir.openInput(dataFile, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL)); BytesRefBuilder scratch = new BytesRefBuilder(); @@ -135,7 +136,11 @@ public long fileLength(String name) throws IOException { public IndexInput openInput(String name, IOContext context) throws IOException { ensureOpen(); int index = getIndex(name); - return in.slice(name, startOffsets[index], endOffsets[index] - startOffsets[index]); + return in.slice( + name, + startOffsets[index], + endOffsets[index] - startOffsets[index], + context.readAdvice()); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java index 655938ac67af..fefbb44bd80b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.Map; import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -125,8 +126,8 @@ public FieldInfos read( SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), DOCVALUES_SKIP_INDEX); - boolean docValueSkipper = - Boolean.parseBoolean(readString(DOCVALUES_SKIP_INDEX.length, scratch)); + DocValuesSkipIndexType docValueSkipper = + docValuesSkipIndexType(readString(DOCVALUES_SKIP_INDEX.length, scratch)); SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), DOCVALUES_GEN); @@ -221,6 +222,10 @@ public DocValuesType docValuesType(String dvType) { return DocValuesType.valueOf(dvType); } + public DocValuesSkipIndexType docValuesSkipIndexType(String dvSkipIndexType) { + return DocValuesSkipIndexType.valueOf(dvSkipIndexType); + } + public VectorEncoding vectorEncoding(String vectorEncoding) { return VectorEncoding.valueOf(vectorEncoding); } @@ -268,7 +273,7 @@ public void write( SimpleTextUtil.writeNewline(out); SimpleTextUtil.write(out, STORETV); - SimpleTextUtil.write(out, Boolean.toString(fi.hasVectors()), scratch); + SimpleTextUtil.write(out, Boolean.toString(fi.hasTermVectors()), scratch); SimpleTextUtil.writeNewline(out); SimpleTextUtil.write(out, PAYLOADS); @@ -284,7 +289,7 @@ public void write( SimpleTextUtil.writeNewline(out); SimpleTextUtil.write(out, DOCVALUES_SKIP_INDEX); - SimpleTextUtil.write(out, Boolean.toString(fi.hasDocValuesSkipIndex()), scratch); + SimpleTextUtil.write(out, getDocValuesSkipIndexType(fi.docValuesSkipIndexType()), scratch); SimpleTextUtil.writeNewline(out); SimpleTextUtil.write(out, DOCVALUES_GEN); @@ 
-355,4 +360,8 @@ public void write( private static String getDocValuesType(DocValuesType type) { return type.toString(); } + + private static String getDocValuesSkipIndexType(DocValuesSkipIndexType type) { + return type.toString(); + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java index faba629715b7..6c7c53a38d0e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java @@ -26,8 +26,6 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.CorruptIndexException; @@ -36,6 +34,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; @@ -63,7 +62,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { private final SegmentReadState readState; private final IndexInput dataIn; private final BytesRefBuilder scratch = new BytesRefBuilder(); - private final Map fieldEntries = new HashMap<>(); + private final IntObjectHashMap fieldEntries = new IntObjectHashMap<>(); SimpleTextKnnVectorsReader(SegmentReadState readState) throws IOException { this.readState = readState; @@ -91,9 +90,9 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { for (int i = 0; i < size; i++) { docIds[i] = readInt(in, EMPTY); } - assert fieldEntries.containsKey(fieldName) == false; + assert fieldEntries.containsKey(fieldNumber) == false; fieldEntries.put( - fieldName, + fieldNumber, new FieldEntry( dimension, vectorDataOffset, @@ -126,7 +125,7 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException { throw new IllegalStateException( "KNN vectors readers should not be called on fields that don't enable KNN vectors"); } - FieldEntry fieldEntry = fieldEntries.get(field); + FieldEntry fieldEntry = fieldEntries.get(info.number); if (fieldEntry == null) { // mirror the handling in Lucene90VectorReader#getVectorValues // needed to pass TestSimpleTextKnnVectorsFormat#testDeleteAllVectorDocs @@ -159,7 +158,7 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { throw new IllegalStateException( "KNN vectors readers should not be called on fields that don't enable KNN vectors"); } - FieldEntry fieldEntry = fieldEntries.get(field); + FieldEntry fieldEntry = fieldEntries.get(info.number); if (fieldEntry == null) { // mirror the handling in Lucene90VectorReader#getVectorValues // needed to pass TestSimpleTextKnnVectorsFormat#testDeleteAllVectorDocs @@ -192,8 +191,8 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits } FieldInfo info = readState.fieldInfos.fieldInfo(field); VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction(); - int doc; - while ((doc = values.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + for (int ord = 0; ord < values.size(); ord++) { + int doc = values.ordToDoc(ord); if (acceptDocs != null && 
acceptDocs.get(doc) == false) { continue; } @@ -202,7 +201,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits break; } - float[] vector = values.vectorValue(); + float[] vector = values.vectorValue(ord); float score = vectorSimilarity.compare(vector, target); knnCollector.collect(doc, score); knnCollector.incVisitedCount(1); @@ -223,8 +222,8 @@ public void search(String field, byte[] target, KnnCollector knnCollector, Bits FieldInfo info = readState.fieldInfos.fieldInfo(field); VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction(); - int doc; - while ((doc = values.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + for (int ord = 0; ord < values.size(); ord++) { + int doc = values.ordToDoc(ord); if (acceptDocs != null && acceptDocs.get(doc) == false) { continue; } @@ -233,7 +232,7 @@ public void search(String field, byte[] target, KnnCollector knnCollector, Bits break; } - byte[] vector = values.vectorValue(); + byte[] vector = values.vectorValue(ord); float score = vectorSimilarity.compare(vector, target); knnCollector.collect(doc, score); knnCollector.incVisitedCount(1); @@ -327,35 +326,18 @@ public int size() { } @Override - public float[] vectorValue() { - return values[curOrd]; + public float[] vectorValue(int ord) { + return values[ord]; } @Override - public int docID() { - if (curOrd == -1) { - return -1; - } else if (curOrd >= entry.size()) { - // when call to advance / nextDoc below already returns NO_MORE_DOCS, calling docID - // immediately afterward should also return NO_MORE_DOCS - // this is needed for TestSimpleTextKnnVectorsFormat.testAdvance test case - return NO_MORE_DOCS; - } - - return entry.ordToDoc[curOrd]; - } - - @Override - public int nextDoc() throws IOException { - if (++curOrd < entry.size()) { - return docID(); - } - return NO_MORE_DOCS; + public int ordToDoc(int ord) { + return entry.ordToDoc[ord]; } @Override - public int advance(int target) throws IOException { - return slowAdvance(target); + public DocIndexIterator iterator() { + return createSparseIterator(); } @Override @@ -365,17 +347,19 @@ public VectorScorer scorer(float[] target) { } SimpleTextFloatVectorValues simpleTextFloatVectorValues = new SimpleTextFloatVectorValues(this); + DocIndexIterator iterator = simpleTextFloatVectorValues.iterator(); return new VectorScorer() { @Override public float score() throws IOException { + int ord = iterator.index(); return entry .similarityFunction() - .compare(simpleTextFloatVectorValues.vectorValue(), target); + .compare(simpleTextFloatVectorValues.vectorValue(ord), target); } @Override public DocIdSetIterator iterator() { - return simpleTextFloatVectorValues; + return iterator; } }; } @@ -397,6 +381,11 @@ private void readVector(float[] value) throws IOException { value[i] = Float.parseFloat(floatStrings[i]); } } + + @Override + public SimpleTextFloatVectorValues copy() { + return this; + } } private static class SimpleTextByteVectorValues extends ByteVectorValues { @@ -439,36 +428,19 @@ public int size() { } @Override - public byte[] vectorValue() { - binaryValue.bytes = values[curOrd]; + public byte[] vectorValue(int ord) { + binaryValue.bytes = values[ord]; return binaryValue.bytes; } @Override - public int docID() { - if (curOrd == -1) { - return -1; - } else if (curOrd >= entry.size()) { - // when call to advance / nextDoc below already returns NO_MORE_DOCS, calling docID - // immediately afterward should also return NO_MORE_DOCS - // this is needed for TestSimpleTextKnnVectorsFormat.testAdvance 
test case - return NO_MORE_DOCS; - } - - return entry.ordToDoc[curOrd]; - } - - @Override - public int nextDoc() throws IOException { - if (++curOrd < entry.size()) { - return docID(); - } - return NO_MORE_DOCS; + public int ordToDoc(int ord) { + return entry.ordToDoc[ord]; } @Override - public int advance(int target) throws IOException { - return slowAdvance(target); + public DocIndexIterator iterator() { + return createSparseIterator(); } @Override @@ -478,16 +450,19 @@ public VectorScorer scorer(byte[] target) { } SimpleTextByteVectorValues simpleTextByteVectorValues = new SimpleTextByteVectorValues(this); return new VectorScorer() { + DocIndexIterator it = simpleTextByteVectorValues.iterator(); + @Override public float score() throws IOException { + int ord = it.index(); return entry .similarityFunction() - .compare(simpleTextByteVectorValues.vectorValue(), target); + .compare(simpleTextByteVectorValues.vectorValue(ord), target); } @Override public DocIdSetIterator iterator() { - return simpleTextByteVectorValues; + return it; } }; } @@ -509,6 +484,11 @@ private void readVector(byte[] value) throws IOException { value[i] = (byte) Float.parseFloat(floatStrings[i]); } } + + @Override + public SimpleTextByteVectorValues copy() { + return this; + } } private int readInt(IndexInput in, BytesRef field) throws IOException { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java index a7a76ac1bb98..eaf4b657755c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; @@ -77,19 +78,18 @@ public void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, throws IOException { long vectorDataOffset = vectorData.getFilePointer(); List docIds = new ArrayList<>(); - for (int docV = floatVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = floatVectorValues.nextDoc()) { - writeFloatVectorValue(floatVectorValues); - docIds.add(docV); + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { + writeFloatVectorValue(floatVectorValues, iter.index()); + docIds.add(docId); } long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds); } - private void writeFloatVectorValue(FloatVectorValues vectors) throws IOException { + private void writeFloatVectorValue(FloatVectorValues vectors, int ord) throws IOException { // write vector value - float[] value = vectors.vectorValue(); + float[] value = vectors.vectorValue(ord); assert value.length == vectors.dimension(); write(vectorData, Arrays.toString(value)); newline(vectorData); @@ -100,19 +100,18 @@ public void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, i throws IOException { long vectorDataOffset = vectorData.getFilePointer(); List docIds = new ArrayList<>(); - for (int docV = byteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV 
= byteVectorValues.nextDoc()) { - writeByteVectorValue(byteVectorValues); + KnnVectorValues.DocIndexIterator it = byteVectorValues.iterator(); + for (int docV = it.nextDoc(); docV != NO_MORE_DOCS; docV = it.nextDoc()) { + writeByteVectorValue(byteVectorValues, it.index()); docIds.add(docV); } long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds); } - private void writeByteVectorValue(ByteVectorValues vectors) throws IOException { + private void writeByteVectorValue(ByteVectorValues vectors, int ord) throws IOException { // write vector value - byte[] value = vectors.vectorValue(); + byte[] value = vectors.vectorValue(ord); assert value.length == vectors.dimension(); write(vectorData, Arrays.toString(value)); newline(vectorData); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextLiveDocsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextLiveDocsFormat.java index 40d14f3368be..50e9bcb0c7ed 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextLiveDocsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextLiveDocsFormat.java @@ -144,14 +144,7 @@ public void files(SegmentCommitInfo info, Collection files) throws IOExc } // read-only - static class SimpleTextBits implements Bits { - final BitSet bits; - final int size; - - SimpleTextBits(BitSet bits, int size) { - this.bits = bits; - this.size = size; - } + record SimpleTextBits(BitSet bits, int size) implements Bits { @Override public boolean get(int index) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java index 1f876e5e9d14..140d62b4e967 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java @@ -17,13 +17,13 @@ package org.apache.lucene.codecs.uniformsplit; -import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE; import java.io.IOException; import org.apache.lucene.codecs.BlockTermState; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.TermState; @@ -34,7 +34,7 @@ /** * {@link TermState} serializer which encodes each file pointer as a delta relative to a base file - * pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file + * pointer. It differs from {@link Lucene101PostingsWriter#encodeTerm} which encodes each file * pointer as a delta relative to the previous file pointer. * *
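
Stepping back from the SimpleTextKnnVectors hunks above: float and byte vector values no longer act as their own DocIdSetIterator (docID/nextDoc/advance plus an implicit cursor for vectorValue()); vectors are now addressed by dense ordinal, and a separate DocIndexIterator maps ordinals to doc ids. A minimal consumer-side sketch of the new shape, assuming values is a FloatVectorValues obtained from a reader (it mirrors the loop the writer above now uses):

    // Iterate every vector of a field with the ord-based API.
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      int ord = it.index();                      // dense ordinal of the current vector
      float[] vector = values.vectorValue(ord);  // random access by ordinal, no implicit cursor
      // ... score or copy the vector for doc ...
    }
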
<p>It automatically sets the base file pointer to the first valid file pointer for doc start FP, @@ -95,7 +95,7 @@ public long getBasePayStartFP() { /** * Writes a {@link BlockTermState} to the provided {@link DataOutput}. * - *
<p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo, + *
<p>Simpler variant of {@link Lucene101PostingsWriter#encodeTerm(DataOutput, FieldInfo, * BlockTermState, boolean)}. */ public void writeTermState( @@ -145,7 +145,7 @@ public void writeTermState( /** * Reads a {@link BlockTermState} from the provided {@link DataInput}. * - *
<p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo, + *

Simpler variant of {@link Lucene101PostingsReader#decodeTerm(DataInput, FieldInfo, * BlockTermState, boolean)}. * * @param reuse {@link BlockTermState} to reuse; or null to create a new one. diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java index c8a19bf9da94..690eab214003 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java @@ -23,8 +23,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; @@ -113,7 +113,7 @@ protected UniformSplitPostingsFormat( @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); boolean success = false; try { FieldsConsumer termsWriter = @@ -130,7 +130,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene912PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene101PostingsReader(state); boolean success = false; try { FieldsProducer termsReader = diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/package-info.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/package-info.java index dc77bc710a1c..d31b28704ef7 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/package-info.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/package-info.java @@ -28,7 +28,7 @@ * org.apache.lucene.search.PhraseQuery}) *

  • Quite efficient for {@link org.apache.lucene.search.PrefixQuery} *
  • Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case - * prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat} + * prefer {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat} * */ package org.apache.lucene.codecs.uniformsplit; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/FieldMetadataTermState.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/FieldMetadataTermState.java index 36d618752c32..83887656bc97 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/FieldMetadataTermState.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/FieldMetadataTermState.java @@ -25,13 +25,4 @@ * * @lucene.experimental */ -public class FieldMetadataTermState { - - public final FieldMetadata fieldMetadata; - public final BlockTermState state; - - public FieldMetadataTermState(FieldMetadata fieldMetadata, BlockTermState state) { - this.fieldMetadata = fieldMetadata; - this.state = state; - } -} +public record FieldMetadataTermState(FieldMetadata fieldMetadata, BlockTermState state) {} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockLine.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockLine.java index 49352fd1c47d..fad572177a0b 100755 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockLine.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockLine.java @@ -57,7 +57,7 @@ public STBlockLine(TermBytes termBytes, List termStates) */ public void collectFields(Collection collector) { for (FieldMetadataTermState fieldTermState : termStates) { - collector.add(fieldTermState.fieldMetadata); + collector.add(fieldTermState.fieldMetadata()); } } @@ -82,13 +82,13 @@ public void writeLineTermStates( assert size > 0 : "not valid block line with :" + size + " lines."; if (size == 1) { // When there is only 1 field, write its id as negative, followed by the field TermState. - int fieldID = line.termStates.get(0).fieldMetadata.getFieldInfo().number; + int fieldID = line.termStates.get(0).fieldMetadata().getFieldInfo().number; termStatesOutput.writeZInt(-fieldID); fieldMetadataTermState = line.termStates.get(0); encoder.writeTermState( termStatesOutput, - fieldMetadataTermState.fieldMetadata.getFieldInfo(), - fieldMetadataTermState.state); + fieldMetadataTermState.fieldMetadata().getFieldInfo(), + fieldMetadataTermState.state()); return; } @@ -96,15 +96,15 @@ public void writeLineTermStates( // First iteration writes the fields ids. for (int i = 0; i < size; i++) { fieldMetadataTermState = line.termStates.get(i); - termStatesOutput.writeVInt(fieldMetadataTermState.fieldMetadata.getFieldInfo().number); + termStatesOutput.writeVInt(fieldMetadataTermState.fieldMetadata().getFieldInfo().number); } // Second iteration writes the corresponding field TermStates. 
for (int i = 0; i < size; i++) { fieldMetadataTermState = line.termStates.get(i); encoder.writeTermState( termStatesOutput, - fieldMetadataTermState.fieldMetadata.getFieldInfo(), - fieldMetadataTermState.state); + fieldMetadataTermState.fieldMetadata().getFieldInfo(), + fieldMetadataTermState.state()); } } diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java index ab20ee67c8c9..388f08792565 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java @@ -22,7 +22,6 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnByteVectorField; @@ -38,16 +37,12 @@ import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase { @Override protected Codec getCodec() { - return new Lucene912Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new HnswBitVectorsFormat(); - } - }; + return TestUtil.alwaysKnnVectorsFormat(new HnswBitVectorsFormat()); } @Override diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/lucene90/tests/MockTermStateFactory.java b/lucene/codecs/src/test/org/apache/lucene/codecs/lucene90/tests/MockTermStateFactory.java index 51891b9d0adc..0708f3b8050b 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/lucene90/tests/MockTermStateFactory.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/lucene90/tests/MockTermStateFactory.java @@ -17,7 +17,7 @@ package org.apache.lucene.codecs.lucene90.tests; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; /** Test utility class to create mock {@link IntBlockTermState}. 
*/ public class MockTermStateFactory { diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java index 24f144481785..8710b846f937 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Collections; import org.apache.lucene.codecs.lucene90.tests.MockTermStateFactory; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; @@ -111,7 +112,7 @@ private static FieldInfo getMockFieldInfo(String fieldName, int number) { true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, Collections.emptyMap(), 0, diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java index 7a6524d77cdc..b2190c19ffeb 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java @@ -34,6 +34,7 @@ import org.apache.lucene.codecs.uniformsplit.FieldMetadata; import org.apache.lucene.codecs.uniformsplit.IndexDictionary; import org.apache.lucene.codecs.uniformsplit.TermBytes; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -198,7 +199,7 @@ private static FieldInfo mockFieldInfo(String fieldName, int number) { true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, Collections.emptyMap(), 0, @@ -225,15 +226,7 @@ private static List generateBlockLines( return lines; } - private static class BlockLineDefinition { - final TermBytes termBytes; - final List fields; - - BlockLineDefinition(TermBytes termBytes, List fields) { - this.termBytes = termBytes; - this.fields = fields; - } - } + private record BlockLineDefinition(TermBytes termBytes, List fields) {} private static class MockSTBlockLine extends STBlockLine { diff --git a/lucene/core/src/generated/checksums/generateForDeltaUtil.json b/lucene/core/src/generated/checksums/generateForDeltaUtil.json new file mode 100644 index 000000000000..5cc8a3fe5a0e --- /dev/null +++ b/lucene/core/src/generated/checksums/generateForDeltaUtil.json @@ -0,0 +1,4 @@ +{ + "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "0ff7fb9159693055d9e4b9468b004166156f6550", + "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "8c55b7aaced028388408c5eb968b1f1197e11142" +} \ No newline at end of file diff --git a/lucene/core/src/generated/checksums/generateForUtil.json b/lucene/core/src/generated/checksums/generateForUtil.json index 752285f4d7fe..6f61f8fc2c75 100644 --- a/lucene/core/src/generated/checksums/generateForUtil.json +++ b/lucene/core/src/generated/checksums/generateForUtil.json @@ -1,4 +1,4 @@ { - "lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "5ff856e80cab30f9e5704aa89f3197f017d07624", - 
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "3ccf92b3ddbff6340a13e8a55090bfb900dc7be2" + "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java": "10ceb79f031232bc1e4564db7e3ebb16eedd2e0a", + "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py": "d69e734bce30375952046a3776bbb7a5c1edbd51" } \ No newline at end of file diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index a0f0bad01eb2..85aff5722498 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -import org.apache.lucene.codecs.lucene912.Lucene912Codec; - /** Lucene Core. */ @SuppressWarnings("module") // the test framework is compiled after the core... module org.apache.lucene.core { @@ -33,7 +31,7 @@ exports org.apache.lucene.codecs.lucene94; exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene99; - exports org.apache.lucene.codecs.lucene912; + exports org.apache.lucene.codecs.lucene101; exports org.apache.lucene.codecs.perfield; exports org.apache.lucene.codecs; exports org.apache.lucene.document; @@ -72,7 +70,7 @@ provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; provides org.apache.lucene.codecs.Codec with - Lucene912Codec; + org.apache.lucene.codecs.lucene101.Lucene101Codec; provides org.apache.lucene.codecs.DocValuesFormat with org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with @@ -80,7 +78,7 @@ org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat, org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat; provides org.apache.lucene.codecs.PostingsFormat with - org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; + org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; provides org.apache.lucene.index.SortFieldProvider with org.apache.lucene.search.SortField.Provider, org.apache.lucene.search.SortedNumericSortField.Provider, diff --git a/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java b/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java index d60d386ec5b8..28493c168bce 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java @@ -170,24 +170,8 @@ public void end() throws IOException { } /** Edge between position nodes. 
These edges will be output as tokens in the TokenStream */ - private static class EdgeToken { - public final int destination; - public final int value; - - public EdgeToken(int destination, int value) { - this.destination = destination; - this.value = value; - } - } + private record EdgeToken(int destination, int value) {} /** Node that contains original node id and position in TokenStream */ - private static class RemapNode { - public final int id; - public final int pos; - - public RemapNode(int id, int pos) { - this.id = id; - this.pos = pos; - } - } + private record RemapNode(int id, int pos) {} } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java index 8a9b4816571e..96b0f75a259f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java @@ -20,14 +20,16 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.function.Supplier; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.Sorter; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.index.SortingCodecReader; +import org.apache.lucene.index.SortingCodecReader.SortingValuesIterator; +import org.apache.lucene.search.DocIdSet; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; @@ -80,24 +82,26 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { case FLOAT32: BufferedFloatVectorValues bufferedFloatVectorValues = new BufferedFloatVectorValues( - fieldData.docsWithField, (List) fieldData.vectors, - fieldData.fieldInfo.getVectorDimension()); + fieldData.fieldInfo.getVectorDimension(), + fieldData.docsWithField); FloatVectorValues floatVectorValues = sortMap != null - ? new SortingFloatVectorValues(bufferedFloatVectorValues, sortMap) + ? new SortingFloatVectorValues( + bufferedFloatVectorValues, fieldData.docsWithField, sortMap) : bufferedFloatVectorValues; writeField(fieldData.fieldInfo, floatVectorValues, maxDoc); break; case BYTE: BufferedByteVectorValues bufferedByteVectorValues = new BufferedByteVectorValues( - fieldData.docsWithField, (List) fieldData.vectors, - fieldData.fieldInfo.getVectorDimension()); + fieldData.fieldInfo.getVectorDimension(), + fieldData.docsWithField); ByteVectorValues byteVectorValues = sortMap != null - ? new SortingByteVectorValues(bufferedByteVectorValues, sortMap) + ? 
new SortingByteVectorValues( + bufferedByteVectorValues, fieldData.docsWithField, sortMap) : bufferedByteVectorValues; writeField(fieldData.fieldInfo, byteVectorValues, maxDoc); break; @@ -107,125 +111,77 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { /** Sorting FloatVectorValues that iterate over documents in the order of the provided sortMap */ private static class SortingFloatVectorValues extends FloatVectorValues { - private final BufferedFloatVectorValues randomAccess; - private final int[] docIdOffsets; - private int docId = -1; + private final BufferedFloatVectorValues delegate; + private final Supplier iteratorSupplier; - SortingFloatVectorValues(BufferedFloatVectorValues delegate, Sorter.DocMap sortMap) + SortingFloatVectorValues( + BufferedFloatVectorValues delegate, DocsWithFieldSet docsWithField, Sorter.DocMap sortMap) throws IOException { - this.randomAccess = delegate.copy(); - this.docIdOffsets = new int[sortMap.size()]; - - int offset = 1; // 0 means no vector for this (field, document) - int docID; - while ((docID = delegate.nextDoc()) != NO_MORE_DOCS) { - int newDocID = sortMap.oldToNew(docID); - docIdOffsets[newDocID] = offset++; - } - } - - @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - while (docId < docIdOffsets.length - 1) { - ++docId; - if (docIdOffsets[docId] != 0) { - return docId; - } - } - docId = NO_MORE_DOCS; - return docId; + this.delegate = delegate.copy(); + iteratorSupplier = SortingCodecReader.iteratorSupplier(delegate, sortMap); } @Override - public float[] vectorValue() throws IOException { - return randomAccess.vectorValue(docIdOffsets[docId] - 1); + public float[] vectorValue(int ord) throws IOException { + return delegate.vectorValue(ord); } @Override public int dimension() { - return randomAccess.dimension(); + return delegate.dimension(); } @Override public int size() { - return randomAccess.size(); + return delegate.size(); } @Override - public int advance(int target) throws IOException { + public SortingFloatVectorValues copy() { throw new UnsupportedOperationException(); } @Override - public VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return iteratorSupplier.get(); } } - /** Sorting FloatVectorValues that iterate over documents in the order of the provided sortMap */ + /** Sorting ByteVectorValues that iterate over documents in the order of the provided sortMap */ private static class SortingByteVectorValues extends ByteVectorValues { - private final BufferedByteVectorValues randomAccess; - private final int[] docIdOffsets; - private int docId = -1; + private final BufferedByteVectorValues delegate; + private final Supplier iteratorSupplier; - SortingByteVectorValues(BufferedByteVectorValues delegate, Sorter.DocMap sortMap) + SortingByteVectorValues( + BufferedByteVectorValues delegate, DocsWithFieldSet docsWithField, Sorter.DocMap sortMap) throws IOException { - this.randomAccess = delegate.copy(); - this.docIdOffsets = new int[sortMap.size()]; - - int offset = 1; // 0 means no vector for this (field, document) - int docID; - while ((docID = delegate.nextDoc()) != NO_MORE_DOCS) { - int newDocID = sortMap.oldToNew(docID); - docIdOffsets[newDocID] = offset++; - } - } - - @Override - public int docID() { - return docId; + this.delegate = delegate; + iteratorSupplier = SortingCodecReader.iteratorSupplier(delegate, sortMap); } @Override - public int nextDoc() throws 
IOException { - while (docId < docIdOffsets.length - 1) { - ++docId; - if (docIdOffsets[docId] != 0) { - return docId; - } - } - docId = NO_MORE_DOCS; - return docId; - } - - @Override - public byte[] vectorValue() throws IOException { - return randomAccess.vectorValue(docIdOffsets[docId] - 1); + public byte[] vectorValue(int ord) throws IOException { + return delegate.vectorValue(ord); } @Override public int dimension() { - return randomAccess.dimension(); + return delegate.dimension(); } @Override public int size() { - return randomAccess.size(); + return delegate.size(); } @Override - public int advance(int target) throws IOException { + public SortingByteVectorValues copy() { throw new UnsupportedOperationException(); } @Override - public VectorScorer scorer(byte[] target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return iteratorSupplier.get(); } } @@ -296,7 +252,9 @@ public final void addValue(int docID, T value) { @Override public final long ramBytesUsed() { - if (vectors.size() == 0) return 0; + if (vectors.isEmpty()) { + return 0; + } return docsWithField.ramBytesUsed() + vectors.size() * (long) @@ -307,25 +265,18 @@ public final long ramBytesUsed() { } private static class BufferedFloatVectorValues extends FloatVectorValues { - final DocsWithFieldSet docsWithField; - // These are always the vectors of a VectorValuesWriter, which are copied when added to it final List vectors; final int dimension; + private final DocIdSet docsWithField; + private final DocIndexIterator iterator; - DocIdSetIterator docsWithFieldIter; - int ord = -1; - - BufferedFloatVectorValues( - DocsWithFieldSet docsWithField, List vectors, int dimension) { - this.docsWithField = docsWithField; + BufferedFloatVectorValues(List vectors, int dimension, DocIdSet docsWithField) + throws IOException { this.vectors = vectors; this.dimension = dimension; - docsWithFieldIter = docsWithField.iterator(); - } - - public BufferedFloatVectorValues copy() { - return new BufferedFloatVectorValues(docsWithField, vectors, dimension); + this.docsWithField = docsWithField; + this.iterator = fromDISI(docsWithField.iterator()); } @Override @@ -339,58 +290,39 @@ public int size() { } @Override - public float[] vectorValue() { - return vectors.get(ord); - } - - float[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); + public int ordToDoc(int ord) { + return ord; } @Override - public int docID() { - return docsWithFieldIter.docID(); - } - - @Override - public int nextDoc() throws IOException { - int docID = docsWithFieldIter.nextDoc(); - if (docID != NO_MORE_DOCS) { - ++ord; - } - return docID; + public float[] vectorValue(int targetOrd) { + return vectors.get(targetOrd); } @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return iterator; } @Override - public VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); + public BufferedFloatVectorValues copy() throws IOException { + return new BufferedFloatVectorValues(vectors, dimension, docsWithField); } } private static class BufferedByteVectorValues extends ByteVectorValues { - final DocsWithFieldSet docsWithField; - // These are always the vectors of a VectorValuesWriter, which are copied when added to it final List vectors; final int dimension; + private final DocIdSet docsWithField; + private final DocIndexIterator iterator; - DocIdSetIterator docsWithFieldIter; - int ord = -1; - - 
BufferedByteVectorValues(DocsWithFieldSet docsWithField, List vectors, int dimension) { - this.docsWithField = docsWithField; + BufferedByteVectorValues(List vectors, int dimension, DocIdSet docsWithField) + throws IOException { this.vectors = vectors; this.dimension = dimension; - docsWithFieldIter = docsWithField.iterator(); - } - - public BufferedByteVectorValues copy() { - return new BufferedByteVectorValues(docsWithField, vectors, dimension); + this.docsWithField = docsWithField; + iterator = fromDISI(docsWithField.iterator()); } @Override @@ -404,36 +336,18 @@ public int size() { } @Override - public byte[] vectorValue() { - return vectors.get(ord); - } - - byte[] vectorValue(int targetOrd) { + public byte[] vectorValue(int targetOrd) { return vectors.get(targetOrd); } @Override - public int docID() { - return docsWithFieldIter.docID(); + public DocIndexIterator iterator() { + return iterator; } @Override - public int nextDoc() throws IOException { - int docID = docsWithFieldIter.nextDoc(); - if (docID != NO_MORE_DOCS) { - ++ord; - } - return docID; - } - - @Override - public int advance(int target) { - throw new UnsupportedOperationException(); - } - - @Override - public VectorScorer scorer(byte[] target) { - throw new UnsupportedOperationException(); + public BufferedByteVectorValues copy() throws IOException { + return new BufferedByteVectorValues(vectors, dimension, docsWithField); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java index e5a5dac8ff54..50974d13ff34 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java @@ -55,7 +55,7 @@ static NamedSPILoader getLoader() { return LOADER; } - static Codec defaultCodec = LOADER.lookup("Lucene912"); + static Codec defaultCodec = LOADER.lookup("Lucene101"); } private final String name; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CompetitiveImpactAccumulator.java b/lucene/core/src/java/org/apache/lucene/codecs/CompetitiveImpactAccumulator.java index 77d38d290f83..88f37b5a11ca 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/CompetitiveImpactAccumulator.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/CompetitiveImpactAccumulator.java @@ -18,7 +18,6 @@ import java.util.ArrayList; import java.util.Arrays; -import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.TreeSet; @@ -39,20 +38,17 @@ public final class CompetitiveImpactAccumulator { /** Sole constructor. */ public CompetitiveImpactAccumulator() { maxFreqs = new int[256]; - Comparator comparator = - new Comparator() { - @Override - public int compare(Impact o1, Impact o2) { - // greater freqs compare greater - int cmp = Integer.compare(o1.freq, o2.freq); - if (cmp == 0) { - // greater norms compare lower - cmp = Long.compareUnsigned(o2.norm, o1.norm); - } - return cmp; - } - }; - otherFreqNormPairs = new TreeSet<>(comparator); + otherFreqNormPairs = + new TreeSet<>( + (o1, o2) -> { + // greater freqs compare greater + int cmp = Integer.compare(o1.freq, o2.freq); + if (cmp == 0) { + // greater norms compare lower + cmp = Long.compareUnsigned(o2.norm, o1.norm); + } + return cmp; + }); } /** Reset to the same state it was in after creation. 
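The CompetitiveImpactAccumulator change above only swaps an anonymous Comparator for a lambda; the ordering is unchanged. A self-contained sketch of that ordering (the Impact record below is a stand-in for Lucene's Impact class, which exposes a freq and a norm):

```java
import java.util.Comparator;
import java.util.TreeSet;

public class ImpactOrderDemo {
  // Stand-in for Lucene's Impact (freq, norm) pair; illustrative only.
  record Impact(int freq, long norm) {}

  public static void main(String[] args) {
    // Same ordering as the lambda above: ascending freq, ties broken by
    // descending *unsigned* norm, so norm -1L (the largest unsigned value) sorts first.
    Comparator<Impact> cmp =
        (o1, o2) -> {
          int c = Integer.compare(o1.freq(), o2.freq());
          if (c == 0) {
            c = Long.compareUnsigned(o2.norm(), o1.norm());
          }
          return c;
        };
    TreeSet<Impact> set = new TreeSet<>(cmp);
    set.add(new Impact(3, 7L));
    set.add(new Impact(3, -1L));
    set.add(new Impact(1, 7L));
    System.out.println(set); // [Impact[freq=1, norm=7], Impact[freq=3, norm=-1], Impact[freq=3, norm=7]]
  }
}
```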
*/ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java index 371e192887b8..6a7e75f267e7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java @@ -35,8 +35,8 @@ protected CompoundFormat() {} // we can add 'producer' classes. /** Returns a Directory view (read-only) for the compound files in this segment */ - public abstract CompoundDirectory getCompoundReader( - Directory dir, SegmentInfo si, IOContext context) throws IOException; + public abstract CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) + throws IOException; /** * Packs the provided segment's files into a compound format. All files referenced by the provided diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java index cbb906788e5d..08c08ec50754 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java @@ -613,7 +613,7 @@ public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) if (docValuesProducer != null) { FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { - values = docValuesProducer.getSorted(fieldInfo); + values = docValuesProducer.getSorted(readerFieldInfo); } } if (values == null) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java index 2c90448a39c5..b8b9f68b52dc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java @@ -19,6 +19,7 @@ import java.io.Closeable; import java.io.IOException; import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; @@ -77,7 +78,7 @@ protected DocValuesProducer() {} /** * Returns a {@link DocValuesSkipper} for this field. The returned instance need not be * thread-safe: it will only be used by a single thread. The return value is undefined if {@link - * FieldInfo#hasDocValuesSkipIndex()} doesn't return {@code true}. + * FieldInfo#docValuesSkipIndexType()} returns {@link DocValuesSkipIndexType#NONE}. 
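The javadoc above reflects the same API migration seen in the FieldInfo constructor calls earlier in this patch: the boolean hasDocValuesSkipIndex flag becomes the DocValuesSkipIndexType enum. A hedged sketch of the resulting caller-side guard (the helper class and method names are illustrative, not part of the patch):

```java
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.DocValuesSkipIndexType;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.FieldInfo;

final class SkipperSupport {
  private SkipperSupport() {}

  /** Returns a skipper only when the field actually indexed one. */
  static DocValuesSkipper skipperOrNull(DocValuesProducer producer, FieldInfo field)
      throws IOException {
    if (field.docValuesSkipIndexType() == DocValuesSkipIndexType.NONE) {
      return null; // per the javadoc above, getSkipper's result is undefined here
    }
    return producer.getSkipper(field);
  }
}
```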
*/ public abstract DocValuesSkipper getSkipper(FieldInfo field) throws IOException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index 28f9995b11ed..50af32a7e162 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -28,7 +28,9 @@ import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorEncoding; @@ -54,28 +56,26 @@ protected KnnVectorsWriter() {} @SuppressWarnings("unchecked") public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { switch (fieldInfo.getVectorEncoding()) { - case BYTE: + case BYTE -> { KnnFieldVectorsWriter byteWriter = (KnnFieldVectorsWriter) addField(fieldInfo); ByteVectorValues mergedBytes = MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); - for (int doc = mergedBytes.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = mergedBytes.nextDoc()) { - byteWriter.addValue(doc, mergedBytes.vectorValue()); + KnnVectorValues.DocIndexIterator iter = mergedBytes.iterator(); + for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { + byteWriter.addValue(doc, mergedBytes.vectorValue(iter.index())); } - break; - case FLOAT32: + } + case FLOAT32 -> { KnnFieldVectorsWriter floatWriter = (KnnFieldVectorsWriter) addField(fieldInfo); FloatVectorValues mergedFloats = MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); - for (int doc = mergedFloats.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = mergedFloats.nextDoc()) { - floatWriter.addValue(doc, mergedFloats.vectorValue()); + KnnVectorValues.DocIndexIterator iter = mergedFloats.iterator(); + for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { + floatWriter.addValue(doc, mergedFloats.vectorValue(iter.index())); } - break; + } } } @@ -116,32 +116,44 @@ public final void merge(MergeState mergeState) throws IOException { private static class FloatVectorValuesSub extends DocIDMerger.Sub { final FloatVectorValues values; + final KnnVectorValues.DocIndexIterator iterator; FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) { super(docMap); this.values = values; - assert values.docID() == -1; + this.iterator = values.iterator(); + assert iterator.docID() == -1; } @Override public int nextDoc() throws IOException { - return values.nextDoc(); + return iterator.nextDoc(); + } + + public int index() { + return iterator.index(); } } private static class ByteVectorValuesSub extends DocIDMerger.Sub { final ByteVectorValues values; + final KnnVectorValues.DocIndexIterator iterator; ByteVectorValuesSub(MergeState.DocMap docMap, ByteVectorValues values) { super(docMap); this.values = values; - assert values.docID() == -1; + iterator = values.iterator(); + assert iterator.docID() == -1; } @Override public int nextDoc() throws IOException { - return values.nextDoc(); + return iterator.nextDoc(); + } + + int index() { + return iterator.index(); } } @@ -212,14 +224,35 @@ private static void validateFieldEncoding(FieldInfo fieldInfo, 
VectorEncoding ex } } + /** + * Returns true if the fieldInfos has vector values for the field. + * + * @param fieldInfos fieldInfos for the segment + * @param fieldName field name + * @return true if the fieldInfos has vector values for the field. + */ + public static boolean hasVectorValues(FieldInfos fieldInfos, String fieldName) { + if (fieldInfos.hasVectorValues() == false) { + return false; + } + FieldInfo info = fieldInfos.fieldInfo(fieldName); + return info != null && info.hasVectorValues(); + } + private static List mergeVectorValues( KnnVectorsReader[] knnVectorsReaders, MergeState.DocMap[] docMaps, + FieldInfo mergingField, + FieldInfos[] sourceFieldInfos, IOFunction valuesSupplier, BiFunction newSub) throws IOException { List subs = new ArrayList<>(); for (int i = 0; i < knnVectorsReaders.length; i++) { + FieldInfos sourceFieldInfo = sourceFieldInfos[i]; + if (hasVectorValues(sourceFieldInfo, mergingField.name) == false) { + continue; + } KnnVectorsReader knnVectorsReader = knnVectorsReaders[i]; if (knnVectorsReader != null) { V values = valuesSupplier.apply(knnVectorsReader); @@ -239,12 +272,10 @@ public static FloatVectorValues mergeFloatVectorValues( mergeVectorValues( mergeState.knnVectorsReaders, mergeState.docMaps, - knnVectorsReader -> { - return knnVectorsReader.getFloatVectorValues(fieldInfo.name); - }, - (docMap, values) -> { - return new FloatVectorValuesSub(docMap, values); - }), + fieldInfo, + mergeState.fieldInfos, + knnVectorsReader -> knnVectorsReader.getFloatVectorValues(fieldInfo.name), + FloatVectorValuesSub::new), mergeState); } @@ -256,12 +287,10 @@ public static ByteVectorValues mergeByteVectorValues(FieldInfo fieldInfo, MergeS mergeVectorValues( mergeState.knnVectorsReaders, mergeState.docMaps, - knnVectorsReader -> { - return knnVectorsReader.getByteVectorValues(fieldInfo.name); - }, - (docMap, values) -> { - return new ByteVectorValuesSub(docMap, values); - }), + fieldInfo, + mergeState.fieldInfos, + knnVectorsReader -> knnVectorsReader.getByteVectorValues(fieldInfo.name), + ByteVectorValuesSub::new), mergeState); } @@ -269,7 +298,8 @@ static class MergedFloat32VectorValues extends FloatVectorValues { private final List subs; private final DocIDMerger docIdMerger; private final int size; - private int docId; + private int docId = -1; + private int lastOrd = -1; FloatVectorValuesSub current; private MergedFloat32VectorValues(List subs, MergeState mergeState) @@ -281,33 +311,59 @@ private MergedFloat32VectorValues(List subs, MergeState me totalSize += sub.values.size(); } size = totalSize; - docId = -1; } @Override - public int docID() { - return docId; - } + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int index = -1; - @Override - public int nextDoc() throws IOException { - current = docIdMerger.next(); - if (current == null) { - docId = NO_MORE_DOCS; - } else { - docId = current.mappedDocID; - } - return docId; - } + @Override + public int docID() { + return docId; + } - @Override - public float[] vectorValue() throws IOException { - return current.values.vectorValue(); + @Override + public int index() { + return index; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + index = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ++lastOrd; + ++index; + } + return docId; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long 
cost() { + return size; + } + }; } @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + public float[] vectorValue(int ord) throws IOException { + if (ord != lastOrd) { + throw new IllegalStateException( + "only supports forward iteration with a single iterator: ord=" + + ord + + ", lastOrd=" + + lastOrd); + } + return current.values.vectorValue(current.index()); } @Override @@ -320,10 +376,20 @@ public int dimension() { return subs.get(0).values.dimension(); } + @Override + public int ordToDoc(int ord) { + throw new UnsupportedOperationException(); + } + @Override public VectorScorer scorer(float[] target) { throw new UnsupportedOperationException(); } + + @Override + public FloatVectorValues copy() { + throw new UnsupportedOperationException(); + } } static class MergedByteVectorValues extends ByteVectorValues { @@ -331,7 +397,8 @@ static class MergedByteVectorValues extends ByteVectorValues { private final DocIDMerger docIdMerger; private final int size; - private int docId; + private int lastOrd = -1; + private int docId = -1; ByteVectorValuesSub current; private MergedByteVectorValues(List subs, MergeState mergeState) @@ -343,33 +410,57 @@ private MergedByteVectorValues(List subs, MergeState mergeS totalSize += sub.values.size(); } size = totalSize; - docId = -1; } @Override - public byte[] vectorValue() throws IOException { - return current.values.vectorValue(); - } - - @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - current = docIdMerger.next(); - if (current == null) { - docId = NO_MORE_DOCS; + public byte[] vectorValue(int ord) throws IOException { + if (ord != lastOrd + 1) { + throw new IllegalStateException( + "only supports forward iteration: ord=" + ord + ", lastOrd=" + lastOrd); } else { - docId = current.mappedDocID; + lastOrd = ord; } - return docId; + return current.values.vectorValue(current.index()); } @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int index = -1; + + @Override + public int docID() { + return docId; + } + + @Override + public int index() { + return index; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + index = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ++index; + } + return docId; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return size; + } + }; } @Override @@ -382,10 +473,20 @@ public int dimension() { return subs.get(0).values.dimension(); } + @Override + public int ordToDoc(int ord) { + throw new UnsupportedOperationException(); + } + @Override public VectorScorer scorer(byte[] target) { throw new UnsupportedOperationException(); } + + @Override + public ByteVectorValues copy() { + throw new UnsupportedOperationException(); + } } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermStats.java b/lucene/core/src/java/org/apache/lucene/codecs/TermStats.java index 60c522e6d69d..306f4e246173 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/TermStats.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/TermStats.java @@ -16,24 +16,10 @@ */ package org.apache.lucene.codecs; -import org.apache.lucene.index.TermsEnum; // javadocs - /** * Holder for per-term statistics. 
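The merged vector values above follow the iteration contract this patch moves to throughout: callers obtain a DocIndexIterator, advance it with nextDoc(), and fetch vectors by the iterator's ordinal rather than through an implicit "current document" on the values object. A minimal consumer sketch (class and method names are illustrative):

```java
import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.search.DocIdSetIterator;

final class ForwardIteration {
  private ForwardIteration() {}

  /** Sum of the first component of every vector, visiting docs in order. */
  static float sumFirstComponents(FloatVectorValues values) throws IOException {
    float sum = 0;
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      sum += values.vectorValue(it.index())[0]; // ordinal-based access, forward only
    }
    return sum;
  }
}
```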
* - * @see TermsEnum#docFreq - * @see TermsEnum#totalTermFreq + * @param docFreq How many documents have at least one occurrence of this term. + * @param totalTermFreq Total number of times this term occurs across all documents in the field. */ -public class TermStats { - /** How many documents have at least one occurrence of this term. */ - public final int docFreq; - - /** Total number of times this term occurs across all documents in the field. */ - public final long totalTermFreq; - - /** Sole constructor. */ - public TermStats(int docFreq, long totalTermFreq) { - this.docFreq = docFreq; - this.totalTermFreq = totalTermFreq; - } -} +public record TermStats(int docFreq, long totalTermFreq) {} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index c112d2ccdbe8..3e506037969a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -18,8 +18,10 @@ package org.apache.lucene.codecs.hnsw; import java.io.IOException; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -34,24 +36,26 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException { - if (vectorValues instanceof RandomAccessVectorValues.Floats floatVectorValues) { - return new FloatScoringSupplier(floatVectorValues, similarityFunction); - } else if (vectorValues instanceof RandomAccessVectorValues.Bytes byteVectorValues) { - return new ByteScoringSupplier(byteVectorValues, similarityFunction); + switch (vectorValues.getEncoding()) { + case FLOAT32 -> { + return new FloatScoringSupplier((FloatVectorValues) vectorValues, similarityFunction); + } + case BYTE -> { + return new ByteScoringSupplier((ByteVectorValues) vectorValues, similarityFunction); + } } throw new IllegalArgumentException( - "vectorValues must be an instance of RandomAccessVectorValues.Floats or RandomAccessVectorValues.Bytes"); + "vectorValues must be an instance of FloatVectorValues or ByteVectorValues, got a " + + vectorValues.getClass().getName()); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException { - assert vectorValues instanceof RandomAccessVectorValues.Floats; + assert vectorValues instanceof FloatVectorValues; if (target.length != vectorValues.dimension()) { throw new IllegalArgumentException( "vector query dimension: " @@ -59,17 +63,14 @@ public RandomVectorScorer getRandomVectorScorer( + " differs from field dimension: " + vectorValues.dimension()); } - return new FloatVectorScorer( - (RandomAccessVectorValues.Floats) vectorValues, target, similarityFunction); + return new 
FloatVectorScorer((FloatVectorValues) vectorValues, target, similarityFunction); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException { - assert vectorValues instanceof RandomAccessVectorValues.Bytes; + assert vectorValues instanceof ByteVectorValues; if (target.length != vectorValues.dimension()) { throw new IllegalArgumentException( "vector query dimension: " @@ -77,8 +78,7 @@ public RandomVectorScorer getRandomVectorScorer( + " differs from field dimension: " + vectorValues.dimension()); } - return new ByteVectorScorer( - (RandomAccessVectorValues.Bytes) vectorValues, target, similarityFunction); + return new ByteVectorScorer((ByteVectorValues) vectorValues, target, similarityFunction); } @Override @@ -88,14 +88,13 @@ public String toString() { /** RandomVectorScorerSupplier for bytes vector */ private static final class ByteScoringSupplier implements RandomVectorScorerSupplier { - private final RandomAccessVectorValues.Bytes vectors; - private final RandomAccessVectorValues.Bytes vectors1; - private final RandomAccessVectorValues.Bytes vectors2; + private final ByteVectorValues vectors; + private final ByteVectorValues vectors1; + private final ByteVectorValues vectors2; private final VectorSimilarityFunction similarityFunction; private ByteScoringSupplier( - RandomAccessVectorValues.Bytes vectors, VectorSimilarityFunction similarityFunction) - throws IOException { + ByteVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { this.vectors = vectors; vectors1 = vectors.copy(); vectors2 = vectors.copy(); @@ -116,18 +115,22 @@ public float score(int node) throws IOException { public RandomVectorScorerSupplier copy() throws IOException { return new ByteScoringSupplier(vectors, similarityFunction); } + + @Override + public String toString() { + return "ByteScoringSupplier(similarityFunction=" + similarityFunction + ")"; + } } /** RandomVectorScorerSupplier for Float vector */ private static final class FloatScoringSupplier implements RandomVectorScorerSupplier { - private final RandomAccessVectorValues.Floats vectors; - private final RandomAccessVectorValues.Floats vectors1; - private final RandomAccessVectorValues.Floats vectors2; + private final FloatVectorValues vectors; + private final FloatVectorValues vectors1; + private final FloatVectorValues vectors2; private final VectorSimilarityFunction similarityFunction; private FloatScoringSupplier( - RandomAccessVectorValues.Floats vectors, VectorSimilarityFunction similarityFunction) - throws IOException { + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { this.vectors = vectors; vectors1 = vectors.copy(); vectors2 = vectors.copy(); @@ -148,18 +151,21 @@ public float score(int node) throws IOException { public RandomVectorScorerSupplier copy() throws IOException { return new FloatScoringSupplier(vectors, similarityFunction); } + + @Override + public String toString() { + return "FloatScoringSupplier(similarityFunction=" + similarityFunction + ")"; + } } /** A {@link RandomVectorScorer} for float vectors. 
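The rewritten DefaultFlatVectorScorer dispatches on KnnVectorValues.getEncoding() rather than on the removed RandomAccessVectorValues subtypes. A small sketch of the same idiom (names below are illustrative, not part of the patch):

```java
import org.apache.lucene.index.KnnVectorValues;

final class EncodingSizes {
  private EncodingSizes() {}

  /** Raw bytes needed to store one vector for the given values object. */
  static int bytesPerVector(KnnVectorValues values) {
    switch (values.getEncoding()) {
      case FLOAT32 -> {
        return values.dimension() * Float.BYTES;
      }
      case BYTE -> {
        return values.dimension();
      }
    }
    throw new IllegalArgumentException("unknown encoding: " + values.getEncoding());
  }
}
```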
*/ private static class FloatVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final RandomAccessVectorValues.Floats values; + private final FloatVectorValues values; private final float[] query; private final VectorSimilarityFunction similarityFunction; public FloatVectorScorer( - RandomAccessVectorValues.Floats values, - float[] query, - VectorSimilarityFunction similarityFunction) { + FloatVectorValues values, float[] query, VectorSimilarityFunction similarityFunction) { super(values); this.values = values; this.query = query; @@ -174,14 +180,12 @@ public float score(int node) throws IOException { /** A {@link RandomVectorScorer} for byte vectors. */ private static class ByteVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final RandomAccessVectorValues.Bytes values; + private final ByteVectorValues values; private final byte[] query; private final VectorSimilarityFunction similarityFunction; public ByteVectorScorer( - RandomAccessVectorValues.Bytes values, - byte[] query, - VectorSimilarityFunction similarityFunction) { + ByteVectorValues values, byte[] query, VectorSimilarityFunction similarityFunction) { super(values); this.values = values; this.query = query; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsScorer.java index 17430c24f276..6ed170731de4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsScorer.java @@ -18,8 +18,8 @@ package org.apache.lucene.codecs.hnsw; import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -40,8 +40,7 @@ public interface FlatVectorsScorer { * @throws IOException if an I/O error occurs */ RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) - throws IOException; + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException; /** * Returns a {@link RandomVectorScorer} for the given set of vectors and target vector. @@ -53,9 +52,7 @@ RandomVectorScorerSupplier getRandomVectorScorerSupplier( * @throws IOException if an I/O error occurs when reading from the index. */ RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException; /** @@ -68,8 +65,6 @@ RandomVectorScorer getRandomVectorScorer( * @throws IOException if an I/O error occurs when reading from the index. 
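For orientation, roughly how the slimmed-down FlatVectorsScorer API is called: a scorer is built straight from the vector values and a query, then scored by ordinal. This is a sketch only; it assumes DefaultFlatVectorScorer's public INSTANCE singleton and a query whose dimension matches the values.

```java
import java.io.IOException;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.hnsw.RandomVectorScorer;

final class FlatScorerUsage {
  private FlatScorerUsage() {}

  /** Score the vector stored at ordinal 0 against the query (illustrative). */
  static float scoreFirstOrdinal(FloatVectorValues vectors, float[] query) throws IOException {
    RandomVectorScorer scorer =
        DefaultFlatVectorScorer.INSTANCE.getRandomVectorScorer(
            VectorSimilarityFunction.COSINE, vectors, query);
    return scorer.score(0);
  }
}
```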
*/ RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java index 3590acab2efb..ceb826aa3a11 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java @@ -18,13 +18,13 @@ package org.apache.lucene.codecs.hnsw; import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.VectorUtil; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizedVectorSimilarity; import org.apache.lucene.util.quantization.ScalarQuantizer; @@ -60,9 +60,9 @@ public ScalarQuantizedVectorScorer(FlatVectorsScorer flatVectorsScorer) { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException { - if (vectorValues instanceof RandomAccessQuantizedByteVectorValues quantizedByteVectorValues) { + if (vectorValues instanceof QuantizedByteVectorValues quantizedByteVectorValues) { return new ScalarQuantizedRandomVectorScorerSupplier( similarityFunction, quantizedByteVectorValues.getScalarQuantizer(), @@ -74,11 +74,9 @@ public RandomVectorScorerSupplier getRandomVectorScorerSupplier( @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException { - if (vectorValues instanceof RandomAccessQuantizedByteVectorValues quantizedByteVectorValues) { + if (vectorValues instanceof QuantizedByteVectorValues quantizedByteVectorValues) { ScalarQuantizer scalarQuantizer = quantizedByteVectorValues.getScalarQuantizer(); byte[] targetBytes = new byte[target.length]; float offsetCorrection = @@ -104,9 +102,7 @@ public float score(int node) throws IOException { @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException { return nonQuantizedDelegate.getRandomVectorScorer(similarityFunction, vectorValues, target); } @@ -124,14 +120,14 @@ public String toString() { public static class ScalarQuantizedRandomVectorScorerSupplier implements RandomVectorScorerSupplier { - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; private final ScalarQuantizedVectorSimilarity similarity; private final 
VectorSimilarityFunction vectorSimilarityFunction; public ScalarQuantizedRandomVectorScorerSupplier( VectorSimilarityFunction similarityFunction, ScalarQuantizer scalarQuantizer, - RandomAccessQuantizedByteVectorValues values) { + QuantizedByteVectorValues values) { this.similarity = ScalarQuantizedVectorSimilarity.fromVectorSimilarity( similarityFunction, @@ -144,7 +140,7 @@ public ScalarQuantizedRandomVectorScorerSupplier( private ScalarQuantizedRandomVectorScorerSupplier( ScalarQuantizedVectorSimilarity similarity, VectorSimilarityFunction vectorSimilarityFunction, - RandomAccessQuantizedByteVectorValues values) { + QuantizedByteVectorValues values) { this.similarity = similarity; this.values = values; this.vectorSimilarityFunction = vectorSimilarityFunction; @@ -152,7 +148,7 @@ private ScalarQuantizedRandomVectorScorerSupplier( @Override public RandomVectorScorer scorer(int ord) throws IOException { - final RandomAccessQuantizedByteVectorValues vectorsCopy = values.copy(); + final QuantizedByteVectorValues vectorsCopy = values.copy(); final byte[] queryVector = values.vectorValue(ord); final float queryOffset = values.getScoreCorrectionConstant(ord); return new RandomVectorScorer.AbstractRandomVectorScorer(vectorsCopy) { @@ -170,5 +166,12 @@ public RandomVectorScorerSupplier copy() throws IOException { return new ScalarQuantizedRandomVectorScorerSupplier( similarity, vectorSimilarityFunction, values.copy()); } + + @Override + public String toString() { + return "ScalarQuantizedRandomVectorScorerSupplier(vectorSimilarityFunction=" + + vectorSimilarityFunction + + ")"; + } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java new file mode 100644 index 000000000000..2fe9a1cce6fd --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java @@ -0,0 +1,525 @@ +// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import static org.apache.lucene.codecs.lucene101.ForUtil.*; + +import java.io.IOException; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a Java int to + * get SIMD-like speedups. If bitsPerValue <= 4 then we pack 4 ints per Java int else if + * bitsPerValue <= 11 we pack 2 ints per Java int else we use scalar operations. 
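The class javadoc above compresses a lot; the underlying idea is ordinary bit packing. A toy illustration of the "4 ints per Java int" case (just the concept, not the generated code):

```java
public class Pack8Demo {
  public static void main(String[] args) {
    int a = 3, b = 200, c = 17, d = 255;           // each value fits in 8 bits
    // collapse: four 8-bit values share one 32-bit int
    int packed = (a << 24) | (b << 16) | (c << 8) | d;
    // expand: shifts and masks pull them back apart
    int[] unpacked = {
      (packed >>> 24) & 0xFF, (packed >>> 16) & 0xFF, (packed >>> 8) & 0xFF, packed & 0xFF
    };
    System.out.println(java.util.Arrays.toString(unpacked)); // [3, 200, 17, 255]
  }
}
```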
+ */ +public final class ForDeltaUtil { + + private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2; + private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4; + private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; + private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; + + // IDENTITY_PLUS_ONE[i] == i+1 + private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE]; + + static { + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + IDENTITY_PLUS_ONE[i] = i + 1; + } + } + + private static void prefixSumOfOnes(int[] arr, int base) { + System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); + // This loop gets auto-vectorized + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + arr[i] += base; + } + } + + private static void prefixSum8(int[] arr, int base) { + // When the number of bits per value is 4 or less, we can sum up all values in a block without + // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4 + // values at once. + innerPrefixSum8(arr); + expand8(arr); + final int l0 = base; + final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1]; + final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1]; + final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1]; + + for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) { + arr[i] += l0; + arr[ONE_BLOCK_SIZE_FOURTH + i] += l1; + arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2; + arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3; + } + } + + private static void prefixSum16(int[] arr, int base) { + // When the number of bits per value is 11 or less, we can sum up all values in a block without + // risking overflowing an 16-bits integer. This allows computing the prefix sum by summing up 2 + // values at once. + innerPrefixSum16(arr); + expand16(arr); + final int l0 = base; + final int l1 = base + arr[HALF_BLOCK_SIZE - 1]; + for (int i = 0; i < HALF_BLOCK_SIZE; ++i) { + arr[i] += l0; + arr[HALF_BLOCK_SIZE + i] += l1; + } + } + + private static void prefixSum32(int[] arr, int base) { + arr[0] += base; + for (int i = 1; i < BLOCK_SIZE; ++i) { + arr[i] += arr[i - 1]; + } + } + + // For some reason unrolling seems to help + private static void innerPrefixSum8(int[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + } + + // For some reason unrolling seems to help + private static void innerPrefixSum16(int[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += 
arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + arr[32] += arr[31]; + arr[33] += arr[32]; + arr[34] += arr[33]; + arr[35] += arr[34]; + arr[36] += arr[35]; + arr[37] += arr[36]; + arr[38] += arr[37]; + arr[39] += arr[38]; + arr[40] += arr[39]; + arr[41] += arr[40]; + arr[42] += arr[41]; + arr[43] += arr[42]; + arr[44] += arr[43]; + arr[45] += arr[44]; + arr[46] += arr[45]; + arr[47] += arr[46]; + arr[48] += arr[47]; + arr[49] += arr[48]; + arr[50] += arr[49]; + arr[51] += arr[50]; + arr[52] += arr[51]; + arr[53] += arr[52]; + arr[54] += arr[53]; + arr[55] += arr[54]; + arr[56] += arr[55]; + arr[57] += arr[56]; + arr[58] += arr[57]; + arr[59] += arr[58]; + arr[60] += arr[59]; + arr[61] += arr[60]; + arr[62] += arr[61]; + arr[63] += arr[62]; + } + + private final int[] tmp = new int[BLOCK_SIZE]; + + /** + * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code + * ints} are expected to be deltas between consecutive values. + */ + void encodeDeltas(int[] ints, DataOutput out) throws IOException { + if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings + out.writeByte((byte) 0); + } else { + int or = 0; + for (int l : ints) { + or |= l; + } + assert or != 0; + final int bitsPerValue = PackedInts.bitsRequired(or); + out.writeByte((byte) bitsPerValue); + + final int primitiveSize; + if (bitsPerValue <= 3) { + primitiveSize = 8; + collapse8(ints); + } else if (bitsPerValue <= 10) { + primitiveSize = 16; + collapse16(ints); + } else { + primitiveSize = 32; + } + encode(ints, bitsPerValue, primitiveSize, out, tmp); + } + } + + /** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */ + void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException { + final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte()); + if (bitsPerValue == 0) { + prefixSumOfOnes(ints, base); + } else { + decodeAndPrefixSum(bitsPerValue, pdu, base, ints); + } + } + + /** Delta-decode 128 integers into {@code ints}. 
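encodeDeltas and decodeAndPrefixSum above are the block-of-128 version of a plain delta plus prefix-sum round trip. A toy sketch of that round trip on a handful of doc IDs (illustrative only; the real code works on fixed 128-int blocks of packed deltas):

```java
public class DeltaPrefixSumDemo {
  public static void main(String[] args) {
    int base = 1000;
    int[] docIds = {1003, 1007, 1008, 1020};   // strictly increasing
    int[] deltas = new int[docIds.length];
    int prev = base;
    for (int i = 0; i < docIds.length; i++) {  // encode: gaps from the previous doc
      deltas[i] = docIds[i] - prev;
      prev = docIds[i];
    }
    int[] decoded = new int[deltas.length];
    int acc = base;
    for (int i = 0; i < deltas.length; i++) {  // decode: prefix sum on top of base
      acc += deltas[i];
      decoded[i] = acc;
    }
    System.out.println(java.util.Arrays.toString(decoded)); // [1003, 1007, 1008, 1020]
  }
}
```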
*/ + void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints) + throws IOException { + switch (bitsPerValue) { + case 1: + decode1(pdu, ints); + prefixSum8(ints, base); + break; + case 2: + decode2(pdu, ints); + prefixSum8(ints, base); + break; + case 3: + decode3(pdu, tmp, ints); + prefixSum8(ints, base); + break; + case 4: + decode4To16(pdu, ints); + prefixSum16(ints, base); + break; + case 5: + decode5To16(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 6: + decode6To16(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 7: + decode7To16(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 8: + decode8To16(pdu, ints); + prefixSum16(ints, base); + break; + case 9: + decode9(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 10: + decode10(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 11: + decode11To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 12: + decode12To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 13: + decode13To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 14: + decode14To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 15: + decode15To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 16: + decode16To32(pdu, ints); + prefixSum32(ints, base); + break; + case 17: + decode17(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 18: + decode18(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 19: + decode19(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 20: + decode20(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 21: + decode21(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 22: + decode22(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 23: + decode23(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 24: + decode24(pdu, tmp, ints); + prefixSum32(ints, base); + break; + default: + decodeSlow(bitsPerValue, pdu, tmp, ints); + prefixSum32(ints, base); + break; + } + } + + private static void decode4To16(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(16, ints, 12, 4, MASK16_4, ints, 48, MASK16_4); + } + + private static void decode5To16(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(20, ints, 11, 5, MASK16_5, tmp, 0, MASK16_1); + for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 5, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 4; + l0 |= tmp[tmpIdx + 1] << 3; + l0 |= tmp[tmpIdx + 2] << 2; + l0 |= tmp[tmpIdx + 3] << 1; + l0 |= tmp[tmpIdx + 4] << 0; + ints[intsIdx + 0] = l0; + } + } + + private static void decode6To16(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(24, ints, 10, 6, MASK16_6, tmp, 0, MASK16_4); + for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 8; ++iter, tmpIdx += 3, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 2; + l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 4; + l1 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode7To16(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(28, ints, 9, 7, MASK16_7, tmp, 0, MASK16_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 4; ++iter, tmpIdx += 7, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 5; + l0 |= tmp[tmpIdx + 1] << 3; + l0 |= tmp[tmpIdx + 2] << 1; + l0 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_1; + ints[intsIdx + 0] = l0; + 
int l1 = (tmp[tmpIdx + 3] & MASK16_1) << 6; + l1 |= tmp[tmpIdx + 4] << 4; + l1 |= tmp[tmpIdx + 5] << 2; + l1 |= tmp[tmpIdx + 6] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode8To16(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(32, ints, 8, 8, MASK16_8, ints, 32, MASK16_8); + } + + private static void decode11To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(44, ints, 21, 11, MASK32_11, tmp, 0, MASK32_10); + for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 4; ++iter, tmpIdx += 11, intsIdx += 10) { + int l0 = tmp[tmpIdx + 0] << 1; + l0 |= (tmp[tmpIdx + 1] >>> 9) & MASK32_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_9) << 2; + l1 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 2] & MASK32_8) << 3; + l2 |= (tmp[tmpIdx + 3] >>> 7) & MASK32_3; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 3] & MASK32_7) << 4; + l3 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 4] & MASK32_6) << 5; + l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_5; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 6; + l5 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 6] & MASK32_4) << 7; + l6 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_7; + ints[intsIdx + 6] = l6; + int l7 = (tmp[tmpIdx + 7] & MASK32_3) << 8; + l7 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8; + ints[intsIdx + 7] = l7; + int l8 = (tmp[tmpIdx + 8] & MASK32_2) << 9; + l8 |= (tmp[tmpIdx + 9] >>> 1) & MASK32_9; + ints[intsIdx + 8] = l8; + int l9 = (tmp[tmpIdx + 9] & MASK32_1) << 10; + l9 |= tmp[tmpIdx + 10] << 0; + ints[intsIdx + 9] = l9; + } + } + + private static void decode12To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(48, ints, 20, 12, MASK32_12, tmp, 0, MASK32_8); + for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 16; ++iter, tmpIdx += 3, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 4; + l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 8; + l1 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode13To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(52, ints, 19, 13, MASK32_13, tmp, 0, MASK32_6); + for (int iter = 0, tmpIdx = 0, intsIdx = 104; iter < 4; ++iter, tmpIdx += 13, intsIdx += 6) { + int l0 = tmp[tmpIdx + 0] << 7; + l0 |= tmp[tmpIdx + 1] << 1; + l0 |= (tmp[tmpIdx + 2] >>> 5) & MASK32_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 2] & MASK32_5) << 8; + l1 |= tmp[tmpIdx + 3] << 2; + l1 |= (tmp[tmpIdx + 4] >>> 4) & MASK32_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 4] & MASK32_4) << 9; + l2 |= tmp[tmpIdx + 5] << 3; + l2 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_3; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 6] & MASK32_3) << 10; + l3 |= tmp[tmpIdx + 7] << 4; + l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_4; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 11; + l4 |= tmp[tmpIdx + 9] << 5; + l4 |= (tmp[tmpIdx + 10] >>> 1) & MASK32_5; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 10] & MASK32_1) << 12; + l5 |= tmp[tmpIdx + 11] << 6; + l5 |= tmp[tmpIdx + 12] << 0; + ints[intsIdx + 5] = l5; + } + } + + private static void decode14To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(56, ints, 18, 14, MASK32_14, tmp, 0, MASK32_4); + for (int iter = 0, tmpIdx = 
0, intsIdx = 112; iter < 8; ++iter, tmpIdx += 7, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 10; + l0 |= tmp[tmpIdx + 1] << 6; + l0 |= tmp[tmpIdx + 2] << 2; + l0 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 3] & MASK32_2) << 12; + l1 |= tmp[tmpIdx + 4] << 8; + l1 |= tmp[tmpIdx + 5] << 4; + l1 |= tmp[tmpIdx + 6] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode15To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(60, ints, 17, 15, MASK32_15, tmp, 0, MASK32_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 120; iter < 4; ++iter, tmpIdx += 15, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 13; + l0 |= tmp[tmpIdx + 1] << 11; + l0 |= tmp[tmpIdx + 2] << 9; + l0 |= tmp[tmpIdx + 3] << 7; + l0 |= tmp[tmpIdx + 4] << 5; + l0 |= tmp[tmpIdx + 5] << 3; + l0 |= tmp[tmpIdx + 6] << 1; + l0 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 7] & MASK32_1) << 14; + l1 |= tmp[tmpIdx + 8] << 12; + l1 |= tmp[tmpIdx + 9] << 10; + l1 |= tmp[tmpIdx + 10] << 8; + l1 |= tmp[tmpIdx + 11] << 6; + l1 |= tmp[tmpIdx + 12] << 4; + l1 |= tmp[tmpIdx + 13] << 2; + l1 |= tmp[tmpIdx + 14] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode16To32(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(64, ints, 16, 16, MASK32_16, ints, 64, MASK32_16); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java new file mode 100644 index 000000000000..1fe54b56fd50 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java @@ -0,0 +1,841 @@ +// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.store.DataOutput; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in one to get + * SIMD-like speedups. If bitsPerValue <= 8 then we pack 4 ints per Java int else if bitsPerValue + * <= 16 we pack 2 ints per Java int else we do scalar operations. 
+ */ +public final class ForUtil { + + public static final int BLOCK_SIZE = 128; + static final int BLOCK_SIZE_LOG2 = 7; + + static int expandMask16(int mask16) { + return mask16 | (mask16 << 16); + } + + static int expandMask8(int mask8) { + return expandMask16(mask8 | (mask8 << 8)); + } + + static int mask32(int bitsPerValue) { + return (1 << bitsPerValue) - 1; + } + + static int mask16(int bitsPerValue) { + return expandMask16((1 << bitsPerValue) - 1); + } + + static int mask8(int bitsPerValue) { + return expandMask8((1 << bitsPerValue) - 1); + } + + static void expand8(int[] arr) { + for (int i = 0; i < 32; ++i) { + int l = arr[i]; + arr[i] = (l >>> 24) & 0xFF; + arr[32 + i] = (l >>> 16) & 0xFF; + arr[64 + i] = (l >>> 8) & 0xFF; + arr[96 + i] = l & 0xFF; + } + } + + static void collapse8(int[] arr) { + for (int i = 0; i < 32; ++i) { + arr[i] = (arr[i] << 24) | (arr[32 + i] << 16) | (arr[64 + i] << 8) | arr[96 + i]; + } + } + + static void expand16(int[] arr) { + for (int i = 0; i < 64; ++i) { + int l = arr[i]; + arr[i] = (l >>> 16) & 0xFFFF; + arr[64 + i] = l & 0xFFFF; + } + } + + static void collapse16(int[] arr) { + for (int i = 0; i < 64; ++i) { + arr[i] = (arr[i] << 16) | arr[64 + i]; + } + } + + private final int[] tmp = new int[BLOCK_SIZE]; + + /** Encode 128 integers from {@code ints} into {@code out}. */ + void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException { + final int nextPrimitive; + if (bitsPerValue <= 8) { + nextPrimitive = 8; + collapse8(ints); + } else if (bitsPerValue <= 16) { + nextPrimitive = 16; + collapse16(ints); + } else { + nextPrimitive = 32; + } + encode(ints, bitsPerValue, nextPrimitive, out, tmp); + } + + static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp) + throws IOException { + final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE; + + final int numIntsPerShift = bitsPerValue * 4; + int idx = 0; + int shift = primitiveSize - bitsPerValue; + for (int i = 0; i < numIntsPerShift; ++i) { + tmp[i] = ints[idx++] << shift; + } + for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) { + for (int i = 0; i < numIntsPerShift; ++i) { + tmp[i] |= ints[idx++] << shift; + } + } + + final int remainingBitsPerInt = shift + bitsPerValue; + final int maskRemainingBitsPerInt; + if (primitiveSize == 8) { + maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt]; + } else if (primitiveSize == 16) { + maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt]; + } else { + maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt]; + } + + int tmpIdx = 0; + int remainingBitsPerValue = bitsPerValue; + while (idx < numInts) { + if (remainingBitsPerValue >= remainingBitsPerInt) { + remainingBitsPerValue -= remainingBitsPerInt; + tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt; + if (remainingBitsPerValue == 0) { + idx++; + remainingBitsPerValue = bitsPerValue; + } + } else { + final int mask1, mask2; + if (primitiveSize == 8) { + mask1 = MASKS8[remainingBitsPerValue]; + mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue]; + } else if (primitiveSize == 16) { + mask1 = MASKS16[remainingBitsPerValue]; + mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue]; + } else { + mask1 = MASKS32[remainingBitsPerValue]; + mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue]; + } + tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue); + remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue; + 
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2; + } + } + + for (int i = 0; i < numIntsPerShift; ++i) { + out.writeInt(tmp[i]); + } + } + + /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */ + static int numBytes(int bitsPerValue) { + return bitsPerValue << (BLOCK_SIZE_LOG2 - 3); + } + + static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + final int numInts = bitsPerValue << 2; + final int mask = MASKS32[bitsPerValue]; + pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1); + final int remainingBitsPerInt = 32 - bitsPerValue; + final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt]; + int tmpIdx = 0; + int remainingBits = remainingBitsPerInt; + for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) { + int b = bitsPerValue - remainingBits; + int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b; + while (b >= remainingBitsPerInt) { + b -= remainingBitsPerInt; + l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b; + } + if (b > 0) { + l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b]; + remainingBits = remainingBitsPerInt - b; + } else { + remainingBits = remainingBitsPerInt; + } + ints[intsIdx] = l; + } + } + + static final int[] MASKS8 = new int[8]; + static final int[] MASKS16 = new int[16]; + static final int[] MASKS32 = new int[32]; + + static { + for (int i = 0; i < 8; ++i) { + MASKS8[i] = mask8(i); + } + for (int i = 0; i < 16; ++i) { + MASKS16[i] = mask16(i); + } + for (int i = 0; i < 32; ++i) { + MASKS32[i] = mask32(i); + } + } + + // mark values in array as final ints to avoid the cost of reading array, arrays should only be + // used when the idx is a variable + static final int MASK8_1 = MASKS8[1]; + static final int MASK8_2 = MASKS8[2]; + static final int MASK8_3 = MASKS8[3]; + static final int MASK8_4 = MASKS8[4]; + static final int MASK8_5 = MASKS8[5]; + static final int MASK8_6 = MASKS8[6]; + static final int MASK8_7 = MASKS8[7]; + static final int MASK16_1 = MASKS16[1]; + static final int MASK16_2 = MASKS16[2]; + static final int MASK16_3 = MASKS16[3]; + static final int MASK16_4 = MASKS16[4]; + static final int MASK16_5 = MASKS16[5]; + static final int MASK16_6 = MASKS16[6]; + static final int MASK16_7 = MASKS16[7]; + static final int MASK16_8 = MASKS16[8]; + static final int MASK16_9 = MASKS16[9]; + static final int MASK16_10 = MASKS16[10]; + static final int MASK16_11 = MASKS16[11]; + static final int MASK16_12 = MASKS16[12]; + static final int MASK16_13 = MASKS16[13]; + static final int MASK16_14 = MASKS16[14]; + static final int MASK16_15 = MASKS16[15]; + static final int MASK32_1 = MASKS32[1]; + static final int MASK32_2 = MASKS32[2]; + static final int MASK32_3 = MASKS32[3]; + static final int MASK32_4 = MASKS32[4]; + static final int MASK32_5 = MASKS32[5]; + static final int MASK32_6 = MASKS32[6]; + static final int MASK32_7 = MASKS32[7]; + static final int MASK32_8 = MASKS32[8]; + static final int MASK32_9 = MASKS32[9]; + static final int MASK32_10 = MASKS32[10]; + static final int MASK32_11 = MASKS32[11]; + static final int MASK32_12 = MASKS32[12]; + static final int MASK32_13 = MASKS32[13]; + static final int MASK32_14 = MASKS32[14]; + static final int MASK32_15 = MASKS32[15]; + static final int MASK32_16 = MASKS32[16]; + static final int MASK32_17 = MASKS32[17]; + static final int MASK32_18 = MASKS32[18]; + static final int MASK32_19 = MASKS32[19]; + static final int MASK32_20 = MASKS32[20]; + 
static final int MASK32_21 = MASKS32[21]; + static final int MASK32_22 = MASKS32[22]; + static final int MASK32_23 = MASKS32[23]; + static final int MASK32_24 = MASKS32[24]; + + /** Decode 128 integers into {@code ints}. */ + void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException { + switch (bitsPerValue) { + case 1: + decode1(pdu, ints); + expand8(ints); + break; + case 2: + decode2(pdu, ints); + expand8(ints); + break; + case 3: + decode3(pdu, tmp, ints); + expand8(ints); + break; + case 4: + decode4(pdu, ints); + expand8(ints); + break; + case 5: + decode5(pdu, tmp, ints); + expand8(ints); + break; + case 6: + decode6(pdu, tmp, ints); + expand8(ints); + break; + case 7: + decode7(pdu, tmp, ints); + expand8(ints); + break; + case 8: + decode8(pdu, ints); + expand8(ints); + break; + case 9: + decode9(pdu, tmp, ints); + expand16(ints); + break; + case 10: + decode10(pdu, tmp, ints); + expand16(ints); + break; + case 11: + decode11(pdu, tmp, ints); + expand16(ints); + break; + case 12: + decode12(pdu, tmp, ints); + expand16(ints); + break; + case 13: + decode13(pdu, tmp, ints); + expand16(ints); + break; + case 14: + decode14(pdu, tmp, ints); + expand16(ints); + break; + case 15: + decode15(pdu, tmp, ints); + expand16(ints); + break; + case 16: + decode16(pdu, ints); + expand16(ints); + break; + case 17: + decode17(pdu, tmp, ints); + break; + case 18: + decode18(pdu, tmp, ints); + break; + case 19: + decode19(pdu, tmp, ints); + break; + case 20: + decode20(pdu, tmp, ints); + break; + case 21: + decode21(pdu, tmp, ints); + break; + case 22: + decode22(pdu, tmp, ints); + break; + case 23: + decode23(pdu, tmp, ints); + break; + case 24: + decode24(pdu, tmp, ints); + break; + default: + decodeSlow(bitsPerValue, pdu, tmp, ints); + break; + } + } + + static void decode1(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(4, ints, 7, 1, MASK8_1, ints, 28, MASK8_1); + } + + static void decode2(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(8, ints, 6, 2, MASK8_2, ints, 24, MASK8_2); + } + + static void decode3(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(12, ints, 5, 3, MASK8_3, tmp, 0, MASK8_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 4; ++iter, tmpIdx += 3, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 1; + l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 2; + l1 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 1] = l1; + } + } + + static void decode4(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(16, ints, 4, 4, MASK8_4, ints, 16, MASK8_4); + } + + static void decode5(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(20, ints, 3, 5, MASK8_5, tmp, 0, MASK8_3); + for (int iter = 0, tmpIdx = 0, intsIdx = 20; iter < 4; ++iter, tmpIdx += 5, intsIdx += 3) { + int l0 = tmp[tmpIdx + 0] << 2; + l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 4; + l1 |= tmp[tmpIdx + 2] << 1; + l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK8_1; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 3] & MASK8_2) << 3; + l2 |= tmp[tmpIdx + 4] << 0; + ints[intsIdx + 2] = l2; + } + } + + static void decode6(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(24, ints, 2, 6, MASK8_6, tmp, 0, MASK8_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 8; ++iter, tmpIdx += 3, intsIdx += 1) { + int l0 = 
tmp[tmpIdx + 0] << 4; + l0 |= tmp[tmpIdx + 1] << 2; + l0 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode7(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(28, ints, 1, 7, MASK8_7, tmp, 0, MASK8_1); + for (int iter = 0, tmpIdx = 0, intsIdx = 28; iter < 4; ++iter, tmpIdx += 7, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 6; + l0 |= tmp[tmpIdx + 1] << 5; + l0 |= tmp[tmpIdx + 2] << 4; + l0 |= tmp[tmpIdx + 3] << 3; + l0 |= tmp[tmpIdx + 4] << 2; + l0 |= tmp[tmpIdx + 5] << 1; + l0 |= tmp[tmpIdx + 6] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode8(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.in.readInts(ints, 0, 32); + } + + static void decode9(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(36, ints, 7, 9, MASK16_9, tmp, 0, MASK16_7); + for (int iter = 0, tmpIdx = 0, intsIdx = 36; iter < 4; ++iter, tmpIdx += 9, intsIdx += 7) { + int l0 = tmp[tmpIdx + 0] << 2; + l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK16_5) << 4; + l1 |= (tmp[tmpIdx + 2] >>> 3) & MASK16_4; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 2] & MASK16_3) << 6; + l2 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_6; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 3] & MASK16_1) << 8; + l3 |= tmp[tmpIdx + 4] << 1; + l3 |= (tmp[tmpIdx + 5] >>> 6) & MASK16_1; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 5] & MASK16_6) << 3; + l4 |= (tmp[tmpIdx + 6] >>> 4) & MASK16_3; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 6] & MASK16_4) << 5; + l5 |= (tmp[tmpIdx + 7] >>> 2) & MASK16_5; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 7] & MASK16_2) << 7; + l6 |= tmp[tmpIdx + 8] << 0; + ints[intsIdx + 6] = l6; + } + } + + static void decode10(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(40, ints, 6, 10, MASK16_10, tmp, 0, MASK16_6); + for (int iter = 0, tmpIdx = 0, intsIdx = 40; iter < 8; ++iter, tmpIdx += 5, intsIdx += 3) { + int l0 = tmp[tmpIdx + 0] << 4; + l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 8; + l1 |= tmp[tmpIdx + 2] << 2; + l1 |= (tmp[tmpIdx + 3] >>> 4) & MASK16_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 3] & MASK16_4) << 6; + l2 |= tmp[tmpIdx + 4] << 0; + ints[intsIdx + 2] = l2; + } + } + + static void decode11(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(44, ints, 5, 11, MASK16_11, tmp, 0, MASK16_5); + for (int iter = 0, tmpIdx = 0, intsIdx = 44; iter < 4; ++iter, tmpIdx += 11, intsIdx += 5) { + int l0 = tmp[tmpIdx + 0] << 6; + l0 |= tmp[tmpIdx + 1] << 1; + l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK16_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 2] & MASK16_4) << 7; + l1 |= tmp[tmpIdx + 3] << 2; + l1 |= (tmp[tmpIdx + 4] >>> 3) & MASK16_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 4] & MASK16_3) << 8; + l2 |= tmp[tmpIdx + 5] << 3; + l2 |= (tmp[tmpIdx + 6] >>> 2) & MASK16_3; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 6] & MASK16_2) << 9; + l3 |= tmp[tmpIdx + 7] << 4; + l3 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_4; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 8] & MASK16_1) << 10; + l4 |= tmp[tmpIdx + 9] << 5; + l4 |= tmp[tmpIdx + 10] << 0; + ints[intsIdx + 4] = l4; + } + } + + static void decode12(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(48, ints, 4, 12, MASK16_12, tmp, 0, MASK16_4); + for (int iter = 0, 
tmpIdx = 0, intsIdx = 48; iter < 16; ++iter, tmpIdx += 3, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 8; + l0 |= tmp[tmpIdx + 1] << 4; + l0 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode13(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(52, ints, 3, 13, MASK16_13, tmp, 0, MASK16_3); + for (int iter = 0, tmpIdx = 0, intsIdx = 52; iter < 4; ++iter, tmpIdx += 13, intsIdx += 3) { + int l0 = tmp[tmpIdx + 0] << 10; + l0 |= tmp[tmpIdx + 1] << 7; + l0 |= tmp[tmpIdx + 2] << 4; + l0 |= tmp[tmpIdx + 3] << 1; + l0 |= (tmp[tmpIdx + 4] >>> 2) & MASK16_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 4] & MASK16_2) << 11; + l1 |= tmp[tmpIdx + 5] << 8; + l1 |= tmp[tmpIdx + 6] << 5; + l1 |= tmp[tmpIdx + 7] << 2; + l1 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 8] & MASK16_1) << 12; + l2 |= tmp[tmpIdx + 9] << 9; + l2 |= tmp[tmpIdx + 10] << 6; + l2 |= tmp[tmpIdx + 11] << 3; + l2 |= tmp[tmpIdx + 12] << 0; + ints[intsIdx + 2] = l2; + } + } + + static void decode14(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(56, ints, 2, 14, MASK16_14, tmp, 0, MASK16_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 8; ++iter, tmpIdx += 7, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 12; + l0 |= tmp[tmpIdx + 1] << 10; + l0 |= tmp[tmpIdx + 2] << 8; + l0 |= tmp[tmpIdx + 3] << 6; + l0 |= tmp[tmpIdx + 4] << 4; + l0 |= tmp[tmpIdx + 5] << 2; + l0 |= tmp[tmpIdx + 6] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode15(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(60, ints, 1, 15, MASK16_15, tmp, 0, MASK16_1); + for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 15, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 14; + l0 |= tmp[tmpIdx + 1] << 13; + l0 |= tmp[tmpIdx + 2] << 12; + l0 |= tmp[tmpIdx + 3] << 11; + l0 |= tmp[tmpIdx + 4] << 10; + l0 |= tmp[tmpIdx + 5] << 9; + l0 |= tmp[tmpIdx + 6] << 8; + l0 |= tmp[tmpIdx + 7] << 7; + l0 |= tmp[tmpIdx + 8] << 6; + l0 |= tmp[tmpIdx + 9] << 5; + l0 |= tmp[tmpIdx + 10] << 4; + l0 |= tmp[tmpIdx + 11] << 3; + l0 |= tmp[tmpIdx + 12] << 2; + l0 |= tmp[tmpIdx + 13] << 1; + l0 |= tmp[tmpIdx + 14] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode16(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.in.readInts(ints, 0, 64); + } + + static void decode17(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(68, ints, 15, 17, MASK32_17, tmp, 0, MASK32_15); + for (int iter = 0, tmpIdx = 0, intsIdx = 68; iter < 4; ++iter, tmpIdx += 17, intsIdx += 15) { + int l0 = tmp[tmpIdx + 0] << 2; + l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_13) << 4; + l1 |= (tmp[tmpIdx + 2] >>> 11) & MASK32_4; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 2] & MASK32_11) << 6; + l2 |= (tmp[tmpIdx + 3] >>> 9) & MASK32_6; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 3] & MASK32_9) << 8; + l3 |= (tmp[tmpIdx + 4] >>> 7) & MASK32_8; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 4] & MASK32_7) << 10; + l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_10; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 12; + l5 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_12; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 6] & MASK32_3) << 14; + l6 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_14; + ints[intsIdx + 6] = l6; + int l7 = (tmp[tmpIdx + 7] & MASK32_1) << 16; + l7 |= 
tmp[tmpIdx + 8] << 1; + l7 |= (tmp[tmpIdx + 9] >>> 14) & MASK32_1; + ints[intsIdx + 7] = l7; + int l8 = (tmp[tmpIdx + 9] & MASK32_14) << 3; + l8 |= (tmp[tmpIdx + 10] >>> 12) & MASK32_3; + ints[intsIdx + 8] = l8; + int l9 = (tmp[tmpIdx + 10] & MASK32_12) << 5; + l9 |= (tmp[tmpIdx + 11] >>> 10) & MASK32_5; + ints[intsIdx + 9] = l9; + int l10 = (tmp[tmpIdx + 11] & MASK32_10) << 7; + l10 |= (tmp[tmpIdx + 12] >>> 8) & MASK32_7; + ints[intsIdx + 10] = l10; + int l11 = (tmp[tmpIdx + 12] & MASK32_8) << 9; + l11 |= (tmp[tmpIdx + 13] >>> 6) & MASK32_9; + ints[intsIdx + 11] = l11; + int l12 = (tmp[tmpIdx + 13] & MASK32_6) << 11; + l12 |= (tmp[tmpIdx + 14] >>> 4) & MASK32_11; + ints[intsIdx + 12] = l12; + int l13 = (tmp[tmpIdx + 14] & MASK32_4) << 13; + l13 |= (tmp[tmpIdx + 15] >>> 2) & MASK32_13; + ints[intsIdx + 13] = l13; + int l14 = (tmp[tmpIdx + 15] & MASK32_2) << 15; + l14 |= tmp[tmpIdx + 16] << 0; + ints[intsIdx + 14] = l14; + } + } + + static void decode18(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(72, ints, 14, 18, MASK32_18, tmp, 0, MASK32_14); + for (int iter = 0, tmpIdx = 0, intsIdx = 72; iter < 8; ++iter, tmpIdx += 9, intsIdx += 7) { + int l0 = tmp[tmpIdx + 0] << 4; + l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_10) << 8; + l1 |= (tmp[tmpIdx + 2] >>> 6) & MASK32_8; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 2] & MASK32_6) << 12; + l2 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_12; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 3] & MASK32_2) << 16; + l3 |= tmp[tmpIdx + 4] << 2; + l3 |= (tmp[tmpIdx + 5] >>> 12) & MASK32_2; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 5] & MASK32_12) << 6; + l4 |= (tmp[tmpIdx + 6] >>> 8) & MASK32_6; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 6] & MASK32_8) << 10; + l5 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_10; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 7] & MASK32_4) << 14; + l6 |= tmp[tmpIdx + 8] << 0; + ints[intsIdx + 6] = l6; + } + } + + static void decode19(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(76, ints, 13, 19, MASK32_19, tmp, 0, MASK32_13); + for (int iter = 0, tmpIdx = 0, intsIdx = 76; iter < 4; ++iter, tmpIdx += 19, intsIdx += 13) { + int l0 = tmp[tmpIdx + 0] << 6; + l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_7) << 12; + l1 |= (tmp[tmpIdx + 2] >>> 1) & MASK32_12; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 2] & MASK32_1) << 18; + l2 |= tmp[tmpIdx + 3] << 5; + l2 |= (tmp[tmpIdx + 4] >>> 8) & MASK32_5; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 4] & MASK32_8) << 11; + l3 |= (tmp[tmpIdx + 5] >>> 2) & MASK32_11; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 5] & MASK32_2) << 17; + l4 |= tmp[tmpIdx + 6] << 4; + l4 |= (tmp[tmpIdx + 7] >>> 9) & MASK32_4; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 7] & MASK32_9) << 10; + l5 |= (tmp[tmpIdx + 8] >>> 3) & MASK32_10; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 8] & MASK32_3) << 16; + l6 |= tmp[tmpIdx + 9] << 3; + l6 |= (tmp[tmpIdx + 10] >>> 10) & MASK32_3; + ints[intsIdx + 6] = l6; + int l7 = (tmp[tmpIdx + 10] & MASK32_10) << 9; + l7 |= (tmp[tmpIdx + 11] >>> 4) & MASK32_9; + ints[intsIdx + 7] = l7; + int l8 = (tmp[tmpIdx + 11] & MASK32_4) << 15; + l8 |= tmp[tmpIdx + 12] << 2; + l8 |= (tmp[tmpIdx + 13] >>> 11) & MASK32_2; + ints[intsIdx + 8] = l8; + int l9 = (tmp[tmpIdx + 13] & MASK32_11) << 8; + l9 |= (tmp[tmpIdx + 14] >>> 5) & MASK32_8; + 
ints[intsIdx + 9] = l9; + int l10 = (tmp[tmpIdx + 14] & MASK32_5) << 14; + l10 |= tmp[tmpIdx + 15] << 1; + l10 |= (tmp[tmpIdx + 16] >>> 12) & MASK32_1; + ints[intsIdx + 10] = l10; + int l11 = (tmp[tmpIdx + 16] & MASK32_12) << 7; + l11 |= (tmp[tmpIdx + 17] >>> 6) & MASK32_7; + ints[intsIdx + 11] = l11; + int l12 = (tmp[tmpIdx + 17] & MASK32_6) << 13; + l12 |= tmp[tmpIdx + 18] << 0; + ints[intsIdx + 12] = l12; + } + } + + static void decode20(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(80, ints, 12, 20, MASK32_20, tmp, 0, MASK32_12); + for (int iter = 0, tmpIdx = 0, intsIdx = 80; iter < 16; ++iter, tmpIdx += 5, intsIdx += 3) { + int l0 = tmp[tmpIdx + 0] << 8; + l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 16; + l1 |= tmp[tmpIdx + 2] << 4; + l1 |= (tmp[tmpIdx + 3] >>> 8) & MASK32_4; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 3] & MASK32_8) << 12; + l2 |= tmp[tmpIdx + 4] << 0; + ints[intsIdx + 2] = l2; + } + } + + static void decode21(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(84, ints, 11, 21, MASK32_21, tmp, 0, MASK32_11); + for (int iter = 0, tmpIdx = 0, intsIdx = 84; iter < 4; ++iter, tmpIdx += 21, intsIdx += 11) { + int l0 = tmp[tmpIdx + 0] << 10; + l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_1) << 20; + l1 |= tmp[tmpIdx + 2] << 9; + l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_9; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 3] & MASK32_2) << 19; + l2 |= tmp[tmpIdx + 4] << 8; + l2 |= (tmp[tmpIdx + 5] >>> 3) & MASK32_8; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 5] & MASK32_3) << 18; + l3 |= tmp[tmpIdx + 6] << 7; + l3 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_7; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 7] & MASK32_4) << 17; + l4 |= tmp[tmpIdx + 8] << 6; + l4 |= (tmp[tmpIdx + 9] >>> 5) & MASK32_6; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 9] & MASK32_5) << 16; + l5 |= tmp[tmpIdx + 10] << 5; + l5 |= (tmp[tmpIdx + 11] >>> 6) & MASK32_5; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 11] & MASK32_6) << 15; + l6 |= tmp[tmpIdx + 12] << 4; + l6 |= (tmp[tmpIdx + 13] >>> 7) & MASK32_4; + ints[intsIdx + 6] = l6; + int l7 = (tmp[tmpIdx + 13] & MASK32_7) << 14; + l7 |= tmp[tmpIdx + 14] << 3; + l7 |= (tmp[tmpIdx + 15] >>> 8) & MASK32_3; + ints[intsIdx + 7] = l7; + int l8 = (tmp[tmpIdx + 15] & MASK32_8) << 13; + l8 |= tmp[tmpIdx + 16] << 2; + l8 |= (tmp[tmpIdx + 17] >>> 9) & MASK32_2; + ints[intsIdx + 8] = l8; + int l9 = (tmp[tmpIdx + 17] & MASK32_9) << 12; + l9 |= tmp[tmpIdx + 18] << 1; + l9 |= (tmp[tmpIdx + 19] >>> 10) & MASK32_1; + ints[intsIdx + 9] = l9; + int l10 = (tmp[tmpIdx + 19] & MASK32_10) << 11; + l10 |= tmp[tmpIdx + 20] << 0; + ints[intsIdx + 10] = l10; + } + } + + static void decode22(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(88, ints, 10, 22, MASK32_22, tmp, 0, MASK32_10); + for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 8; ++iter, tmpIdx += 11, intsIdx += 5) { + int l0 = tmp[tmpIdx + 0] << 12; + l0 |= tmp[tmpIdx + 1] << 2; + l0 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 2] & MASK32_8) << 14; + l1 |= tmp[tmpIdx + 3] << 4; + l1 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 4] & MASK32_6) << 16; + l2 |= tmp[tmpIdx + 5] << 6; + l2 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6; + ints[intsIdx + 2] = l2; + int l3 = 
(tmp[tmpIdx + 6] & MASK32_4) << 18; + l3 |= tmp[tmpIdx + 7] << 8; + l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 20; + l4 |= tmp[tmpIdx + 9] << 10; + l4 |= tmp[tmpIdx + 10] << 0; + ints[intsIdx + 4] = l4; + } + } + + static void decode23(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(92, ints, 9, 23, MASK32_23, tmp, 0, MASK32_9); + for (int iter = 0, tmpIdx = 0, intsIdx = 92; iter < 4; ++iter, tmpIdx += 23, intsIdx += 9) { + int l0 = tmp[tmpIdx + 0] << 14; + l0 |= tmp[tmpIdx + 1] << 5; + l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK32_5; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 2] & MASK32_4) << 19; + l1 |= tmp[tmpIdx + 3] << 10; + l1 |= tmp[tmpIdx + 4] << 1; + l1 |= (tmp[tmpIdx + 5] >>> 8) & MASK32_1; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 5] & MASK32_8) << 15; + l2 |= tmp[tmpIdx + 6] << 6; + l2 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_6; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 7] & MASK32_3) << 20; + l3 |= tmp[tmpIdx + 8] << 11; + l3 |= tmp[tmpIdx + 9] << 2; + l3 |= (tmp[tmpIdx + 10] >>> 7) & MASK32_2; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 10] & MASK32_7) << 16; + l4 |= tmp[tmpIdx + 11] << 7; + l4 |= (tmp[tmpIdx + 12] >>> 2) & MASK32_7; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 12] & MASK32_2) << 21; + l5 |= tmp[tmpIdx + 13] << 12; + l5 |= tmp[tmpIdx + 14] << 3; + l5 |= (tmp[tmpIdx + 15] >>> 6) & MASK32_3; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 15] & MASK32_6) << 17; + l6 |= tmp[tmpIdx + 16] << 8; + l6 |= (tmp[tmpIdx + 17] >>> 1) & MASK32_8; + ints[intsIdx + 6] = l6; + int l7 = (tmp[tmpIdx + 17] & MASK32_1) << 22; + l7 |= tmp[tmpIdx + 18] << 13; + l7 |= tmp[tmpIdx + 19] << 4; + l7 |= (tmp[tmpIdx + 20] >>> 5) & MASK32_4; + ints[intsIdx + 7] = l7; + int l8 = (tmp[tmpIdx + 20] & MASK32_5) << 18; + l8 |= tmp[tmpIdx + 21] << 9; + l8 |= tmp[tmpIdx + 22] << 0; + ints[intsIdx + 8] = l8; + } + } + + static void decode24(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(96, ints, 8, 24, MASK32_24, tmp, 0, MASK32_8); + for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 32; ++iter, tmpIdx += 3, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 16; + l0 |= tmp[tmpIdx + 1] << 8; + l0 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 0] = l0; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101Codec.java new file mode 100644 index 000000000000..2b764b876856 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101Codec.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene101; + +import java.util.Objects; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * Implements the Lucene 10.1 index format + * + *

    If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene101 package documentation for file format details. + * @lucene.experimental + */ +public class Lucene101Codec extends Codec { + + /** Configuration option for the codec. */ + public enum Mode { + /** Trade compression ratio for retrieval speed. */ + BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED), + /** Trade retrieval speed for compression ratio. */ + BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION); + + private final Lucene90StoredFieldsFormat.Mode storedMode; + + private Mode(Lucene90StoredFieldsFormat.Mode storedMode) { + this.storedMode = Objects.requireNonNull(storedMode); + } + } + + private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene90CompoundFormat(); + private final NormsFormat normsFormat = new Lucene90NormsFormat(); + + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = + new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene101Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = + new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene101Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = + new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Lucene101Codec.this.getKnnVectorsFormatForField(field); + } + }; + + private final StoredFieldsFormat storedFieldsFormat; + + /** Instantiates a new codec. */ + public Lucene101Codec() { + this(Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression mode to use. + * + * @param mode stored fields compression mode to use for newly flushed/merged segments. 
+ */ + public Lucene101Codec(Mode mode) { + super("Lucene101"); + this.storedFieldsFormat = + new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); + this.defaultPostingsFormat = new Lucene101PostingsFormat(); + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene90PointsFormat(); + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

    The default implementation always returns "Lucene101". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field + * . + * + *

    The default implementation always returns "Lucene90". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field + * + *

    The default implementation always returns "Lucene99HnswVectorsFormat". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java new file mode 100644 index 000000000000..e228f1090ab8 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java @@ -0,0 +1,492 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; +import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.TermState; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Lucene 10.1 postings format, which encodes postings in packed integer blocks for fast decode. + * + *

    Basic idea: + * + *

      + *
    • Packed Blocks and VInt Blocks: + *

      In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed + * format}): the block size (i.e. number of integers inside block) is fixed (currently 128). + * Additionally blocks that are all the same value are encoded in an optimized way. + *

      In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block + * size is variable. + *

    • Block structure: + *

      When the postings are long enough, Lucene101PostingsFormat will try to encode most + * integer data as a packed block. + *

      Take a term with 259 documents as an example, the first 256 document ids are encoded as + * two packed blocks, while the remaining 3 are encoded as one VInt block. + *

      Different kinds of data are always encoded separately into different packed blocks, but + * may possibly be interleaved into the same VInt block. + *

      This strategy is applied to pairs: <document number, frequency>, <position, + * payload length>, <position, offset start, offset length>, and <position, + * payload length, offset start, offset length>. +

    • Skipdata: + *

      Skipdata is interleaved with blocks on 2 levels. Level 0 skip data is interleaved + * between every packed block. Level 1 skip data is interleaved between every 32 packed + * blocks. + *

    • Positions, Payloads, and Offsets: + *

      A position is an integer indicating where the term occurs within one document. A payload + * is a blob of metadata associated with current position. An offset is a pair of integers + * indicating the tokenized start/end offsets for given term in current position: it is + * essentially a specialized payload. + *

      When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets + * (assuming a null payload contributes one count). As mentioned in block structure, it is + * possible to encode these three either combined or separately. + *

      In all cases, payloads and offsets are stored together. When encoded as a packed block, + * position data is separated out as .pos, while payloads and offsets are encoded in .pay + * (payload metadata will also be stored directly in .pay). When encoded as VInt blocks, all + * these three are stored interleaved into the .pos (so is payload metadata). + *

      With this strategy, the majority of payload and offset data will be outside .pos file. + * So for queries that require only position data, running on a full index with payloads and + * offsets, this reduces disk pre-fetches. + *
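To make the packed/VInt split from the "Block structure" bullet above concrete, here is a minimal, self-contained sketch (illustrative only, not code from this patch): with a block size of 128, a term with 259 documents yields two full packed blocks plus a 3-document tail VInt block.

// Illustrative sketch only: the packed/VInt split described above.
public class BlockSplitSketch {
  static final int BLOCK_SIZE = 128; // same value as ForUtil.BLOCK_SIZE

  public static void main(String[] args) {
    int docFreq = 259;
    int packedBlocks = docFreq / BLOCK_SIZE; // 2 packed blocks covering 256 docs
    int tailDocs = docFreq % BLOCK_SIZE;     // 3 remaining docs go into one VInt block
    System.out.println(packedBlocks + " packed blocks + " + tailDocs + " docs in a VInt block");
  }
}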

    + * + *

    Files and detailed format: + * + *

    + * + * + * + *
    + *
    Term Dictionary + *

    The .tim file contains the list of terms in each field along with per-term statistics + * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the + * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on + * the format. + *

    NOTE: The term dictionary can plug into different postings implementations: the postings + * writer/reader are actually responsible for encoding and decoding the PostingsHeader and + * TermMetadata sections described here: + *

      + *
    • PostingsHeader --> Header, PackedBlockSize + *
    • TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, + * PayFPDelta? + *
    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
    • PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt} + *
    • DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta --> {@link + * DataOutput#writeVLong VLong} + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + *

    Notes: + *

      + *
    • Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version + * information for the postings. + *
    • PackedBlockSize is the fixed block size for packed blocks. In a packed block, the bit width + * is determined by the largest integer. A smaller block size results in smaller variance + * among the widths of integers, hence smaller indexes. A larger block size results in more + * efficient bulk I/O, hence better acceleration. This value should always be a multiple + * of 64, currently fixed at 128 as a tradeoff. It is also the skip interval used to + * accelerate {@link org.apache.lucene.index.PostingsEnum#advance(int)}. +
    • DocFPDelta determines the position of this term's TermFreqs within the .doc file. In + * particular, it is the difference in file offset between this term's data and the previous + * term's data (or zero, for the first term in the block). On disk it is stored as the + * difference from the previous value in the sequence. +
    • PosFPDelta determines the position of this term's TermPositions within the .pos file. + * While PayFPDelta determines the position of this term's <TermPayloads, + * TermOffsets?> within the .pay file. Similar to DocFPDelta, it is the difference + * between two file positions (or neglected, for fields that omit payloads and offsets). + *
    • PosVIntBlockFPDelta determines the position of this term's last TermPosition in the last + * packed position block within the .pos file. It is a synonym for PayVIntBlockFPDelta or + * OffsetVIntBlockFPDelta. This is actually used to indicate whether it is necessary to + * load the following payloads and offsets from .pos instead of .pay. Every time a new block + * of positions is to be loaded, the PostingsReader will use this value to check + * whether the current block is in packed format or VInt. When in packed format, payloads and + * offsets are fetched from .pay, otherwise from .pos. (This value is ignored when the + * total number of positions, i.e. totalTermFreq, is less than or equal to PackedBlockSize.) +
    • SingletonDocID is an optimization when a term only appears in one document. In this + * case, instead of writing a file pointer to the .doc file (DocFPDelta), and then a + * VIntBlock at that location, the single document ID is written to the term dictionary. + *
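As a rough illustration of the PackedBlockSize note above (a sketch with an assumed input value, not code from this patch): the bit width of a packed block is chosen from its largest value, and 128 values at bitsPerValue bits occupy 128 * bitsPerValue / 8 bytes, which is the same arithmetic as ForUtil.numBytes.

// Illustrative sketch only: bit width selection and size arithmetic for one packed block.
public class PackedBlockSizeSketch {
  static final int BLOCK_SIZE = 128;

  public static void main(String[] args) {
    int largestValue = 1000; // assumed largest value in this block, for illustration
    int bitsPerValue = 32 - Integer.numberOfLeadingZeros(largestValue); // 10 bits
    int bytesPerBlock = BLOCK_SIZE * bitsPerValue / 8; // 160 bytes, same as ForUtil.numBytes(10)
    System.out.println(bitsPerValue + " bits per value -> " + bytesPerBlock + " bytes per block");
  }
}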
    + *
    + * + * + * + *
    + *
    Term Index + *

    The .tip file contains an index into the term dictionary, so that it can be accessed + * randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format. + *

    + * + * + * + *
    + *
    Frequencies and Skip Data + *

    The .doc file contains the lists of documents which contain each term, along with the + * frequency of the term in that document (except when frequencies are omitted: {@link + * IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data + * is saved once for the entire postings list. + *

      + *
    • docFile(.doc) --> Header, <TermFreqs>TermCount, Footer + *
    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
    • TermFreqs --> <PackedBlock32> PackedDocBlockNum/32, VIntBlock? + *
    • PackedBlock32 --> Level1SkipData, <PackedBlock> 32 + *
    • PackedBlock --> Level0SkipData, PackedDocDeltaBlock, PackedFreqBlock? + *
    • VIntBlock --> + * <DocDelta[,Freq?]>DocFreq-PackedBlockSize*PackedDocBlockNum + *
    • Level1SkipData --> DocDelta, DocFPDelta, Skip1NumBytes?, ImpactLength?, Impacts?, + * PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto? + *
    • Level0SkipData --> Skip0NumBytes, DocDelta, DocFPDelta, PackedBlockLength, + * ImpactLength?, Impacts?, PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto? + *
    • PackedFreqBlock --> {@link PackedInts PackedInts}, uses patching + *
    • PackedDocDeltaBlock --> {@link PackedInts PackedInts}, does not use patching + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + *

    Notes: + *

      + *
    • PackedDocDeltaBlock is theoretically generated from two steps: + *
        + *
      1. Calculate the difference between each document number and previous one, and get + * a d-gaps list (for the first document, use absolute value); + *
      2. For those d-gaps from the first one to the + * (PackedDocBlockNum*PackedBlockSize)-th, separately encode as packed + * blocks. +
      + * If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step. + *
    • VIntBlock stores remaining d-gaps (along with frequencies when possible) with a + * format that encodes DocDelta and Freq: + *

      DocDelta: if frequencies are indexed, this determines both the document number and + * the frequency. In particular, DocDelta/2 is the difference between this document + * number and the previous document number (or zero when this is the first document in a + * TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the + * frequency is read as another VInt. If frequencies are omitted, DocDelta contains the + * gap (not multiplied by 2) between document numbers and no frequency information is + * stored. + *

      For example, the TermFreqs for a term which occurs once in document seven and + * three times in document eleven, with frequencies indexed, would be the following + * sequence of VInts: + *

      15, 8, 3 + *

      If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence + * of VInts instead: + *

      7,4 + *

    • PackedDocBlockNum is the number of packed blocks for current term's docids or + * frequencies. In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize) + *
    • On skip data, DocDelta is the delta between the last doc of the previous block - or + * -1 if there is no previous block - and the last doc of this block. This helps know by + * how much the doc ID should be incremented in case the block gets skipped. + *
    • Skip0Length is the length of skip data at level 0. Encoding it makes it possible to + * quickly skip over the skip data when it is not needed, e.g. if only using nextDoc(). It + * is also used when only the first fields of skip data are needed, in order to skip + * over remaining fields without reading them. +
    • ImpactLength and Impacts are only stored if frequencies are indexed. + *
    • Since positions and payloads are also block encoded, the skip should skip to related + * block first, then fetch the values according to in-block offset. PosFPSkip and + * PayFPSkip record the file offsets of related block in .pos and .pay, respectively. + * While PosBlockOffset indicates which value to fetch inside the related block + * (PayBlockOffset is unnecessary since it is always equal to PosBlockOffset). Same as + * DocFPSkip, the file offsets are relative to the start of current term's TermFreqs, + * and stored as a difference sequence. + *
    • PayByteUpto indicates the start offset of the current payload. It is equivalent to + * the sum of the payload lengths in the current block up to PosBlockOffset + *
    • ImpactLength is the total length of CompetitiveFreqDelta and CompetitiveNormDelta + * pairs. CompetitiveFreqDelta and CompetitiveNormDelta are used to safely skip score + * calculation for uncompetitive documents; See {@link + * org.apache.lucene.codecs.CompetitiveImpactAccumulator} for more details. + *
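The DocDelta encoding in the notes above can be reproduced with a small, self-contained sketch (illustrative only, not the actual Lucene writer code): with frequencies indexed, each doc-id gap is left-shifted by one and the low bit marks a frequency of one, which yields the 15, 8, 3 and 7, 4 sequences from the worked example.

// Illustrative sketch only: tail VInt block DocDelta/Freq encoding described above.
import java.util.ArrayList;
import java.util.List;

public class DocDeltaSketch {
  static List<Integer> encode(int[] docs, int[] freqs, boolean indexFreqs) {
    List<Integer> out = new ArrayList<>();
    int prevDoc = 0;
    for (int i = 0; i < docs.length; i++) {
      int gap = docs[i] - prevDoc; // first document uses its absolute value
      prevDoc = docs[i];
      if (!indexFreqs) {
        out.add(gap);                 // frequencies omitted: plain gap
      } else if (freqs[i] == 1) {
        out.add((gap << 1) | 1);      // odd DocDelta => frequency is one
      } else {
        out.add(gap << 1);            // even DocDelta => frequency follows
        out.add(freqs[i]);
      }
    }
    return out;
  }

  public static void main(String[] args) {
    int[] docs = {7, 11};
    int[] freqs = {1, 3};
    System.out.println(encode(docs, freqs, true));  // [15, 8, 3]
    System.out.println(encode(docs, freqs, false)); // [7, 4]
  }
}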
    + *
    + * + * + * + *
    + *
    Positions + *

    The .pos file contains the lists of positions that each term occurs at within documents. + * It also sometimes stores part of payloads and offsets for speedup. + *

      + *
    • PosFile(.pos) --> Header, <TermPositions> TermCount, Footer + *
    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
    • TermPositions --> <PackedPosDeltaBlock> PackedPosBlockNum, + * VIntBlock? + *
    • VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?, + * OffsetLength?>PosVIntCount + *
    • PackedPosDeltaBlock --> {@link PackedInts PackedInts} + *
    • PositionDelta, OffsetDelta, OffsetLength --> {@link DataOutput#writeVInt VInt} + *
    • PayloadData --> {@link DataOutput#writeByte byte}PayLength + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + *

    Notes: + *

      + *
    • TermPositions are ordered by term (terms are implicit, from the term dictionary), and + * position values for each term-document pair are incremental, and ordered by document + * number. +
    • PackedPosBlockNum is the number of packed blocks for current term's positions, + * payloads or offsets. In particular, PackedPosBlockNum = + * floor(totalTermFreq/PackedBlockSize) + *
    • PosVIntCount is the number of positions encoded as VInt format. In particular, + * PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize + *
    • The procedure how PackedPosDeltaBlock is generated is the same as PackedDocDeltaBlock + * in chapter Frequencies and Skip Data. + *
    • PositionDelta is, if payloads are disabled for the term's field, the difference + * between the position of the current occurrence in the document and the previous + * occurrence (or zero, if this is the first occurrence in this document). If payloads + * are enabled for the term's field, then PositionDelta/2 is the difference between the + * current and the previous position. If payloads are enabled and PositionDelta is odd, + * then PayloadLength is stored, indicating the length of the payload at the current + * term position. + *
    • For example, the TermPositions for a term which occurs as the fourth term in one + * document, and as the fifth and ninth term in a subsequent document, would be the + * following sequence of VInts (payloads disabled): + *

      4, 5, 4 + *

    • PayloadData is metadata associated with the current term position. If PayloadLength + * is stored at the current position, then it indicates the length of this payload. If + * PayloadLength is not stored, then this payload has the same length as the payload at + * the previous position. + *
    • OffsetDelta/2 is the difference between this position's startOffset from the previous + * occurrence (or zero, if this is the first occurrence in this document). If + * OffsetDelta is odd, then the length (endOffset-startOffset) differs from the previous + * occurrence and an OffsetLength follows. Offset data is only written for {@link + * IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}. + *
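A small sketch (illustrative only, payloads disabled; not the actual Lucene writer code) of the PositionDelta rule above: deltas restart at each document and are taken against the previous occurrence within the same document, reproducing the 4, 5, 4 example.

// Illustrative sketch only: per-document position deltas with payloads disabled.
import java.util.ArrayList;
import java.util.List;

public class PositionDeltaSketch {
  // positionsPerDoc[d] holds the positions of the term inside document d, in order.
  static List<Integer> encode(int[][] positionsPerDoc) {
    List<Integer> out = new ArrayList<>();
    for (int[] positions : positionsPerDoc) {
      int prev = 0; // deltas are relative to the previous occurrence in the same document
      for (int pos : positions) {
        out.add(pos - prev);
        prev = pos;
      }
    }
    return out;
  }

  public static void main(String[] args) {
    // Term occurs as the 4th term of one document and as the 5th and 9th term of another.
    System.out.println(encode(new int[][] {{4}, {5, 9}})); // [4, 5, 4]
  }
}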
    + *
    + * + * + * + *
    + *
    Payloads and Offsets + *

    The .pay file will store payloads and offsets associated with certain term-document + * positions. Some payloads and offsets will be separated out into .pos file, for performance + * reasons. + *

      + *
    • PayFile(.pay): --> Header, <TermPayloads?, TermOffsets?> + * TermCount, Footer + *
    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
    • TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData> + * PackedPayBlockNum + *
    • TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock> + * PackedPayBlockNum + *
    • PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> + * {@link PackedInts PackedInts} + *
    • SumPayLength --> {@link DataOutput#writeVInt VInt} + *
    • PayData --> {@link DataOutput#writeByte byte}SumPayLength + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + *

    Notes: + *

      + *
    • The order of TermPayloads/TermOffsets will be the same as TermPositions, note that + * part of payload/offsets are stored in .pos. + *
    • PackedPayLengthBlock and PackedOffsetLengthBlock are generated in the same way as + * PackedFreqBlock in chapter Frequencies and Skip + * Data, while PackedOffsetStartDeltaBlock follows the same procedure as + * PackedDocDeltaBlock. +
    • PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also + * synonym for PackedOffsetBlockNum. + *
    • SumPayLength is the total length of payloads written within one block; it should be the + * sum of the PayLengths in one packed block. +
    • PayLength in PackedPayLengthBlock is the length of each payload associated with the + * current position. + *
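For one packed block of positions, the relationship between PackedPayLengthBlock, SumPayLength, and PayData can be sketched as follows (illustrative only, with made-up payload bytes; this is not the actual .pay writer):

// Illustrative sketch only: payload lengths, their sum, and the concatenated payload bytes.
import java.io.ByteArrayOutputStream;
import java.io.IOException;

public class PayBlockSketch {
  public static void main(String[] args) throws IOException {
    byte[][] payloadsInBlock = {"ab".getBytes(), "c".getBytes(), "defg".getBytes()};

    int[] payLengths = new int[payloadsInBlock.length];          // -> PackedPayLengthBlock
    int sumPayLength = 0;                                        // -> SumPayLength (a VInt)
    ByteArrayOutputStream payData = new ByteArrayOutputStream(); // -> PayData
    for (int i = 0; i < payloadsInBlock.length; i++) {
      payLengths[i] = payloadsInBlock[i].length;
      sumPayLength += payLengths[i];
      payData.write(payloadsInBlock[i]);
    }
    // lengths=[2, 1, 4] sum=7 dataBytes=7
    System.out.println("lengths=" + java.util.Arrays.toString(payLengths)
        + " sum=" + sumPayLength + " dataBytes=" + payData.size());
  }
}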
    + *
    + * + * @lucene.experimental + */ +public final class Lucene101PostingsFormat extends PostingsFormat { + + /** Filename extension for some small metadata about how postings are encoded. */ + public static final String META_EXTENSION = "psm"; + + /** + * Filename extension for document number, frequencies, and skip data. See chapter: Frequencies and Skip Data + */ + public static final String DOC_EXTENSION = "doc"; + + /** Filename extension for positions. See chapter: Positions */ + public static final String POS_EXTENSION = "pos"; + + /** + * Filename extension for payloads and offsets. See chapter: Payloads and + * Offsets + */ + public static final String PAY_EXTENSION = "pay"; + + /** Size of blocks. */ + public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE; + + public static final int BLOCK_MASK = BLOCK_SIZE - 1; + + /** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */ + public static final int LEVEL1_FACTOR = 32; + + /** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */ + public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE; + + public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1; + + static final String TERMS_CODEC = "Lucene90PostingsWriterTerms"; + static final String META_CODEC = "Lucene101PostingsWriterMeta"; + static final String DOC_CODEC = "Lucene101PostingsWriterDoc"; + static final String POS_CODEC = "Lucene101PostingsWriterPos"; + static final String PAY_CODEC = "Lucene101PostingsWriterPay"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + private final int minTermBlockSize; + private final int maxTermBlockSize; + + /** Creates {@code Lucene101PostingsFormat} with default settings. */ + public Lucene101PostingsFormat() { + this( + Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); + } + + /** + * Creates {@code Lucene101PostingsFormat} with custom values for {@code minBlockSize} and {@code + * maxBlockSize} passed to block terms dictionary. + * + * @see + * Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) + */ + public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + super("Lucene101"); + Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); + this.minTermBlockSize = minTermBlockSize; + this.maxTermBlockSize = maxTermBlockSize; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); + boolean success = false; + try { + FieldsConsumer ret = + new Lucene90BlockTreeTermsWriter( + state, postingsWriter, minTermBlockSize, maxTermBlockSize); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene101PostingsReader(state); + boolean success = false; + try { + FieldsProducer ret = new Lucene90BlockTreeTermsReader(postingsReader, state); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } + + /** + * Holds all state required for {@link Lucene101PostingsReader} to produce a {@link + * org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict. 
+ * + * @lucene.internal + */ + public static final class IntBlockTermState extends BlockTermState { + /** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */ + public long docStartFP; + + /** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */ + public long posStartFP; + + /** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */ + public long payStartFP; + + /** + * file offset for the last position in the last block, if there are more than {@link + * ForUtil#BLOCK_SIZE} positions; otherwise -1 + * + *

    One might think to use total term frequency to track how many positions are left to read + * as we decode the blocks, and decode the last block differently when num_left_positions < + * BLOCK_SIZE. Unfortunately this won't work since the tracking will be messed up when we skip + * blocks as the skipper will only tell us new position offset (start of block) and number of + * positions to skip for that block, without telling us how many positions it has skipped. + */ + public long lastPosBlockOffset; + + /** + * docid when there is a single pulsed posting, otherwise -1. freq is always implicitly + * totalTermFreq in this case. + */ + public int singletonDocID; + + /** Sole constructor. */ + public IntBlockTermState() { + lastPosBlockOffset = -1; + singletonDocID = -1; + } + + @Override + public IntBlockTermState clone() { + IntBlockTermState other = new IntBlockTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + IntBlockTermState other = (IntBlockTermState) _other; + docStartFP = other.docStartFP; + posStartFP = other.posStartFP; + payStartFP = other.payStartFP; + lastPosBlockOffset = other.lastPosBlockOffset; + singletonDocID = other.singletonDocID; + } + + @Override + public String toString() { + return super.toString() + + " docStartFP=" + + docStartFP + + " posStartFP=" + + posStartFP + + " payStartFP=" + + payStartFP + + " lastPosBlockOffset=" + + lastPosBlockOffset + + " singletonDocID=" + + singletonDocID; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java new file mode 100644 index 000000000000..ce32e8534249 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -0,0 +1,1862 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene101; + +import static org.apache.lucene.codecs.lucene101.ForUtil.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_NUM_DOCS; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_START; + +import java.io.IOException; +import java.util.AbstractList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.RandomAccess; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SlowImpactsEnum; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.internal.vectorization.VectorizationProvider; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.VectorUtil; + +/** + * Concrete class that reads docId(maybe frq,pos,offset,payloads) list with postings format. + * + * @lucene.experimental + */ +public final class Lucene101PostingsReader extends PostingsReaderBase { + + static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance(); + // Dummy impacts, composed of the maximum possible term frequency and the lowest possible + // (unsigned) norm value. This is typically used on tail blocks, which don't actually record + // impacts as the storage overhead would not be worth any query evaluation speedup, since there's + // less than 128 docs left to evaluate anyway. + private static final List DUMMY_IMPACTS = + Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + + private final IndexInput docIn; + private final IndexInput posIn; + private final IndexInput payIn; + + private final int maxNumImpactsAtLevel0; + private final int maxImpactNumBytesAtLevel0; + private final int maxNumImpactsAtLevel1; + private final int maxImpactNumBytesAtLevel1; + + /** Sole constructor. 
*/ + public Lucene101PostingsReader(SegmentReadState state) throws IOException { + String metaName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.META_EXTENSION); + final long expectedDocFileLength, expectedPosFileLength, expectedPayFileLength; + ChecksumIndexInput metaIn = null; + boolean success = false; + int version; + try { + metaIn = state.directory.openChecksumInput(metaName); + version = + CodecUtil.checkIndexHeader( + metaIn, + META_CODEC, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + maxNumImpactsAtLevel0 = metaIn.readInt(); + maxImpactNumBytesAtLevel0 = metaIn.readInt(); + maxNumImpactsAtLevel1 = metaIn.readInt(); + maxImpactNumBytesAtLevel1 = metaIn.readInt(); + expectedDocFileLength = metaIn.readLong(); + if (state.fieldInfos.hasProx()) { + expectedPosFileLength = metaIn.readLong(); + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + expectedPayFileLength = metaIn.readLong(); + } else { + expectedPayFileLength = -1; + } + } else { + expectedPosFileLength = -1; + expectedPayFileLength = -1; + } + CodecUtil.checkFooter(metaIn, null); + success = true; + } catch (Throwable t) { + if (metaIn != null) { + CodecUtil.checkFooter(metaIn, t); + throw new AssertionError("unreachable"); + } else { + throw t; + } + } finally { + if (success) { + metaIn.close(); + } else { + IOUtils.closeWhileHandlingException(metaIn); + } + } + + success = false; + IndexInput docIn = null; + IndexInput posIn = null; + IndexInput payIn = null; + + // NOTE: these data files are too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + + String docName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.DOC_EXTENSION); + try { + // Postings have a forward-only access pattern, so pass ReadAdvice.NORMAL to perform + // readahead. 
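Aside: within each 128-doc block the reader stores doc IDs as deltas and restores absolute IDs with a running prefix sum (the prefixSum helper and ForDeltaUtil.decodeAndPrefixSum below). A minimal round-trip sketch of that idea, independent of the Lucene classes:

```java
/** Toy delta-encode / prefix-sum-decode round trip; same shape as prefixSum(int[], int, long). */
public class DeltaRoundTrip {
  // Encode absolute doc IDs as deltas against the previous value (prevDocID for the first one).
  static int[] toDeltas(int[] docIds, int prevDocID) {
    int[] deltas = new int[docIds.length];
    int prev = prevDocID;
    for (int i = 0; i < docIds.length; i++) {
      deltas[i] = docIds[i] - prev;
      prev = docIds[i];
    }
    return deltas;
  }

  // Add the base to the first delta, then accumulate left to right.
  static void prefixSum(int[] buffer, int count, int base) {
    buffer[0] += base;
    for (int i = 1; i < count; ++i) {
      buffer[i] += buffer[i - 1];
    }
  }

  public static void main(String[] args) {
    int prevDocID = 41;                 // last doc ID of the previous block
    int[] docIds = {43, 47, 60, 61, 90};
    int[] deltas = toDeltas(docIds, prevDocID);
    prefixSum(deltas, deltas.length, prevDocID);
    System.out.println(java.util.Arrays.equals(deltas, docIds)); // true
  }
}
```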
+ docIn = state.directory.openInput(docName, state.context.withReadAdvice(ReadAdvice.NORMAL)); + CodecUtil.checkIndexHeader( + docIn, DOC_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(docIn, expectedDocFileLength); + + if (state.fieldInfos.hasProx()) { + String proxName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.POS_EXTENSION); + posIn = state.directory.openInput(proxName, state.context); + CodecUtil.checkIndexHeader( + posIn, POS_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(posIn, expectedPosFileLength); + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene101PostingsFormat.PAY_EXTENSION); + payIn = state.directory.openInput(payName, state.context); + CodecUtil.checkIndexHeader( + payIn, PAY_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(payIn, expectedPayFileLength); + } + } + + this.docIn = docIn; + this.posIn = posIn; + this.payIn = payIn; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(docIn, posIn, payIn); + } + } + } + + @Override + public void init(IndexInput termsIn, SegmentReadState state) throws IOException { + // Make sure we are talking to the matching postings writer + CodecUtil.checkIndexHeader( + termsIn, + TERMS_CODEC, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + final int indexBlockSize = termsIn.readVInt(); + if (indexBlockSize != BLOCK_SIZE) { + throw new IllegalStateException( + "index-time BLOCK_SIZE (" + + indexBlockSize + + ") != read-time BLOCK_SIZE (" + + BLOCK_SIZE + + ")"); + } + } + + static void prefixSum(int[] buffer, int count, long base) { + buffer[0] += base; + for (int i = 1; i < count; ++i) { + buffer[i] += buffer[i - 1]; + } + } + + @Override + public BlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void close() throws IOException { + IOUtils.close(docIn, posIn, payIn); + } + + @Override + public void decodeTerm( + DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + throws IOException { + final IntBlockTermState termState = (IntBlockTermState) _termState; + if (absolute) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } + + final long l = in.readVLong(); + if ((l & 0x01) == 0) { + termState.docStartFP += l >>> 1; + if (termState.docFreq == 1) { + termState.singletonDocID = in.readVInt(); + } else { + termState.singletonDocID = -1; + } + } else { + assert absolute == false; + assert termState.singletonDocID != -1; + termState.singletonDocID += BitUtil.zigZagDecode(l >>> 1); + } + + if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { + termState.posStartFP += in.readVLong(); + if (fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0 + || fieldInfo.hasPayloads()) { + termState.payStartFP += in.readVLong(); + } + if (termState.totalTermFreq > BLOCK_SIZE) { + termState.lastPosBlockOffset = in.readVLong(); + } else { + termState.lastPosBlockOffset = -1; + } + } + } + + @Override + public PostingsEnum postings( + FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) + throws IOException { 
+ if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0 + || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { + return (reuse instanceof BlockDocsEnum blockDocsEnum + && blockDocsEnum.canReuse(docIn, fieldInfo) + ? blockDocsEnum + : new BlockDocsEnum(fieldInfo)) + .reset((IntBlockTermState) termState, flags); + } else { + return (reuse instanceof EverythingEnum everythingEnum + && everythingEnum.canReuse(docIn, fieldInfo) + ? everythingEnum + : new EverythingEnum(fieldInfo)) + .reset((IntBlockTermState) termState, flags); + } + } + + @Override + public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags) + throws IOException { + final IndexOptions options = fieldInfo.getIndexOptions(); + final boolean indexHasPositions = + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + + if (state.docFreq >= BLOCK_SIZE) { + if (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 + && (indexHasPositions == false + || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false)) { + return new BlockImpactsDocsEnum(indexHasPositions, (IntBlockTermState) state); + } + + if (indexHasPositions + && (options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0 + || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) + && (fieldInfo.hasPayloads() == false + || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { + return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); + } + } + + return new SlowImpactsEnum(postings(fieldInfo, state, null, flags)); + } + + private static long sumOverRange(int[] arr, int start, int end) { + long res = 0L; + for (int i = start; i < end; i++) { + res += arr[i]; + } + return res; + } + + private abstract class AbstractPostingsEnum extends PostingsEnum { + + protected ForDeltaUtil forDeltaUtil; + protected PForUtil pforUtil; + + protected final int[] docBuffer = new int[BLOCK_SIZE + 1]; + protected final boolean indexHasFreq; + + protected int doc; // doc we last read + + // level 0 skip data + protected int level0LastDocID; + + // level 1 skip data + protected int level1LastDocID; + protected long level1DocEndFP; + protected int level1DocCountUpto; + + protected int docFreq; // number of docs in this posting list + protected long + totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + + protected int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + protected int docCountUpto; // number of docs in or before the current block + protected int prevDocID; // last doc ID of the previous block + + protected int docBufferSize; + protected int docBufferUpto; + + protected IndexInput docIn; + protected PostingDecodingUtil docInUtil; + + protected AbstractPostingsEnum(FieldInfo fieldInfo) { + indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in + // advance() + docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; + } + + @Override + public int docID() { + return doc; + } + + protected void resetIndexInput(IntBlockTermState termState) throws IOException { + docFreq = termState.docFreq; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = Lucene101PostingsReader.this.docIn.clone(); + docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); + } + prefetchPostings(docIn, 
termState); + } + } + + protected PostingsEnum resetIdsAndLevelParams(IntBlockTermState termState) throws IOException { + doc = -1; + prevDocID = -1; + docCountUpto = 0; + level0LastDocID = -1; + if (docFreq < LEVEL1_NUM_DOCS) { + level1LastDocID = NO_MORE_DOCS; + if (docFreq > 1) { + docIn.seek(termState.docStartFP); + } + } else { + level1LastDocID = -1; + level1DocEndFP = termState.docStartFP; + } + level1DocCountUpto = 0; + docBufferSize = BLOCK_SIZE; + docBufferUpto = BLOCK_SIZE; + return this; + } + } + + final class BlockDocsEnum extends AbstractPostingsEnum { + + private final int[] freqBuffer = new int[BLOCK_SIZE]; + + private boolean needsFreq; // true if the caller actually needs frequencies + private long freqFP; + + public BlockDocsEnum(FieldInfo fieldInfo) { + super(fieldInfo); + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + final IndexOptions options = fieldInfo.getIndexOptions(); + return docIn == Lucene101PostingsReader.this.docIn + && indexHasFreq == (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0); + } + + public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + resetIndexInput(termState); + if (pforUtil == null && docFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(); + forDeltaUtil = new ForDeltaUtil(); + } + totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; + + this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); + if (indexHasFreq == false || needsFreq == false) { + // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to + // not fill more than `docFreq` entries. + Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); + } + freqFP = -1; + return resetIdsAndLevelParams(termState); + } + + @Override + public int freq() throws IOException { + if (freqFP != -1) { + docIn.seek(freqFP); + pforUtil.decode(docInUtil, freqBuffer); + freqFP = -1; + } + + return freqBuffer[docBufferUpto - 1]; + } + + @Override + public int nextPosition() { + return -1; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public BytesRef getPayload() { + return null; + } + + private void refillFullBlock() throws IOException { + assert docFreq - docCountUpto >= BLOCK_SIZE; + + forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); + + if (indexHasFreq) { + if (needsFreq) { + freqFP = docIn.getFilePointer(); + } + PForUtil.skip(docIn); + } + docCountUpto += BLOCK_SIZE; + prevDocID = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; + } + + private void refillRemainder() throws IOException { + final int left = docFreq - docCountUpto; + assert left >= 0; + assert left < BLOCK_SIZE; + + if (docFreq == 1) { + docBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; + docBuffer[1] = NO_MORE_DOCS; + docCountUpto++; + } else { + // Read vInts: + PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq); + prefixSum(docBuffer, left, prevDocID); + docBuffer[left] = NO_MORE_DOCS; + docCountUpto += left; + } + docBufferUpto = 0; + docBufferSize = left; + freqFP = -1; + } + + private void skipLevel1To(int target) throws IOException { + while (true) { + prevDocID = level1LastDocID; + level0LastDocID = level1LastDocID; + docIn.seek(level1DocEndFP); + docCountUpto = level1DocCountUpto; + level1DocCountUpto += LEVEL1_NUM_DOCS; + + if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { + 
level1LastDocID = NO_MORE_DOCS; + break; + } + + level1LastDocID += docIn.readVInt(); + level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); + + if (level1LastDocID >= target) { + if (indexHasFreq) { + // skip impacts and pos skip data + docIn.skipBytes(docIn.readShort()); + } + break; + } + } + } + + private void skipLevel0To(int target) throws IOException { + while (true) { + prevDocID = level0LastDocID; + if (docFreq - docCountUpto >= BLOCK_SIZE) { + long skip0NumBytes = docIn.readVLong(); + // end offset of skip data (before the actual data starts) + long skip0EndFP = docIn.getFilePointer() + skip0NumBytes; + int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + + if (target <= level0LastDocID) { + docIn.seek(skip0EndFP); + break; + } + + // skip block + docIn.skipBytes(readVLong15(docIn)); + docCountUpto += BLOCK_SIZE; + } else { + level0LastDocID = NO_MORE_DOCS; + break; + } + } + } + + private void moveToNextLevel0Block() throws IOException { + if (doc == level1LastDocID) { // advance skip data on level 1 + skipLevel1To(doc + 1); + } + + prevDocID = level0LastDocID; + if (docFreq - docCountUpto >= BLOCK_SIZE) { + docIn.skipBytes(docIn.readVLong()); + refillFullBlock(); + level0LastDocID = docBuffer[BLOCK_SIZE - 1]; + } else { + level0LastDocID = NO_MORE_DOCS; + refillRemainder(); + } + } + + @Override + public int nextDoc() throws IOException { + if (docBufferUpto == BLOCK_SIZE) { // advance skip data on level 0 + moveToNextLevel0Block(); + } + + return this.doc = docBuffer[docBufferUpto++]; + } + + @Override + public int advance(int target) throws IOException { + if (target > level0LastDocID) { // advance skip data on level 0 + + if (target > level1LastDocID) { // advance skip data on level 1 + skipLevel1To(target); + } + + skipLevel0To(target); + + if (docFreq - docCountUpto >= BLOCK_SIZE) { + refillFullBlock(); + } else { + refillRemainder(); + } + } + + int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + this.doc = docBuffer[next]; + docBufferUpto = next + 1; + return doc; + } + + @Override + public long cost() { + return docFreq; + } + } + + final class EverythingEnum extends AbstractPostingsEnum { + + private final int[] freqBuffer = new int[BLOCK_SIZE + 1]; + private final int[] posDeltaBuffer = new int[BLOCK_SIZE]; + + private final int[] payloadLengthBuffer; + private final int[] offsetStartDeltaBuffer; + private final int[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset; + private int endOffset; + + private int posBufferUpto; + + final IndexInput posIn; + final PostingDecodingUtil posInUtil; + final IndexInput payIn; + final PostingDecodingUtil payInUtil; + final BytesRef payload; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + final boolean indexHasOffsetsOrPayloads; + + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + private long level0PosEndFP; + private int level0BlockPosUpto; + private long level0PayEndFP; + private int level0BlockPayUpto; + + private long level1PosEndFP; + private int level1BlockPosUpto; + private long level1PayEndFP; + private int level1BlockPayUpto; + + private boolean needsOffsets; // true if we actually need offsets + private boolean needsPayloads; // true if we actually need payloads + + public EverythingEnum(FieldInfo fieldInfo) throws IOException { + super(fieldInfo); + indexHasOffsets = + fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; + + this.posIn = Lucene101PostingsReader.this.posIn.clone(); + posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); + if (indexHasOffsetsOrPayloads) { + this.payIn = Lucene101PostingsReader.this.payIn.clone(); + payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn); + } else { + this.payIn = null; + payInUtil = null; + } + if (indexHasOffsets) { + offsetStartDeltaBuffer = new int[BLOCK_SIZE]; + offsetLengthBuffer = new int[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + if (indexHasPayloads) { + payloadLengthBuffer = new int[BLOCK_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == Lucene101PostingsReader.this.docIn + && indexHasOffsets + == (fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0) + && indexHasPayloads == fieldInfo.hasPayloads(); + } + + public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + resetIndexInput(termState); + if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) { + forDeltaUtil = new ForDeltaUtil(); + } + totalTermFreq = termState.totalTermFreq; + if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(); + } + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; + // Where this term's payloads/offsets start in the .pay + // file: + final long payTermStartFP = termState.payStartFP; + posIn.seek(posTermStartFP); + if (indexHasOffsetsOrPayloads) { + payIn.seek(payTermStartFP); + } + level1PosEndFP = posTermStartFP; + level1PayEndFP = payTermStartFP; + level0PosEndFP = posTermStartFP; + level0PayEndFP = payTermStartFP; + posPendingCount = 0; + payloadByteUpto = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + this.needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + this.needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + + level1BlockPosUpto = 0; + level1BlockPayUpto = 0; + level0BlockPosUpto = 0; + level0BlockPayUpto = 0; + posBufferUpto = BLOCK_SIZE; + + return resetIdsAndLevelParams(termState); + } + + @Override + public int freq() { + return freq; + } + + private void refillDocs() throws IOException { + final int left = docFreq - 
docCountUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); + pforUtil.decode(docInUtil, freqBuffer); + docCountUpto += BLOCK_SIZE; + } else if (docFreq == 1) { + docBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; + docBuffer[1] = NO_MORE_DOCS; + docCountUpto++; + docBufferSize = 1; + } else { + // Read vInts: + PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); + prefixSum(docBuffer, left, prevDocID); + docBuffer[left] = NO_MORE_DOCS; + docCountUpto += left; + docBufferSize = left; + } + prevDocID = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; + } + + private void skipLevel1To(int target) throws IOException { + while (true) { + prevDocID = level1LastDocID; + level0LastDocID = level1LastDocID; + docIn.seek(level1DocEndFP); + level0PosEndFP = level1PosEndFP; + level0BlockPosUpto = level1BlockPosUpto; + if (indexHasOffsetsOrPayloads) { + level0PayEndFP = level1PayEndFP; + level0BlockPayUpto = level1BlockPayUpto; + } + docCountUpto = level1DocCountUpto; + level1DocCountUpto += LEVEL1_NUM_DOCS; + + if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { + level1LastDocID = NO_MORE_DOCS; + break; + } + + level1LastDocID += docIn.readVInt(); + long delta = docIn.readVLong(); + level1DocEndFP = delta + docIn.getFilePointer(); + + long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); + docIn.skipBytes(docIn.readShort()); // impacts + level1PosEndFP += docIn.readVLong(); + level1BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level1PayEndFP += docIn.readVLong(); + level1BlockPayUpto = docIn.readVInt(); + } + assert docIn.getFilePointer() == skip1EndFP; + + if (level1LastDocID >= target) { + break; + } + } + } + + private void moveToNextLevel0Block() throws IOException { + if (doc == level1LastDocID) { // advance level 1 skip data + skipLevel1To(doc + 1); + } + + // Now advance level 0 skip data + prevDocID = level0LastDocID; + + assert docBufferUpto == BLOCK_SIZE; + if (level0PosEndFP >= posIn.getFilePointer()) { + posIn.seek(level0PosEndFP); + posPendingCount = level0BlockPosUpto; + if (indexHasOffsetsOrPayloads) { + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(level0PayEndFP); + payloadByteUpto = level0BlockPayUpto; + } + posBufferUpto = BLOCK_SIZE; + } + + if (docFreq - docCountUpto >= BLOCK_SIZE) { + docIn.readVLong(); // skip0 num bytes + int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + readVLong15(docIn); // block length + docIn.skipBytes(docIn.readVLong()); // impacts + + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level0PayEndFP += docIn.readVLong(); + level0BlockPayUpto = docIn.readVInt(); + } + } else { + level0LastDocID = NO_MORE_DOCS; + } + + refillDocs(); + } + + @Override + public int nextDoc() throws IOException { + if (docBufferUpto == BLOCK_SIZE) { // advance level 0 skip data + moveToNextLevel0Block(); + } + + this.doc = docBuffer[docBufferUpto]; + this.freq = freqBuffer[docBufferUpto]; + docBufferUpto++; + posPendingCount += freq; + position = 0; + lastStartOffset = 0; + return doc; + } + + private void skipLevel0To(int target) throws IOException { + while (true) { + prevDocID = level0LastDocID; + + // If nextBlockPosFP is less than the current FP, it means that the block of positions for + // the first docs of the next block are already decoded. 
In this case we just accumulate + // frequencies into posPendingCount instead of seeking backwards and decoding the same pos + // block again. + if (level0PosEndFP >= posIn.getFilePointer()) { + posIn.seek(level0PosEndFP); + posPendingCount = level0BlockPosUpto; + if (indexHasOffsetsOrPayloads) { + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(level0PayEndFP); + payloadByteUpto = level0BlockPayUpto; + } + posBufferUpto = BLOCK_SIZE; + } else { + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE); + } + + if (docFreq - docCountUpto >= BLOCK_SIZE) { + docIn.readVLong(); // skip0 num bytes + int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + + long blockLength = readVLong15(docIn); + long blockEndFP = docIn.getFilePointer() + blockLength; + docIn.skipBytes(docIn.readVLong()); // impacts + + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level0PayEndFP += docIn.readVLong(); + level0BlockPayUpto = docIn.readVInt(); + } + + if (target <= level0LastDocID) { + break; + } + + docIn.seek(blockEndFP); + docCountUpto += BLOCK_SIZE; + } else { + level0LastDocID = NO_MORE_DOCS; + break; + } + } + } + + @Override + public int advance(int target) throws IOException { + if (target > level0LastDocID) { // advance level 0 skip data + + if (target > level1LastDocID) { // advance level 1 skip data + skipLevel1To(target); + } + + skipLevel0To(target); + + refillDocs(); + } + + int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1); + this.freq = freqBuffer[next]; + this.docBufferUpto = next + 1; + position = 0; + lastStartOffset = 0; + + return this.doc = docBuffer[next]; + } + + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + // if (DEBUG) { + // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + // } + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + if (indexHasPayloads) { + payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end); + } + posBufferUpto = end; + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + PForUtil.skip(posIn); + + if (indexHasPayloads) { + // Skip payloadLength block: + PForUtil.skip(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + PForUtil.skip(payIn); + PForUtil.skip(payIn); + } + toSkip -= BLOCK_SIZE; + } + refillPositions(); + payloadByteUpto = 0; + if (indexHasPayloads) { + payloadByteUpto += sumOverRange(payloadLengthBuffer, 0, toSkip); + } + posBufferUpto = toSkip; + } + + position = 0; + lastStartOffset = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + payloadLengthBuffer[i] = payloadLength; + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + 
payloadLength); + } + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); + } + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; + } + } + payloadByteUpto = 0; + } else { + pforUtil.decode(posInUtil, posDeltaBuffer); + + if (indexHasPayloads) { + if (needsPayloads) { + pforUtil.decode(payInUtil, payloadLengthBuffer); + int numBytes = payIn.readVInt(); + + if (numBytes > payloadBytes.length) { + payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes); + } + payIn.readBytes(payloadBytes, 0, numBytes); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + PForUtil.skip(payIn); // skip over lengths + int numBytes = payIn.readVInt(); // read length of payloadBytes + payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes + } + payloadByteUpto = 0; + } + + if (indexHasOffsets) { + if (needsOffsets) { + pforUtil.decode(payInUtil, offsetStartDeltaBuffer); + pforUtil.decode(payInUtil, offsetLengthBuffer); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + PForUtil.skip(payIn); // skip over starts + PForUtil.skip(payIn); // skip over lengths + } + } + } + } + + @Override + public int nextPosition() throws IOException { + assert posPendingCount > 0; + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto]; + + if (indexHasPayloads) { + payloadLength = payloadLengthBuffer[posBufferUpto]; + payload.bytes = payloadBytes; + payload.offset = payloadByteUpto; + payload.length = payloadLength; + payloadByteUpto += payloadLength; + } + + if (indexHasOffsets) { + startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; + endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; + lastStartOffset = startOffset; + } + + posBufferUpto++; + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return startOffset; + } + + @Override + public int endOffset() { + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (payloadLength == 0) { + return null; + } else { + return payload; + } + } + + @Override + public long cost() { + return docFreq; + } + } + + private abstract class BlockImpactsEnum extends ImpactsEnum { + + protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + protected final PForUtil pforUtil = new PForUtil(); + + protected final int[] docBuffer = new int[BLOCK_SIZE + 1]; + protected final int[] freqBuffer = new int[BLOCK_SIZE]; + + protected final int docFreq; // number of docs in this posting list + + protected final IndexInput docIn; + protected final PostingDecodingUtil docInUtil; + + protected int docCountUpto; // number of docs in or before the current block + protected int doc = -1; // doc we last read + protected int prevDocID = -1; // last doc ID of the previous block + protected int docBufferSize = BLOCK_SIZE; + protected int docBufferUpto = BLOCK_SIZE; + + // true if we shallow-advanced to a new block that we have not decoded yet + protected boolean needsRefilling; + + // level 0 skip data + protected int level0LastDocID = -1; + protected long level0DocEndFP; + 
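Aside: the level0SerializedImpacts / level1SerializedImpacts fields that follow hold the per-level impact lists in serialized form; readImpacts further down decodes them as (freq delta, optional norm delta) pairs, where the low bit of the freq delta says whether an explicit norm delta follows. A sketch of just that delta logic, applied to already-decoded integers (the real code reads them as vInt and variable-length zigzag long, readZLong, from the serialized bytes):

```java
import java.util.ArrayList;
import java.util.List;

/** Sketch of the impact-list delta decoding used by readImpacts(...), over pre-decoded ints. */
public class ImpactDecodeSketch {
  record Impact(int freq, long norm) {}

  // freqDeltas[i] carries the delta in its high bits and a flag in its low bit;
  // normDeltas[i] is only consulted when that flag is set (otherwise it is ignored).
  static List<Impact> decode(int[] freqDeltas, long[] normDeltas) {
    List<Impact> impacts = new ArrayList<>();
    int freq = 0;
    long norm = 0;
    for (int i = 0; i < freqDeltas.length; i++) {
      int freqDelta = freqDeltas[i];
      freq += 1 + (freqDelta >>> 1);
      if ((freqDelta & 0x01) != 0) {
        norm += 1 + normDeltas[i]; // explicit norm delta follows the freq delta
      } else {
        norm++;                    // implicit: norm increases by exactly one
      }
      impacts.add(new Impact(freq, norm));
    }
    return impacts;
  }

  public static void main(String[] args) {
    // (freqDelta=5 with flag bit set, normDelta=3), then (freqDelta=4 with flag bit clear)
    System.out.println(decode(new int[] {(5 << 1) | 1, 4 << 1}, new long[] {3, 0}));
    // prints: [Impact[freq=6, norm=4], Impact[freq=11, norm=5]]
  }
}
```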
protected final BytesRef level0SerializedImpacts; + protected final MutableImpactList level0Impacts; + // level 1 skip data + protected int level1LastDocID; + protected long level1DocEndFP; + protected int level1DocCountUpto = 0; + protected final BytesRef level1SerializedImpacts; + protected final MutableImpactList level1Impacts; + + private BlockImpactsEnum(IntBlockTermState termState) throws IOException { + this.docFreq = termState.docFreq; + this.docIn = Lucene101PostingsReader.this.docIn.clone(); + this.docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); + prefetchPostings(docIn, termState); + level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); + level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); + level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); + level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); + if (docFreq < LEVEL1_NUM_DOCS) { + level1LastDocID = NO_MORE_DOCS; + if (docFreq > 1) { + docIn.seek(termState.docStartFP); + } + } else { + level1LastDocID = -1; + level1DocEndFP = termState.docStartFP; + } + // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in + // advance() + docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public BytesRef getPayload() { + return null; + } + + @Override + public long cost() { + return docFreq; + } + + private final Impacts impacts = + new Impacts() { + + private final ByteArrayDataInput scratch = new ByteArrayDataInput(); + + @Override + public int numLevels() { + return level1LastDocID == NO_MORE_DOCS ? 1 : 2; + } + + @Override + public int getDocIdUpTo(int level) { + if (level == 0) { + return level0LastDocID; + } + return level == 1 ? 
level1LastDocID : NO_MORE_DOCS; + } + + @Override + public List getImpacts(int level) { + if (level == 0 && level0LastDocID != NO_MORE_DOCS) { + return readImpacts(level0SerializedImpacts, level0Impacts); + } + if (level == 1) { + return readImpacts(level1SerializedImpacts, level1Impacts); + } + return DUMMY_IMPACTS; + } + + private List readImpacts(BytesRef serialized, MutableImpactList impactsList) { + var scratch = this.scratch; + scratch.reset(serialized.bytes, 0, serialized.length); + Lucene101PostingsReader.readImpacts(scratch, impactsList); + return impactsList; + } + }; + + @Override + public Impacts getImpacts() { + return impacts; + } + } + + final class BlockImpactsDocsEnum extends BlockImpactsEnum { + final boolean indexHasPos; + + private long freqFP; + + public BlockImpactsDocsEnum(boolean indexHasPos, IntBlockTermState termState) + throws IOException { + super(termState); + this.indexHasPos = indexHasPos; + freqFP = -1; + } + + @Override + public int freq() throws IOException { + if (freqFP != -1) { + docIn.seek(freqFP); + pforUtil.decode(docInUtil, freqBuffer); + freqFP = -1; + } + return freqBuffer[docBufferUpto - 1]; + } + + @Override + public int nextPosition() { + return -1; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docCountUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); + freqFP = docIn.getFilePointer(); + PForUtil.skip(docIn); + docCountUpto += BLOCK_SIZE; + } else { + // Read vInts: + PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true); + prefixSum(docBuffer, left, prevDocID); + docBuffer[left] = NO_MORE_DOCS; + freqFP = -1; + docCountUpto += left; + docBufferSize = left; + } + prevDocID = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; + } + + private void skipLevel1To(int target) throws IOException { + while (true) { + prevDocID = level1LastDocID; + level0LastDocID = level1LastDocID; + docIn.seek(level1DocEndFP); + docCountUpto = level1DocCountUpto; + level1DocCountUpto += LEVEL1_NUM_DOCS; + + if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { + level1LastDocID = NO_MORE_DOCS; + break; + } + + level1LastDocID += docIn.readVInt(); + level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); + + if (level1LastDocID >= target) { + long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); + int numImpactBytes = docIn.readShort(); + docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); + level1SerializedImpacts.length = numImpactBytes; + assert indexHasPos || docIn.getFilePointer() == skip1EndFP; + docIn.seek(skip1EndFP); + break; + } + } + } + + private void skipLevel0To(int target) throws IOException { + while (true) { + prevDocID = level0LastDocID; + if (docFreq - docCountUpto >= BLOCK_SIZE) { + long skip0NumBytes = docIn.readVLong(); + // end offset of skip data (before the actual data starts) + long skip0End = docIn.getFilePointer() + skip0NumBytes; + int docDelta = readVInt15(docIn); + long blockLength = readVLong15(docIn); + + level0LastDocID += docDelta; + + if (target <= level0LastDocID) { + level0DocEndFP = docIn.getFilePointer() + blockLength; + int numImpactBytes = docIn.readVInt(); + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + docIn.seek(skip0End); + break; + } + + // skip block + docIn.skipBytes(blockLength); + docCountUpto += BLOCK_SIZE; + } else { + level0LastDocID = 
NO_MORE_DOCS; + break; + } + } + } + + @Override + public void advanceShallow(int target) throws IOException { + if (target > level0LastDocID) { // advance skip data on level 0 + if (target > level1LastDocID) { // advance skip data on level 1 + skipLevel1To(target); + } else if (needsRefilling) { + docIn.seek(level0DocEndFP); + docCountUpto += BLOCK_SIZE; + } + + skipLevel0To(target); + + needsRefilling = true; + } + } + + private void moveToNextLevel0Block() throws IOException { + if (doc == level1LastDocID) { + skipLevel1To(doc + 1); + } else if (needsRefilling) { + docIn.seek(level0DocEndFP); + docCountUpto += BLOCK_SIZE; + } + + prevDocID = level0LastDocID; + if (docFreq - docCountUpto >= BLOCK_SIZE) { + final long skip0Len = docIn.readVLong(); // skip len + final long skip0End = docIn.getFilePointer() + skip0Len; + final int docDelta = readVInt15(docIn); + final long blockLength = readVLong15(docIn); + level0LastDocID += docDelta; + level0DocEndFP = docIn.getFilePointer() + blockLength; + final int numImpactBytes = docIn.readVInt(); + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + docIn.seek(skip0End); + } else { + level0LastDocID = NO_MORE_DOCS; + } + + refillDocs(); + needsRefilling = false; + } + + @Override + public int nextDoc() throws IOException { + if (docBufferUpto == BLOCK_SIZE) { + if (needsRefilling) { + refillDocs(); + needsRefilling = false; + } else { + moveToNextLevel0Block(); + } + } + + return this.doc = docBuffer[docBufferUpto++]; + } + + @Override + public int advance(int target) throws IOException { + if (target > level0LastDocID || needsRefilling) { + advanceShallow(target); + refillDocs(); + needsRefilling = false; + } + + int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + this.doc = docBuffer[next]; + docBufferUpto = next + 1; + return doc; + } + } + + final class BlockImpactsPostingsEnum extends BlockImpactsEnum { + private final int[] posDeltaBuffer = new int[BLOCK_SIZE]; + + private int posBufferUpto; + final IndexInput posIn; + final PostingDecodingUtil posInUtil; + + final boolean indexHasFreq; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + final boolean indexHasOffsetsOrPayloads; + + private final long + totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private final long lastPosBlockFP; + + // level 0 skip data + private long level0PosEndFP; + private int level0BlockPosUpto; + // level 1 skip data + private long level1PosEndFP; + private int level1BlockPosUpto; + + private final int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) + throws IOException { + super(termState); + final IndexOptions options = fieldInfo.getIndexOptions(); + indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasOffsets = + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; + + this.posIn = Lucene101PostingsReader.this.posIn.clone(); + posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); + + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; + totalTermFreq = termState.totalTermFreq; + singletonDocID = termState.singletonDocID; + posIn.seek(posTermStartFP); + level1PosEndFP = posTermStartFP; + level0PosEndFP = posTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + level1BlockPosUpto = 0; + posBufferUpto = BLOCK_SIZE; + } + + @Override + public int freq() { + return freq; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docCountUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); + pforUtil.decode(docInUtil, freqBuffer); + docCountUpto += BLOCK_SIZE; + } else if (docFreq == 1) { + docBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; + docBuffer[1] = NO_MORE_DOCS; + docCountUpto++; + } else { + // Read vInts: + PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); + prefixSum(docBuffer, left, prevDocID); + docBuffer[left] = NO_MORE_DOCS; + docCountUpto += left; + docBufferSize = left; + } + prevDocID = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; + } + + private void skipLevel1To(int target) throws IOException { + while (true) { + prevDocID = level1LastDocID; + level0LastDocID = level1LastDocID; + docIn.seek(level1DocEndFP); + level0PosEndFP = level1PosEndFP; + level0BlockPosUpto = level1BlockPosUpto; + docCountUpto = level1DocCountUpto; + level1DocCountUpto += LEVEL1_NUM_DOCS; + + if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { + level1LastDocID = NO_MORE_DOCS; + break; + } + + level1LastDocID += docIn.readVInt(); + level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); + + long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); + int numImpactBytes = docIn.readShort(); + if (level1LastDocID >= target) { + docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); + level1SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); + } + level1PosEndFP += docIn.readVLong(); + level1BlockPosUpto = docIn.readByte(); + assert indexHasOffsetsOrPayloads || docIn.getFilePointer() == skip1EndFP; + + if (level1LastDocID >= target) { + docIn.seek(skip1EndFP); + break; + } + } + } + + 
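Aside: advancing always works top-down. Only when the target doc exceeds the last doc covered by the current level-1 entry (4,096 docs) are level-1 entries walked, and only then are the 128-doc level-0 blocks inside the chosen span walked. A rough array-based sketch of that two-level search; the real code streams the skip entries from the .doc file instead of materializing arrays:

```java
/** Toy two-level skip search; mirrors the target checks in skipLevel1To / skipLevel0To. */
public class TwoLevelSkipSketch {
  // lastDoc[i] = last doc ID covered by entry i at that level, in increasing order.
  // Assumes target <= the last doc ID of the posting list.
  static int findBlock(int[] level1LastDoc, int[][] level0LastDocPerGroup, int target) {
    int group = 0;
    while (level1LastDoc[group] < target) {   // level 1: jump 32 blocks (4,096 docs) at a time
      group++;
    }
    int[] level0LastDoc = level0LastDocPerGroup[group];
    int block = 0;
    while (level0LastDoc[block] < target) {   // level 0: jump one 128-doc block at a time
      block++;
    }
    return group * level0LastDoc.length + block; // global block index that may contain target
  }

  public static void main(String[] args) {
    // Tiny example with 3 blocks per group instead of the real 32, for brevity.
    int[] level1LastDoc = {4000, 9000, 15000};
    int[][] level0LastDoc = {
      {900, 2100, 4000},     // blocks of group 0
      {5200, 7700, 9000},    // blocks of group 1
      {11000, 13500, 15000}, // blocks of group 2
    };
    System.out.println(findBlock(level1LastDoc, level0LastDoc, 7000)); // 4 (group 1, block 1)
  }
}
```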
private void skipLevel0To(int target) throws IOException { + while (true) { + prevDocID = level0LastDocID; + + // If nextBlockPosFP is less than the current FP, it means that the block of positions for + // the first docs of the next block are already decoded. In this case we just accumulate + // frequencies into posPendingCount instead of seeking backwards and decoding the same pos + // block again. + if (level0PosEndFP >= posIn.getFilePointer()) { + posIn.seek(level0PosEndFP); + posPendingCount = level0BlockPosUpto; + posBufferUpto = BLOCK_SIZE; + } else { + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE); + } + + if (docFreq - docCountUpto >= BLOCK_SIZE) { + docIn.readVLong(); // skip0 num bytes + int docDelta = readVInt15(docIn); + long blockLength = readVLong15(docIn); + level0DocEndFP = docIn.getFilePointer() + blockLength; + + level0LastDocID += docDelta; + + if (target <= level0LastDocID) { + int numImpactBytes = docIn.readVInt(); + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + docIn.readVLong(); // pay fp delta + docIn.readVInt(); // pay upto + } + break; + } + // skip block + docIn.skipBytes(docIn.readVLong()); // impacts + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readVInt(); + docIn.seek(level0DocEndFP); + docCountUpto += BLOCK_SIZE; + } else { + level0LastDocID = NO_MORE_DOCS; + break; + } + } + } + + @Override + public void advanceShallow(int target) throws IOException { + if (target > level0LastDocID) { // advance level 0 skip data + + if (target > level1LastDocID) { // advance skip data on level 1 + skipLevel1To(target); + } else if (needsRefilling) { + docIn.seek(level0DocEndFP); + docCountUpto += BLOCK_SIZE; + } + + skipLevel0To(target); + + needsRefilling = true; + } + } + + @Override + public int nextDoc() throws IOException { + if (docBufferUpto == BLOCK_SIZE) { + advanceShallow(doc + 1); + assert needsRefilling; + refillDocs(); + needsRefilling = false; + } + + doc = docBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + position = 0; + return this.doc; + } + + @Override + public int advance(int target) throws IOException { + advanceShallow(target); + if (needsRefilling) { + refillDocs(); + needsRefilling = false; + } + + int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1); + freq = freqBuffer[next]; + docBufferUpto = next + 1; + position = 0; + return this.doc = docBuffer[next]; + } + + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + // if (DEBUG) { + // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + // } + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + posBufferUpto += toSkip; + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + PForUtil.skip(posIn); + toSkip -= BLOCK_SIZE; + } + refillPositions(); + posBufferUpto = toSkip; + } + + position = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + for (int i = 0; i < count; i++) { + int code = 
posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + posIn.skipBytes(payloadLength); + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + posIn.readVInt(); // offset length + } + } + } + } else { + pforUtil.decode(posInUtil, posDeltaBuffer); + } + } + + @Override + public int nextPosition() throws IOException { + assert posPendingCount > 0; + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto]; + + posBufferUpto++; + posPendingCount--; + return position; + } + } + + /** + * @see Lucene101PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int) + */ + static int readVInt15(DataInput in) throws IOException { + short s = in.readShort(); + if (s >= 0) { + return s; + } else { + return (s & 0x7FFF) | (in.readVInt() << 15); + } + } + + /** + * @see Lucene101PostingsWriter#writeVLong15(org.apache.lucene.store.DataOutput, long) + */ + static long readVLong15(DataInput in) throws IOException { + short s = in.readShort(); + if (s >= 0) { + return s; + } else { + return (s & 0x7FFFL) | (in.readVLong() << 15); + } + } + + private static void prefetchPostings(IndexInput docIn, IntBlockTermState state) + throws IOException { + assert state.docFreq > 1; // Singletons are inlined in the terms dict, nothing to prefetch + if (docIn.getFilePointer() != state.docStartFP) { + // Don't prefetch if the input is already positioned at the right offset, which suggests that + // the caller is streaming the entire inverted index (e.g. for merging), let the read-ahead + // logic do its work instead. Note that this heuristic doesn't work for terms that have skip + // data, since skip data is stored after the last term, but handling all terms that have <128 + // docs is a good start already. + docIn.prefetch(state.docStartFP, 1); + } + // Note: we don't prefetch positions or offsets, which are less likely to be needed. 
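Aside: readVInt15 above pairs with a writer-side writeVInt15 (see the @see tags). Values that fit in 15 bits occupy a single non-negative short; larger values set the short's sign bit, keep the low 15 bits in the short, and spill the remaining high bits into a following vInt. The sketch below round-trips that layout; the writer side is inferred from the reader, and the byte-level plumbing (java.io streams, big-endian shorts) is simplified relative to Lucene's DataOutput:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

/** Round-trip sketch of the vInt15 layout decoded by readVInt15(DataInput). */
public class VInt15Sketch {
  static void writeVInt(DataOutputStream out, int v) throws IOException {
    while ((v & ~0x7F) != 0) {          // standard 7-bits-per-byte varint
      out.writeByte((v & 0x7F) | 0x80);
      v >>>= 7;
    }
    out.writeByte(v);
  }

  static int readVInt(DataInputStream in) throws IOException {
    int v = 0;
    for (int shift = 0; ; shift += 7) {
      int b = in.readUnsignedByte();
      v |= (b & 0x7F) << shift;
      if ((b & 0x80) == 0) {
        return v;
      }
    }
  }

  static void writeVInt15(DataOutputStream out, int v) throws IOException {
    if (v < (1 << 15)) {
      out.writeShort((short) v);                       // sign bit clear: the short is the value
    } else {
      out.writeShort((short) (0x8000 | (v & 0x7FFF))); // sign bit set: low 15 bits here...
      writeVInt(out, v >>> 15);                        // ...high bits follow as a vInt
    }
  }

  static int readVInt15(DataInputStream in) throws IOException {
    short s = in.readShort();
    return s >= 0 ? s : (s & 0x7FFF) | (readVInt(in) << 15);
  }

  public static void main(String[] args) throws IOException {
    for (int v : new int[] {0, 127, 32767, 32768, 1_000_000}) {
      ByteArrayOutputStream bytes = new ByteArrayOutputStream();
      writeVInt15(new DataOutputStream(bytes), v);
      int back = readVInt15(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
      System.out.println(v + " -> " + bytes.size() + " bytes -> " + back);
    }
  }
}
```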
+ } + + static class MutableImpactList extends AbstractList implements RandomAccess { + int length; + final Impact[] impacts; + + MutableImpactList(int capacity) { + impacts = new Impact[capacity]; + for (int i = 0; i < capacity; ++i) { + impacts[i] = new Impact(Integer.MAX_VALUE, 1L); + } + } + + @Override + public Impact get(int index) { + return impacts[index]; + } + + @Override + public int size() { + return length; + } + } + + static MutableImpactList readImpacts(ByteArrayDataInput in, MutableImpactList reuse) { + int freq = 0; + long norm = 0; + int length = 0; + while (in.getPosition() < in.length()) { + int freqDelta = in.readVInt(); + if ((freqDelta & 0x01) != 0) { + freq += 1 + (freqDelta >>> 1); + try { + norm += 1 + in.readZLong(); + } catch (IOException e) { + throw new RuntimeException(e); // cannot happen on a BADI + } + } else { + freq += 1 + (freqDelta >>> 1); + norm++; + } + Impact impact = reuse.impacts[length]; + impact.freq = freq; + impact.norm = norm; + length++; + } + reuse.length = length; + return reuse; + } + + @Override + public void checkIntegrity() throws IOException { + if (docIn != null) { + CodecUtil.checksumEntireFile(docIn); + } + if (posIn != null) { + CodecUtil.checksumEntireFile(posIn); + } + if (payIn != null) { + CodecUtil.checksumEntireFile(payIn); + } + } + + @Override + public String toString() { + return getClass().getSimpleName() + + "(positions=" + + (posIn != null) + + ",payloads=" + + (payIn != null) + + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java new file mode 100644 index 000000000000..788a5515f2d1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java @@ -0,0 +1,681 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene101; + +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_MASK; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.PushPostingsWriterBase; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; + +/** Writer for {@link Lucene101PostingsFormat}. */ +public class Lucene101PostingsWriter extends PushPostingsWriterBase { + + static final IntBlockTermState EMPTY_STATE = new IntBlockTermState(); + + IndexOutput metaOut; + IndexOutput docOut; + IndexOutput posOut; + IndexOutput payOut; + + IntBlockTermState lastState; + + // Holds starting file pointers for current term: + private long docStartFP; + private long posStartFP; + private long payStartFP; + + final int[] docDeltaBuffer; + final int[] freqBuffer; + private int docBufferUpto; + + final int[] posDeltaBuffer; + final int[] payloadLengthBuffer; + final int[] offsetStartDeltaBuffer; + final int[] offsetLengthBuffer; + private int posBufferUpto; + + private byte[] payloadBytes; + private int payloadByteUpto; + + private int level0LastDocID; + private long level0LastPosFP; + private long level0LastPayFP; + + private int level1LastDocID; + private long level1LastPosFP; + private long level1LastPayFP; + + private int docID; + private int lastDocID; + private int lastPosition; + private int lastStartOffset; + private int docCount; + + private final PForUtil pforUtil; + private final ForDeltaUtil forDeltaUtil; + + private boolean fieldHasNorms; + private NumericDocValues norms; + private final CompetitiveImpactAccumulator level0FreqNormAccumulator = + new CompetitiveImpactAccumulator(); + private final CompetitiveImpactAccumulator level1CompetitiveFreqNormAccumulator = + new CompetitiveImpactAccumulator(); + + private int maxNumImpactsAtLevel0; + private int maxImpactNumBytesAtLevel0; + private int maxNumImpactsAtLevel1; + private int maxImpactNumBytesAtLevel1; + + /** Scratch output that we use to be able to prepend the encoded length, e.g. impacts. 
*/ + private final ByteBuffersDataOutput scratchOutput = ByteBuffersDataOutput.newResettableInstance(); + + /** + * Output for a single block. This is useful to be able to prepend skip data before each block, + * which can only be computed once the block is encoded. The content is then typically copied to + * {@link #level1Output}. + */ + private final ByteBuffersDataOutput level0Output = ByteBuffersDataOutput.newResettableInstance(); + + /** + * Output for groups of 32 blocks. This is useful to prepend skip data for these 32 blocks, which + * can only be done once we have encoded these 32 blocks. The content is then typically copied to + * {@link #docCount}. + */ + private final ByteBuffersDataOutput level1Output = ByteBuffersDataOutput.newResettableInstance(); + + /** Sole constructor. */ + public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { + String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.META_EXTENSION); + String docFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.DOC_EXTENSION); + metaOut = state.directory.createOutput(metaFileName, state.context); + IndexOutput posOut = null; + IndexOutput payOut = null; + boolean success = false; + try { + docOut = state.directory.createOutput(docFileName, state.context); + CodecUtil.writeIndexHeader( + metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader( + docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + forDeltaUtil = new ForDeltaUtil(); + pforUtil = new PForUtil(); + if (state.fieldInfos.hasProx()) { + posDeltaBuffer = new int[BLOCK_SIZE]; + String posFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.POS_EXTENSION); + posOut = state.directory.createOutput(posFileName, state.context); + CodecUtil.writeIndexHeader( + posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + + if (state.fieldInfos.hasPayloads()) { + payloadBytes = new byte[128]; + payloadLengthBuffer = new int[BLOCK_SIZE]; + } else { + payloadBytes = null; + payloadLengthBuffer = null; + } + + if (state.fieldInfos.hasOffsets()) { + offsetStartDeltaBuffer = new int[BLOCK_SIZE]; + offsetLengthBuffer = new int[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + } + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene101PostingsFormat.PAY_EXTENSION); + payOut = state.directory.createOutput(payFileName, state.context); + CodecUtil.writeIndexHeader( + payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + } + } else { + posDeltaBuffer = null; + payloadLengthBuffer = null; + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + payloadBytes = null; + } + this.payOut = payOut; + this.posOut = posOut; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut); + } + } + + docDeltaBuffer = new int[BLOCK_SIZE]; + freqBuffer = new int[BLOCK_SIZE]; + } + + @Override + public IntBlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { + 
CodecUtil.writeIndexHeader( + termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + termsOut.writeVInt(BLOCK_SIZE); + } + + @Override + public void setField(FieldInfo fieldInfo) { + super.setField(fieldInfo); + lastState = EMPTY_STATE; + fieldHasNorms = fieldInfo.hasNorms(); + } + + @Override + public void startTerm(NumericDocValues norms) { + docStartFP = docOut.getFilePointer(); + if (writePositions) { + posStartFP = posOut.getFilePointer(); + level1LastPosFP = level0LastPosFP = posStartFP; + if (writePayloads || writeOffsets) { + payStartFP = payOut.getFilePointer(); + level1LastPayFP = level0LastPayFP = payStartFP; + } + } + lastDocID = -1; + level0LastDocID = -1; + level1LastDocID = -1; + this.norms = norms; + if (writeFreqs) { + level0FreqNormAccumulator.clear(); + } + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + if (docBufferUpto == BLOCK_SIZE) { + flushDocBlock(false); + docBufferUpto = 0; + } + + final int docDelta = docID - lastDocID; + + if (docID < 0 || docDelta <= 0) { + throw new CorruptIndexException( + "docs out of order (" + docID + " <= " + lastDocID + " )", docOut); + } + + docDeltaBuffer[docBufferUpto] = docDelta; + if (writeFreqs) { + freqBuffer[docBufferUpto] = termDocFreq; + } + + this.docID = docID; + lastPosition = 0; + lastStartOffset = 0; + + if (writeFreqs) { + long norm; + if (fieldHasNorms) { + boolean found = norms.advanceExact(docID); + if (found == false) { + // This can happen if indexing hits a problem after adding a doc to the + // postings but before buffering the norm. Such documents are written + // deleted and will go away on the first merge. + norm = 1L; + } else { + norm = norms.longValue(); + assert norm != 0 : docID; + } + } else { + norm = 1L; + } + + level0FreqNormAccumulator.add(termDocFreq, norm); + } + } + + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) + throws IOException { + if (position > IndexWriter.MAX_POSITION) { + throw new CorruptIndexException( + "position=" + + position + + " is too large (> IndexWriter.MAX_POSITION=" + + IndexWriter.MAX_POSITION + + ")", + docOut); + } + if (position < 0) { + throw new CorruptIndexException("position=" + position + " is < 0", docOut); + } + posDeltaBuffer[posBufferUpto] = position - lastPosition; + if (writePayloads) { + if (payload == null || payload.length == 0) { + // no payload + payloadLengthBuffer[posBufferUpto] = 0; + } else { + payloadLengthBuffer[posBufferUpto] = payload.length; + if (payloadByteUpto + payload.length > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); + } + System.arraycopy( + payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); + payloadByteUpto += payload.length; + } + } + + if (writeOffsets) { + assert startOffset >= lastStartOffset; + assert endOffset >= startOffset; + offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset; + offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; + lastStartOffset = startOffset; + } + + posBufferUpto++; + lastPosition = position; + if (posBufferUpto == BLOCK_SIZE) { + pforUtil.encode(posDeltaBuffer, posOut); + + if (writePayloads) { + pforUtil.encode(payloadLengthBuffer, payOut); + payOut.writeVInt(payloadByteUpto); + payOut.writeBytes(payloadBytes, 0, payloadByteUpto); + payloadByteUpto = 0; + } + if (writeOffsets) { + pforUtil.encode(offsetStartDeltaBuffer, payOut); + 
pforUtil.encode(offsetLengthBuffer, payOut); + } + posBufferUpto = 0; + } + } + + @Override + public void finishDoc() { + docBufferUpto++; + docCount++; + + lastDocID = docID; + } + + /** + * Special vints that are encoded on 2 bytes if they require 15 bits or less. VInt becomes + * especially slow when the number of bytes is variable, so this special layout helps in the case + * when the number likely requires 15 bits or less + */ + static void writeVInt15(DataOutput out, int v) throws IOException { + assert v >= 0; + writeVLong15(out, v); + } + + /** + * @see #writeVInt15(DataOutput, int) + */ + static void writeVLong15(DataOutput out, long v) throws IOException { + assert v >= 0; + if ((v & ~0x7FFFL) == 0) { + out.writeShort((short) v); + } else { + out.writeShort((short) (0x8000 | (v & 0x7FFF))); + out.writeVLong(v >> 15); + } + } + + private void flushDocBlock(boolean finishTerm) throws IOException { + assert docBufferUpto != 0; + + if (docBufferUpto < BLOCK_SIZE) { + assert finishTerm; + PostingsUtil.writeVIntBlock( + level0Output, docDeltaBuffer, freqBuffer, docBufferUpto, writeFreqs); + } else { + if (writeFreqs) { + List impacts = level0FreqNormAccumulator.getCompetitiveFreqNormPairs(); + if (impacts.size() > maxNumImpactsAtLevel0) { + maxNumImpactsAtLevel0 = impacts.size(); + } + writeImpacts(impacts, scratchOutput); + assert level0Output.size() == 0; + if (scratchOutput.size() > maxImpactNumBytesAtLevel0) { + maxImpactNumBytesAtLevel0 = Math.toIntExact(scratchOutput.size()); + } + level0Output.writeVLong(scratchOutput.size()); + scratchOutput.copyTo(level0Output); + scratchOutput.reset(); + if (writePositions) { + level0Output.writeVLong(posOut.getFilePointer() - level0LastPosFP); + level0Output.writeByte((byte) posBufferUpto); + level0LastPosFP = posOut.getFilePointer(); + + if (writeOffsets || writePayloads) { + level0Output.writeVLong(payOut.getFilePointer() - level0LastPayFP); + level0Output.writeVInt(payloadByteUpto); + level0LastPayFP = payOut.getFilePointer(); + } + } + } + long numSkipBytes = level0Output.size(); + forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output); + if (writeFreqs) { + pforUtil.encode(freqBuffer, level0Output); + } + + // docID - lastBlockDocID is at least 128, so it can never fit a single byte with a vint + // Even if we subtracted 128, only extremely dense blocks would be eligible to a single byte + // so let's go with 2 bytes right away + writeVInt15(scratchOutput, docID - level0LastDocID); + writeVLong15(scratchOutput, level0Output.size()); + numSkipBytes += scratchOutput.size(); + level1Output.writeVLong(numSkipBytes); + scratchOutput.copyTo(level1Output); + scratchOutput.reset(); + } + + level0Output.copyTo(level1Output); + level0Output.reset(); + level0LastDocID = docID; + if (writeFreqs) { + level1CompetitiveFreqNormAccumulator.addAll(level0FreqNormAccumulator); + level0FreqNormAccumulator.clear(); + } + + if ((docCount & LEVEL1_MASK) == 0) { // true every 32 blocks (4,096 docs) + writeLevel1SkipData(); + level1LastDocID = docID; + level1CompetitiveFreqNormAccumulator.clear(); + } else if (finishTerm) { + level1Output.copyTo(docOut); + level1Output.reset(); + level1CompetitiveFreqNormAccumulator.clear(); + } + } + + private void writeLevel1SkipData() throws IOException { + docOut.writeVInt(docID - level1LastDocID); + final long level1End; + if (writeFreqs) { + List impacts = level1CompetitiveFreqNormAccumulator.getCompetitiveFreqNormPairs(); + if (impacts.size() > maxNumImpactsAtLevel1) { + maxNumImpactsAtLevel1 = impacts.size(); + } + 
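The writeVInt15/writeVLong15 helpers defined above store values of 15 bits or less in a single fixed-width short and spill any higher bits into a trailing vlong. A hedged sketch of a matching read side follows; the method name and its placement in the reader are assumptions, only the byte layout is taken from the writer above.

// Sketch of the decode side for writeVLong15 above; name and location are assumptions.
static long readVLong15(org.apache.lucene.store.DataInput in) throws java.io.IOException {
  short s = in.readShort();
  long v = s & 0x7FFFL;            // low 15 bits are stored in the short
  if ((s & 0x8000) != 0) {         // high bit set: the remaining bits follow as a regular vlong
    v |= in.readVLong() << 15;
  }
  return v;
}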
writeImpacts(impacts, scratchOutput); + long numImpactBytes = scratchOutput.size(); + if (numImpactBytes > maxImpactNumBytesAtLevel1) { + maxImpactNumBytesAtLevel1 = Math.toIntExact(numImpactBytes); + } + if (writePositions) { + scratchOutput.writeVLong(posOut.getFilePointer() - level1LastPosFP); + scratchOutput.writeByte((byte) posBufferUpto); + level1LastPosFP = posOut.getFilePointer(); + if (writeOffsets || writePayloads) { + scratchOutput.writeVLong(payOut.getFilePointer() - level1LastPayFP); + scratchOutput.writeVInt(payloadByteUpto); + level1LastPayFP = payOut.getFilePointer(); + } + } + final long level1Len = 2 * Short.BYTES + scratchOutput.size() + level1Output.size(); + docOut.writeVLong(level1Len); + level1End = docOut.getFilePointer() + level1Len; + // There are at most 128 impacts, that require at most 2 bytes each + assert numImpactBytes <= Short.MAX_VALUE; + // Like impacts plus a few vlongs, still way under the max short value + assert scratchOutput.size() + Short.BYTES <= Short.MAX_VALUE; + docOut.writeShort((short) (scratchOutput.size() + Short.BYTES)); + docOut.writeShort((short) numImpactBytes); + scratchOutput.copyTo(docOut); + scratchOutput.reset(); + } else { + docOut.writeVLong(level1Output.size()); + level1End = docOut.getFilePointer() + level1Output.size(); + } + level1Output.copyTo(docOut); + level1Output.reset(); + assert docOut.getFilePointer() == level1End : docOut.getFilePointer() + " " + level1End; + } + + static void writeImpacts(Collection impacts, DataOutput out) throws IOException { + Impact previous = new Impact(0, 0); + for (Impact impact : impacts) { + assert impact.freq > previous.freq; + assert Long.compareUnsigned(impact.norm, previous.norm) > 0; + int freqDelta = impact.freq - previous.freq - 1; + long normDelta = impact.norm - previous.norm - 1; + if (normDelta == 0) { + // most of time, norm only increases by 1, so we can fold everything in a single byte + out.writeVInt(freqDelta << 1); + } else { + out.writeVInt((freqDelta << 1) | 1); + out.writeZLong(normDelta); + } + previous = impact; + } + } + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(BlockTermState _state) throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + assert state.docFreq > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? + assert state.docFreq == docCount : state.docFreq + " vs " + docCount; + + // docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to + // it. + final int singletonDocID; + if (state.docFreq == 1) { + // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq + singletonDocID = docDeltaBuffer[0] - 1; + } else { + singletonDocID = -1; + flushDocBlock(true); + } + + final long lastPosBlockOffset; + + if (writePositions) { + // totalTermFreq is just total number of positions(or payloads, or offsets) + // associated with current term. + assert state.totalTermFreq != -1; + if (state.totalTermFreq > BLOCK_SIZE) { + // record file offset for last pos in last block + lastPosBlockOffset = posOut.getFilePointer() - posStartFP; + } else { + lastPosBlockOffset = -1; + } + if (posBufferUpto > 0) { + assert posBufferUpto < BLOCK_SIZE; + // TODO: should we send offsets/payloads to + // .pay...? 
seems wasteful (have to store extra + // vLong for low (< BLOCK_SIZE) DF terms = vast vast + // majority) + + // vInt encode the remaining positions/payloads/offsets: + int lastPayloadLength = -1; // force first payload length to be written + int lastOffsetLength = -1; // force first offset length to be written + int payloadBytesReadUpto = 0; + for (int i = 0; i < posBufferUpto; i++) { + final int posDelta = posDeltaBuffer[i]; + if (writePayloads) { + final int payloadLength = payloadLengthBuffer[i]; + if (payloadLength != lastPayloadLength) { + lastPayloadLength = payloadLength; + posOut.writeVInt((posDelta << 1) | 1); + posOut.writeVInt(payloadLength); + } else { + posOut.writeVInt(posDelta << 1); + } + + if (payloadLength != 0) { + posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength); + payloadBytesReadUpto += payloadLength; + } + } else { + posOut.writeVInt(posDelta); + } + + if (writeOffsets) { + int delta = offsetStartDeltaBuffer[i]; + int length = offsetLengthBuffer[i]; + if (length == lastOffsetLength) { + posOut.writeVInt(delta << 1); + } else { + posOut.writeVInt(delta << 1 | 1); + posOut.writeVInt(length); + lastOffsetLength = length; + } + } + } + + if (writePayloads) { + assert payloadBytesReadUpto == payloadByteUpto; + payloadByteUpto = 0; + } + } + } else { + lastPosBlockOffset = -1; + } + + state.docStartFP = docStartFP; + state.posStartFP = posStartFP; + state.payStartFP = payStartFP; + state.singletonDocID = singletonDocID; + + state.lastPosBlockOffset = lastPosBlockOffset; + docBufferUpto = 0; + posBufferUpto = 0; + lastDocID = -1; + docCount = 0; + } + + @Override + public void encodeTerm( + DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) + throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + if (absolute) { + lastState = EMPTY_STATE; + assert lastState.docStartFP == 0; + } + + if (lastState.singletonDocID != -1 + && state.singletonDocID != -1 + && state.docStartFP == lastState.docStartFP) { + // With runs of rare values such as ID fields, the increment of pointers in the docs file is + // often 0. + // Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we + // encode the delta + // between consecutive doc IDs to save space. + final long delta = (long) state.singletonDocID - lastState.singletonDocID; + out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01); + } else { + out.writeVLong((state.docStartFP - lastState.docStartFP) << 1); + if (state.singletonDocID != -1) { + out.writeVInt(state.singletonDocID); + } + } + + if (writePositions) { + out.writeVLong(state.posStartFP - lastState.posStartFP); + if (writePayloads || writeOffsets) { + out.writeVLong(state.payStartFP - lastState.payStartFP); + } + } + if (writePositions) { + if (state.lastPosBlockOffset != -1) { + out.writeVLong(state.lastPosBlockOffset); + } + } + lastState = state; + } + + @Override + public void close() throws IOException { + // TODO: add a finish() at least to PushBase? DV too...? 
+ boolean success = false; + try { + if (docOut != null) { + CodecUtil.writeFooter(docOut); + } + if (posOut != null) { + CodecUtil.writeFooter(posOut); + } + if (payOut != null) { + CodecUtil.writeFooter(payOut); + } + if (metaOut != null) { + metaOut.writeInt(maxNumImpactsAtLevel0); + metaOut.writeInt(maxImpactNumBytesAtLevel0); + metaOut.writeInt(maxNumImpactsAtLevel1); + metaOut.writeInt(maxImpactNumBytesAtLevel1); + metaOut.writeLong(docOut.getFilePointer()); + if (posOut != null) { + metaOut.writeLong(posOut.getFilePointer()); + if (payOut != null) { + metaOut.writeLong(payOut.getFilePointer()); + } + } + CodecUtil.writeFooter(metaOut); + } + success = true; + } finally { + if (success) { + IOUtils.close(metaOut, docOut, posOut, payOut); + } else { + IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut); + } + metaOut = docOut = posOut = payOut = null; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PForUtil.java new file mode 100644 index 000000000000..fd8ecd056b49 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PForUtil.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.LongHeap; +import org.apache.lucene.util.packed.PackedInts; + +/** Utility class to encode sequences of 128 small positive integers. */ +final class PForUtil { + + private static final int MAX_EXCEPTIONS = 7; + + static boolean allEqual(int[] l) { + for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) { + if (l[i] != l[0]) { + return false; + } + } + return true; + } + + private final ForUtil forUtil = new ForUtil(); + + static { + assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE; + } + + /** Encode 128 integers from {@code ints} into {@code out}. 
*/ + void encode(int[] ints, DataOutput out) throws IOException { + // Determine the top MAX_EXCEPTIONS + 1 values + final LongHeap top = new LongHeap(MAX_EXCEPTIONS + 1); + for (int i = 0; i <= MAX_EXCEPTIONS; ++i) { + top.push(ints[i]); + } + long topValue = top.top(); + for (int i = MAX_EXCEPTIONS + 1; i < ForUtil.BLOCK_SIZE; ++i) { + if (ints[i] > topValue) { + topValue = top.updateTop(ints[i]); + } + } + + long max = 0L; + for (int i = 1; i <= top.size(); ++i) { + max = Math.max(max, top.get(i)); + } + + final int maxBitsRequired = PackedInts.bitsRequired(max); + // We store the patch on a byte, so we can't decrease the number of bits required by more than 8 + final int patchedBitsRequired = + Math.max(PackedInts.bitsRequired(topValue), maxBitsRequired - 8); + int numExceptions = 0; + final long maxUnpatchedValue = (1L << patchedBitsRequired) - 1; + for (int i = 2; i <= top.size(); ++i) { + if (top.get(i) > maxUnpatchedValue) { + numExceptions++; + } + } + final byte[] exceptions = new byte[numExceptions * 2]; + if (numExceptions > 0) { + int exceptionCount = 0; + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + if (ints[i] > maxUnpatchedValue) { + exceptions[exceptionCount * 2] = (byte) i; + exceptions[exceptionCount * 2 + 1] = (byte) (ints[i] >>> patchedBitsRequired); + ints[i] &= maxUnpatchedValue; + exceptionCount++; + } + } + assert exceptionCount == numExceptions : exceptionCount + " " + numExceptions; + } + + if (allEqual(ints) && maxBitsRequired <= 8) { + for (int i = 0; i < numExceptions; ++i) { + exceptions[2 * i + 1] = + (byte) (Byte.toUnsignedLong(exceptions[2 * i + 1]) << patchedBitsRequired); + } + out.writeByte((byte) (numExceptions << 5)); + out.writeVInt(ints[0]); + } else { + final int token = (numExceptions << 5) | patchedBitsRequired; + out.writeByte((byte) token); + forUtil.encode(ints, patchedBitsRequired, out); + } + out.writeBytes(exceptions, exceptions.length); + } + + /** Decode 128 integers into {@code ints}. */ + void decode(PostingDecodingUtil pdu, int[] ints) throws IOException { + var in = pdu.in; + final int token = Byte.toUnsignedInt(in.readByte()); + final int bitsPerValue = token & 0x1f; + if (bitsPerValue == 0) { + Arrays.fill(ints, 0, ForUtil.BLOCK_SIZE, in.readVInt()); + } else { + forUtil.decode(bitsPerValue, pdu, ints); + } + final int numExceptions = token >>> 5; + for (int i = 0; i < numExceptions; ++i) { + ints[Byte.toUnsignedInt(in.readByte())] |= Byte.toUnsignedLong(in.readByte()) << bitsPerValue; + } + } + + /** Skip 128 integers. */ + static void skip(DataInput in) throws IOException { + final int token = Byte.toUnsignedInt(in.readByte()); + final int bitsPerValue = token & 0x1f; + final int numExceptions = token >>> 5; + if (bitsPerValue == 0) { + in.readVLong(); + in.skipBytes((numExceptions << 1)); + } else { + in.skipBytes(ForUtil.numBytes(bitsPerValue) + (numExceptions << 1)); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingIndexInput.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingIndexInput.java new file mode 100644 index 000000000000..59388e3446b9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingIndexInput.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
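PForUtil above packs each 128-value block with just enough bits for the "patched" maximum and stores up to 7 outliers as (position, high-bits) byte pairs behind a token byte of the form (numExceptions << 5) | bitsPerValue. The following is a simplified standalone sketch of that token selection on a made-up block; the actual bit packing of the patched values is omitted.

// Simplified sketch (not part of the patch) of PForUtil's token and exception selection;
// the real encode additionally bit-packs the patched values. Block contents are made up.
import java.util.Arrays;

public class PForTokenDemo {

  static int bitsRequired(int v) {  // plain-Java stand-in for PackedInts.bitsRequired
    return v == 0 ? 0 : 32 - Integer.numberOfLeadingZeros(v);
  }

  public static void main(String[] args) {
    int[] block = new int[128];
    Arrays.fill(block, 3);          // most values fit in 2 bits...
    block[17] = 700;                // ...but a couple of outliers would force 10 bits
    block[90] = 1000;

    int[] sorted = block.clone();
    Arrays.sort(sorted);
    int maxBits = bitsRequired(sorted[127]);        // bits needed without patching
    int topValue = sorted[120];                     // 8th largest: at most 7 values sit above it
    int patchedBits = Math.max(bitsRequired(topValue), maxBits - 8);
    long maxUnpatched = (1L << patchedBits) - 1;

    int numExceptions = 0;
    for (int v : block) {
      if (v > maxUnpatched) {
        numExceptions++;            // stored as (index, v >>> patchedBits) byte pairs after the block
      }
    }
    int token = (numExceptions << 5) | patchedBits;
    System.out.println("bitsPerValue=" + patchedBits + " exceptions=" + numExceptions + " token=" + token);
  }
}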
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.internal.vectorization.VectorizationProvider; +import org.apache.lucene.store.IndexInput; + +/** + * Wrapper around an {@link IndexInput} and a {@link ForUtil} that optionally optimizes decoding + * using vectorization. This class mostly exists to enable benchmarking the decoding logic of + * postings since it internally calls code that may only be called from the lucene-core JAR. + */ +public final class PostingIndexInput { + + private static final VectorizationProvider VECTORIZATION_PROVIDER = + VectorizationProvider.getInstance(); + + public final ForUtil forUtil; + public final ForDeltaUtil forDeltaUtil; + private final PostingDecodingUtil postingDecodingUtil; + + public PostingIndexInput(IndexInput in, ForUtil forUtil, ForDeltaUtil forDeltaUtil) + throws IOException { + this.forUtil = forUtil; + this.forDeltaUtil = forDeltaUtil; + this.postingDecodingUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); + } + + /** Decode 128 integers stored on {@code bitsPerValues} bits per value into {@code ints}. */ + public void decode(int bitsPerValue, int[] ints) throws IOException { + forUtil.decode(bitsPerValue, postingDecodingUtil, ints); + } + + /** + * Decode 128 integers stored on {@code bitsPerValues} bits per value, compute their prefix sum, + * and store results into {@code ints}. + */ + public void decodeAndPrefixSum(int bitsPerValue, int base, int[] ints) throws IOException { + forDeltaUtil.decodeAndPrefixSum(bitsPerValue, postingDecodingUtil, base, ints); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingsUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingsUtil.java new file mode 100644 index 000000000000..34431a3689fb --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingsUtil.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.GroupVIntUtil; + +/** Utility class to encode/decode postings block. */ +final class PostingsUtil { + + /** + * Read values that have been written using variable-length encoding and group-varint encoding + * instead of bit-packing. + */ + static void readVIntBlock( + IndexInput docIn, + int[] docBuffer, + int[] freqBuffer, + int num, + boolean indexHasFreq, + boolean decodeFreq) + throws IOException { + GroupVIntUtil.readGroupVInts(docIn, docBuffer, num); + if (indexHasFreq && decodeFreq) { + for (int i = 0; i < num; ++i) { + freqBuffer[i] = docBuffer[i] & 0x01; + docBuffer[i] >>>= 1; + if (freqBuffer[i] == 0) { + freqBuffer[i] = docIn.readVInt(); + } + } + } else if (indexHasFreq) { + for (int i = 0; i < num; ++i) { + docBuffer[i] >>>= 1; + } + } + } + + /** Write freq buffer with variable-length encoding and doc buffer with group-varint encoding. */ + static void writeVIntBlock( + DataOutput docOut, int[] docBuffer, int[] freqBuffer, int num, boolean writeFreqs) + throws IOException { + if (writeFreqs) { + for (int i = 0; i < num; i++) { + docBuffer[i] = (docBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0); + } + } + docOut.writeGroupVInts(docBuffer, num); + if (writeFreqs) { + for (int i = 0; i < num; i++) { + final int freq = freqBuffer[i]; + if (freq != 1) { + docOut.writeVInt(freq); + } + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py new file mode 100644 index 000000000000..240c66530ae7 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py @@ -0,0 +1,377 @@ +#! /usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import gcd + +"""Code generation for ForDeltaUtil.java""" + +MAX_SPECIALIZED_BITS_PER_VALUE = 24 +OUTPUT_FILE = "ForDeltaUtil.java" +PRIMITIVE_SIZE = [8, 16, 32] +HEADER = """// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
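PostingsUtil above folds the very common freq == 1 case into the low bit of each doc delta so that short tail blocks avoid a second vint per document. Below is a standalone sketch of that folding with plain arrays and invented values; the group-varint step used by the real code is left out.

// Sketch (not part of the patch) of the doc-delta/freq folding done by writeVIntBlock and
// undone by readVIntBlock above; the real code also group-varint encodes the folded deltas.
public class VIntBlockFoldDemo {
  public static void main(String[] args) {
    int[] docDeltas = {1, 5, 2};   // invented deltas for a short tail block
    int[] freqs = {1, 3, 1};

    // Encode: low bit 1 means "freq == 1, nothing else to write for this doc".
    for (int i = 0; i < docDeltas.length; i++) {
      int folded = (docDeltas[i] << 1) | (freqs[i] == 1 ? 1 : 0);
      if (freqs[i] == 1) {
        System.out.println("write " + folded);
      } else {
        System.out.println("write " + folded + " then vint freq " + freqs[i]);
      }
    }

    // Decode: shift the delta back out; a clear low bit means a freq vint follows.
    int[] written = {(1 << 1) | 1, (5 << 1), (2 << 1) | 1};
    for (int folded : written) {
      int delta = folded >>> 1;
      if ((folded & 1) != 0) {
        System.out.println("delta=" + delta + " freq=1");
      } else {
        System.out.println("delta=" + delta + " freq=<next vint>");
      }
    }
  }
}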
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.packed.PackedInts; + +import static org.apache.lucene.codecs.lucene101.ForUtil.*; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ + * Encodes multiple integers in a Java int to get SIMD-like speedups. + * If bitsPerValue <= 4 then we pack 4 ints per Java int + * else if bitsPerValue <= 11 we pack 2 ints per Java int + * else we use scalar operations. + */ +public final class ForDeltaUtil { + + private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2; + private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4; + private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; + private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; + + // IDENTITY_PLUS_ONE[i] == i+1 + private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE]; + + static { + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + IDENTITY_PLUS_ONE[i] = i + 1; + } + } + + private static void prefixSumOfOnes(int[] arr, int base) { + System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); + // This loop gets auto-vectorized + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + arr[i] += base; + } + } + + private static void prefixSum8(int[] arr, int base) { + // When the number of bits per value is 4 or less, we can sum up all values in a block without + // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4 + // values at once. + innerPrefixSum8(arr); + expand8(arr); + final int l0 = base; + final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1]; + final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1]; + final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1]; + + for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) { + arr[i] += l0; + arr[ONE_BLOCK_SIZE_FOURTH + i] += l1; + arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2; + arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3; + } + } + + private static void prefixSum16(int[] arr, int base) { + // When the number of bits per value is 11 or less, we can sum up all values in a block without + // risking overflowing an 16-bits integer. This allows computing the prefix sum by summing up 2 + // values at once. 
+ innerPrefixSum16(arr); + expand16(arr); + final int l0 = base; + final int l1 = base + arr[HALF_BLOCK_SIZE - 1]; + for (int i = 0; i < HALF_BLOCK_SIZE; ++i) { + arr[i] += l0; + arr[HALF_BLOCK_SIZE + i] += l1; + } + } + + private static void prefixSum32(int[] arr, int base) { + arr[0] += base; + for (int i = 1; i < BLOCK_SIZE; ++i) { + arr[i] += arr[i-1]; + } + } + + // For some reason unrolling seems to help + private static void innerPrefixSum8(int[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + } + + // For some reason unrolling seems to help + private static void innerPrefixSum16(int[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + arr[32] += arr[31]; + arr[33] += arr[32]; + arr[34] += arr[33]; + arr[35] += arr[34]; + arr[36] += arr[35]; + arr[37] += arr[36]; + arr[38] += arr[37]; + arr[39] += arr[38]; + arr[40] += arr[39]; + arr[41] += arr[40]; + arr[42] += arr[41]; + arr[43] += arr[42]; + arr[44] += arr[43]; + arr[45] += arr[44]; + arr[46] += arr[45]; + arr[47] += arr[46]; + arr[48] += arr[47]; + arr[49] += arr[48]; + arr[50] += arr[49]; + arr[51] += arr[50]; + arr[52] += arr[51]; + arr[53] += arr[52]; + arr[54] += arr[53]; + arr[55] += arr[54]; + arr[56] += arr[55]; + arr[57] += arr[56]; + arr[58] += arr[57]; + arr[59] += arr[58]; + arr[60] += arr[59]; + arr[61] += arr[60]; + arr[62] += arr[61]; + arr[63] += arr[62]; + } + + private final int[] tmp = new int[BLOCK_SIZE]; + + /** + * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code + * ints} are expected to be deltas between consecutive values. + */ + void encodeDeltas(int[] ints, DataOutput out) throws IOException { + if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings + out.writeByte((byte) 0); + } else { + int or = 0; + for (int l : ints) { + or |= l; + } + assert or != 0; + final int bitsPerValue = PackedInts.bitsRequired(or); + out.writeByte((byte) bitsPerValue); + + final int primitiveSize; + if (bitsPerValue <= 3) { + primitiveSize = 8; + collapse8(ints); + } else if (bitsPerValue <= 10) { + primitiveSize = 16; + collapse16(ints); + } else { + primitiveSize = 32; + } + encode(ints, bitsPerValue, primitiveSize, out, tmp); + } + } + + /** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. 
*/ + void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException { + final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte()); + if (bitsPerValue == 0) { + prefixSumOfOnes(ints, base); + } else { + decodeAndPrefixSum(bitsPerValue, pdu, base, ints); + } + } + +""" + +def primitive_size_for_bpv(bpv): + if bpv <= 3: + # If we have 4 bits per value or less then we can compute the prefix sum of 32 ints that store 4 8-bit values each without overflowing. + return 8 + elif bpv <= 10: + # If we have 10 bits per value or less then we can compute the prefix sum of 64 ints that store 2 16-bit values each without overflowing. + return 16 + else: + # No risk of overflow with 32 bits per value + return 32 + +def next_primitive(bpv): + if bpv <= 8: + return 8 + elif bpv <= 16: + return 16 + else: + return 32 + +def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f): + iteration = 1 + num_ints = bpv * num_values / remaining_bits_per_int + while num_ints % 2 == 0 and num_values % 2 == 0: + num_ints /= 2 + num_values /= 2 + iteration *= 2 + f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values)) + i = 0 + remaining_bits = 0 + tmp_idx = 0 + for i in range(int(num_values)): + b = bpv + if remaining_bits == 0: + b -= remaining_bits_per_int + f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + else: + b -= remaining_bits + f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b)) + tmp_idx += 1 + while b >= remaining_bits_per_int: + b -= remaining_bits_per_int + f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + tmp_idx += 1 + if b > 0: + f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b)) + remaining_bits = remaining_bits_per_int-b + f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i)) + f.write(' }\n') + +def writeDecode(bpv, f): + next_primitive = primitive_size_for_bpv(bpv) + if next_primitive % bpv == 0: + f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %(bpv, next_primitive)) + else: + f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %(bpv, next_primitive)) + if bpv == next_primitive: + f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4)) + else: + num_values_per_int = 32 / next_primitive + remaining_bits = next_primitive % bpv + num_iters = (next_primitive - 1) // bpv + o = 4 * bpv * num_iters + if remaining_bits == 0: + f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv)) + else: + f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv)) + writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f) + f.write(' }\n') + +if __name__ == '__main__': + f = open(OUTPUT_FILE, 'w') + f.write(HEADER) + f.write(""" + /** + * Delta-decode 128 integers into {@code ints}. 
+ */ + void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints) throws IOException { + switch (bitsPerValue) { +""") + for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + primitive_size = primitive_size_for_bpv(bpv) + f.write(' case %d:\n' %bpv) + if next_primitive(bpv) == primitive_size: + if primitive_size % bpv == 0: + f.write(' decode%d(pdu, ints);\n' %bpv) + else: + f.write(' decode%d(pdu, tmp, ints);\n' %bpv) + else: + if primitive_size % bpv == 0: + f.write(' decode%dTo%d(pdu, ints);\n' %(bpv, primitive_size)) + else: + f.write(' decode%dTo%d(pdu, tmp, ints);\n' %(bpv, primitive_size)) + f.write(' prefixSum%d(ints, base);\n' %primitive_size) + f.write(' break;\n') + f.write(' default:\n') + f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n') + f.write(' prefixSum32(ints, base);\n') + f.write(' break;\n') + f.write(' }\n') + f.write(' }\n') + + f.write('\n') + for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + if next_primitive(bpv) != primitive_size_for_bpv(bpv): + writeDecode(bpv, f) + if bpv < MAX_SPECIALIZED_BITS_PER_VALUE: + f.write('\n') + + f.write('}\n') diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py new file mode 100644 index 000000000000..0af17974532f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py @@ -0,0 +1,327 @@ +#! /usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import gcd + +"""Code generation for ForUtil.java""" + +MAX_SPECIALIZED_BITS_PER_VALUE = 24 +OUTPUT_FILE = "ForUtil.java" +PRIMITIVE_SIZE = [8, 16, 32] +HEADER = """// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.store.DataOutput; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ + * Encodes multiple integers in one to get SIMD-like speedups. + * If bitsPerValue <= 8 then we pack 4 ints per Java int + * else if bitsPerValue <= 16 we pack 2 ints per Java int + * else we do scalar operations. + */ +public final class ForUtil { + + public static final int BLOCK_SIZE = 128; + static final int BLOCK_SIZE_LOG2 = 7; + + static int expandMask16(int mask16) { + return mask16 | (mask16 << 16); + } + + static int expandMask8(int mask8) { + return expandMask16(mask8 | (mask8 << 8)); + } + + static int mask32(int bitsPerValue) { + return (1 << bitsPerValue) - 1; + } + + static int mask16(int bitsPerValue) { + return expandMask16((1 << bitsPerValue) - 1); + } + + static int mask8(int bitsPerValue) { + return expandMask8((1 << bitsPerValue) - 1); + } + + static void expand8(int[] arr) { + for (int i = 0; i < 32; ++i) { + int l = arr[i]; + arr[i] = (l >>> 24) & 0xFF; + arr[32 + i] = (l >>> 16) & 0xFF; + arr[64 + i] = (l >>> 8) & 0xFF; + arr[96 + i] = l & 0xFF; + } + } + + static void collapse8(int[] arr) { + for (int i = 0; i < 32; ++i) { + arr[i] = + (arr[i] << 24) + | (arr[32 + i] << 16) + | (arr[64 + i] << 8) + | arr[96 + i]; + } + } + + static void expand16(int[] arr) { + for (int i = 0; i < 64; ++i) { + int l = arr[i]; + arr[i] = (l >>> 16) & 0xFFFF; + arr[64 + i] = l & 0xFFFF; + } + } + + static void collapse16(int[] arr) { + for (int i = 0; i < 64; ++i) { + arr[i] = (arr[i] << 16) | arr[64 + i]; + } + } + + private final int[] tmp = new int[BLOCK_SIZE]; + + /** Encode 128 integers from {@code ints} into {@code out}. 
*/ + void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException { + final int nextPrimitive; + if (bitsPerValue <= 8) { + nextPrimitive = 8; + collapse8(ints); + } else if (bitsPerValue <= 16) { + nextPrimitive = 16; + collapse16(ints); + } else { + nextPrimitive = 32; + } + encode(ints, bitsPerValue, nextPrimitive, out, tmp); + } + + static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp) throws IOException { + final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE; + + final int numIntsPerShift = bitsPerValue * 4; + int idx = 0; + int shift = primitiveSize - bitsPerValue; + for (int i = 0; i < numIntsPerShift; ++i) { + tmp[i] = ints[idx++] << shift; + } + for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) { + for (int i = 0; i < numIntsPerShift; ++i) { + tmp[i] |= ints[idx++] << shift; + } + } + + final int remainingBitsPerInt = shift + bitsPerValue; + final int maskRemainingBitsPerInt; + if (primitiveSize == 8) { + maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt]; + } else if (primitiveSize == 16) { + maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt]; + } else { + maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt]; + } + + int tmpIdx = 0; + int remainingBitsPerValue = bitsPerValue; + while (idx < numInts) { + if (remainingBitsPerValue >= remainingBitsPerInt) { + remainingBitsPerValue -= remainingBitsPerInt; + tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt; + if (remainingBitsPerValue == 0) { + idx++; + remainingBitsPerValue = bitsPerValue; + } + } else { + final int mask1, mask2; + if (primitiveSize == 8) { + mask1 = MASKS8[remainingBitsPerValue]; + mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue]; + } else if (primitiveSize == 16) { + mask1 = MASKS16[remainingBitsPerValue]; + mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue]; + } else { + mask1 = MASKS32[remainingBitsPerValue]; + mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue]; + } + tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue); + remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue; + tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2; + } + } + + for (int i = 0; i < numIntsPerShift; ++i) { + out.writeInt(tmp[i]); + } + } + + /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. 
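ForUtil's encode above first collapses four 8-bit (or two 16-bit) values into each 32-bit integer so that the subsequent bit packing operates on whole ints. The snippet below is a standalone round-trip sketch of that collapse8/expand8 layout on a 128-value block (plain Java, no Lucene types).

// Round-trip sketch (not part of the patch) of ForUtil.collapse8/expand8 above:
// four byte-sized values share one int, from the high byte (i) down to the low byte (96 + i).
import java.util.Arrays;

public class Collapse8Demo {
  public static void main(String[] args) {
    int[] arr = new int[128];
    for (int i = 0; i < 128; ++i) {
      arr[i] = i & 0xFF;              // any values that fit in 8 bits
    }
    int[] original = arr.clone();

    // collapse: arr[0..31] now each hold 4 of the original values
    for (int i = 0; i < 32; ++i) {
      arr[i] = (arr[i] << 24) | (arr[32 + i] << 16) | (arr[64 + i] << 8) | arr[96 + i];
    }

    // expand: recover all 128 values from the 32 packed ints
    for (int i = 0; i < 32; ++i) {
      int l = arr[i];
      arr[i] = (l >>> 24) & 0xFF;
      arr[32 + i] = (l >>> 16) & 0xFF;
      arr[64 + i] = (l >>> 8) & 0xFF;
      arr[96 + i] = l & 0xFF;
    }
    System.out.println(Arrays.equals(arr, original));   // prints true
  }
}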
*/ + static int numBytes(int bitsPerValue) { + return bitsPerValue << (BLOCK_SIZE_LOG2 - 3); + } + + static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + final int numInts = bitsPerValue << 2; + final int mask = MASKS32[bitsPerValue]; + pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1); + final int remainingBitsPerInt = 32 - bitsPerValue; + final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt]; + int tmpIdx = 0; + int remainingBits = remainingBitsPerInt; + for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) { + int b = bitsPerValue - remainingBits; + int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b; + while (b >= remainingBitsPerInt) { + b -= remainingBitsPerInt; + l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b; + } + if (b > 0) { + l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b]; + remainingBits = remainingBitsPerInt - b; + } else { + remainingBits = remainingBitsPerInt; + } + ints[intsIdx] = l; + } + } + +""" + +def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f): + iteration = 1 + num_ints = bpv * num_values / remaining_bits_per_int + while num_ints % 2 == 0 and num_values % 2 == 0: + num_ints /= 2 + num_values /= 2 + iteration *= 2 + f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values)) + i = 0 + remaining_bits = 0 + tmp_idx = 0 + for i in range(int(num_values)): + b = bpv + if remaining_bits == 0: + b -= remaining_bits_per_int + f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + else: + b -= remaining_bits + f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b)) + tmp_idx += 1 + while b >= remaining_bits_per_int: + b -= remaining_bits_per_int + f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + tmp_idx += 1 + if b > 0: + f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b)) + remaining_bits = remaining_bits_per_int-b + f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i)) + f.write(' }\n') + + +def writeDecode(bpv, f): + next_primitive = 32 + if bpv <= 8: + next_primitive = 8 + elif bpv <= 16: + next_primitive = 16 + if bpv == next_primitive: + f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv) + f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4)) + else: + num_values_per_int = 32 / next_primitive + remaining_bits = next_primitive % bpv + num_iters = (next_primitive - 1) // bpv + o = 4 * bpv * num_iters + if remaining_bits == 0: + f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv) + f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv)) + else: + f.write(' static void decode%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %bpv) + f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv)) + writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f) + f.write(' }\n') + +if __name__ == '__main__': + f = open(OUTPUT_FILE, 'w') + f.write(HEADER) + for primitive_size in PRIMITIVE_SIZE: + f.write(' 
static final int[] MASKS%d = new int[%d];\n' %(primitive_size, primitive_size)) + f.write('\n') + f.write(' static {\n') + for primitive_size in PRIMITIVE_SIZE: + f.write(' for (int i = 0; i < %d; ++i) {\n' %primitive_size) + f.write(' MASKS%d[i] = mask%d(i);\n' %(primitive_size, primitive_size)) + f.write(' }\n') + f.write(' }') + f.write(""" + // mark values in array as final ints to avoid the cost of reading array, arrays should only be + // used when the idx is a variable +""") + for primitive_size in PRIMITIVE_SIZE: + for bpv in range(1, min(MAX_SPECIALIZED_BITS_PER_VALUE + 1, primitive_size)): + f.write(' static final int MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv)) + + f.write(""" + /** Decode 128 integers into {@code ints}. */ + void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException { + switch (bitsPerValue) { +""") + for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + next_primitive = 32 + if bpv <= 8: + next_primitive = 8 + elif bpv <= 16: + next_primitive = 16 + f.write(' case %d:\n' %bpv) + if next_primitive % bpv == 0: + f.write(' decode%d(pdu, ints);\n' %bpv) + else: + f.write(' decode%d(pdu, tmp, ints);\n' %bpv) + if next_primitive != 32: + f.write(' expand%d(ints);\n' %next_primitive) + f.write(' break;\n') + f.write(' default:\n') + f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n') + f.write(' break;\n') + f.write(' }\n') + f.write(' }\n') + + for i in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + writeDecode(i, f) + if i < MAX_SPECIALIZED_BITS_PER_VALUE: + f.write('\n') + + f.write('}\n') diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/package-info.java similarity index 96% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene101/package-info.java index 853f86a855ab..e582f12c3185 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/package-info.java @@ -16,7 +16,7 @@ */ /** - * Lucene 9.12 file format. + * Lucene 10.1 file format. * *

    Apache Lucene - Index File Formats

    * @@ -151,15 +151,15 @@ * field names. These are used to store auxiliary information about the document, such as its * title, url, or an identifier to access a database. The set of stored fields are what is * returned for each hit when searching. This is keyed by document number. - *
  • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A + *
  • {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term dictionary}. A * dictionary containing all of the terms used in all of the indexed fields of all of the * documents. The dictionary also contains the number of documents which contain the term, and * pointers to the term's frequency and proximity data. - *
  • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For + *
  • {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Frequency data}. For * each term in the dictionary, the numbers of all the documents that contain that term, and * the frequency of the term in that document, unless frequencies are omitted ({@link * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) - *
  • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For + *
  • {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Proximity data}. For * each term in the dictionary, the positions that the term occurs in each document. Note that * this will not exist if all fields in all documents omit position data. *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For @@ -255,27 +255,27 @@ * The stored fields for documents * * - * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary} + * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Dictionary} * .tim * The term dictionary, stores term info * * - * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index} + * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Index} * .tip * The index into the Term Dictionary * * - * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies} + * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Frequencies} * .doc * Contains the list of docs which contain each term along with frequency * * - * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions} + * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Positions} * .pos * Stores position information about where a term occurs in the index * * - * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads} + * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Payloads} * .pay * Stores additional per-position metadata information such as character offsets and user payloads * @@ -306,7 +306,7 @@ * * * {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} - * .dii, .dim + * .kdd, .kdi, .kdm * Holds indexed points * * @@ -400,6 +400,8 @@ * performant encoding that is vectorized. *
  • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow * user-defined sorts to be used + *
  • In version 8.6, points fields split the index tree and leaf data into separate files, to + * allow for different access patterns to the different data structures *
  • In version 8.7, stored fields compression became adaptive to better handle documents with * smaller stored fields. *
  • In version 9.0, vector-valued fields were added. @@ -414,6 +416,8 @@ *
  • In version 9.12, skip data was refactored to have only two levels: every 128 docs and every * 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that * need skipping, especially conjunctions. + *
  • In version 10.1, block encoding changed to be optimized for int[] storage instead of + * long[]. * * * @@ -428,4 +432,4 @@ * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt * VInt} values which have no limit. */ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.codecs.lucene101; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java index a2b2c84e12ae..dbd56125fcd1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java @@ -18,6 +18,7 @@ import java.io.DataInput; import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -439,6 +440,40 @@ public static RandomAccessInput createJumpTable( // ALL variables int gap; + /** + * Returns an iterator that delegates to the IndexedDISI. Advancing this iterator will advance the + * underlying IndexedDISI, and vice-versa. + */ + public static KnnVectorValues.DocIndexIterator asDocIndexIterator(IndexedDISI disi) { + // can we replace with fromDISI? + return new KnnVectorValues.DocIndexIterator() { + @Override + public int docID() { + return disi.docID(); + } + + @Override + public int index() { + return disi.index(); + } + + @Override + public int nextDoc() throws IOException { + return disi.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + return disi.advance(target); + } + + @Override + public long cost() { + return disi.cost(); + } + }; + } + @Override public int docID() { return doc; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java index 9851b0326bf0..80b98e0a4c52 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java @@ -82,9 +82,8 @@ public final class Lucene90CompoundFormat extends CompoundFormat { public Lucene90CompoundFormat() {} @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { - return new Lucene90CompoundReader(dir, si, context); + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { + return new Lucene90CompoundReader(dir, si); } @Override @@ -104,15 +103,7 @@ public void write(Directory dir, SegmentInfo si, IOContext context) throws IOExc } } - private static class SizedFile { - private final String name; - private final long length; - - private SizedFile(String name, long length) { - this.name = name; - this.length = length; - } - } + private record SizedFile(String name, long length) {} private static class SizedFileQueue extends PriorityQueue { SizedFileQueue(int maxSize) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundReader.java index ee9c9ae40fa0..8f6211bc959b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundReader.java @@ -30,6 +30,7 @@ import 
org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.IOUtils; @@ -56,8 +57,7 @@ public static final class FileEntry { /** Create a new CompoundFileDirectory. */ // TODO: we should just pre-strip "entries" and append segment name up-front like simpletext? // this need not be a "general purpose" directory anymore (it only writes index files) - public Lucene90CompoundReader(Directory directory, SegmentInfo si, IOContext context) - throws IOException { + public Lucene90CompoundReader(Directory directory, SegmentInfo si) throws IOException { this.directory = directory; this.segmentName = si.name; String dataFileName = @@ -75,7 +75,7 @@ public Lucene90CompoundReader(Directory directory, SegmentInfo si, IOContext con .orElseGet(() -> CodecUtil.indexHeaderLength(Lucene90CompoundFormat.DATA_CODEC, "")) + CodecUtil.footerLength(); - handle = directory.openInput(dataFileName, context); + handle = directory.openInput(dataFileName, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL)); try { CodecUtil.checkIndexHeader( handle, Lucene90CompoundFormat.DATA_CODEC, version, version, si.getId(), ""); @@ -169,7 +169,7 @@ public IndexInput openInput(String name, IOContext context) throws IOException { + entries.keySet() + ")"); } - return handle.slice(name, entry.offset, entry.length); + return handle.slice(name, entry.offset, entry.length, context.readAdvice()); } /** Returns an array of strings, one for each file in the directory. */ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java index c9f2e7742333..13f1463d0dbf 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java @@ -31,6 +31,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; @@ -143,7 +144,7 @@ public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOExcepti return DocValues.singleton(valuesProducer.getNumeric(field)); } }; - if (field.hasDocValuesSkipIndex()) { + if (field.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { writeSkipIndex(field, producer); } writeValues(field, producer, false); @@ -248,7 +249,7 @@ public static SkipAccumulator merge(List list, int index, int l private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { - assert field.hasDocValuesSkipIndex(); + assert field.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE; final long start = data.getFilePointer(); final SortedNumericDocValues values = valuesProducer.getSortedNumeric(field); long globalMaxValue = Long.MIN_VALUE; @@ -700,7 +701,7 @@ public long cost() { return DocValues.singleton(sortedOrds); } }; - if (field.hasDocValuesSkipIndex()) { + if (field.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { writeSkipIndex(field, producer); } if (addTypeByte) { @@ -873,7 +874,7 @@ public void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProdu 
private void doAddSortedNumericField( FieldInfo field, DocValuesProducer valuesProducer, boolean ords) throws IOException { - if (field.hasDocValuesSkipIndex()) { + if (field.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { writeSkipIndex(field, valuesProducer); } if (ords) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java index a44092dbc246..11e83b3f03c1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java @@ -21,14 +21,13 @@ import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -42,6 +41,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; @@ -58,12 +58,12 @@ /** reader for {@link Lucene90DocValuesFormat} */ final class Lucene90DocValuesProducer extends DocValuesProducer { - private final Map numerics; - private final Map binaries; - private final Map sorted; - private final Map sortedSets; - private final Map sortedNumerics; - private final Map skippers; + private final IntObjectHashMap numerics; + private final IntObjectHashMap binaries; + private final IntObjectHashMap sorted; + private final IntObjectHashMap sortedSets; + private final IntObjectHashMap sortedNumerics; + private final IntObjectHashMap skippers; private final IndexInput data; private final int maxDoc; private int version = -1; @@ -80,12 +80,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); this.maxDoc = state.segmentInfo.maxDoc(); - numerics = new HashMap<>(); - binaries = new HashMap<>(); - sorted = new HashMap<>(); - sortedSets = new HashMap<>(); - sortedNumerics = new HashMap<>(); - skippers = new HashMap<>(); + numerics = new IntObjectHashMap<>(); + binaries = new IntObjectHashMap<>(); + sorted = new IntObjectHashMap<>(); + sortedSets = new IntObjectHashMap<>(); + sortedNumerics = new IntObjectHashMap<>(); + skippers = new IntObjectHashMap<>(); merging = false; // read in the entries from the metadata file. 
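The hunks above and below replace the producer's HashMap<String, Entry> fields, keyed by field name, with IntObjectHashMap keyed by the primitive field number: callers resolve the FieldInfo first and then look the entry up via info.number. A minimal sketch of that lookup pattern, assuming lucene-core on the classpath; the Entry record and the FieldEntryLookup wrapper are hypothetical stand-ins for illustration, not the actual producer code:

```java
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.internal.hppc.IntObjectHashMap;

// Sketch of the "key per-field metadata by field number" pattern from the diff.
// Entry and FieldEntryLookup are hypothetical names, used only for illustration.
final class FieldEntryLookup {

  record Entry(long offset, long length) {} // hypothetical per-field metadata

  private final IntObjectHashMap<Entry> entries = new IntObjectHashMap<>();
  private final FieldInfos fieldInfos;

  FieldEntryLookup(FieldInfos fieldInfos) {
    this.fieldInfos = fieldInfos;
  }

  void register(FieldInfo info, Entry entry) {
    // int key: no String hashing or boxing on the lookup path
    entries.put(info.number, entry);
  }

  Entry get(String field) {
    FieldInfo info = fieldInfos.fieldInfo(field);
    if (info == null) {
      throw new IllegalArgumentException("field=\"" + field + "\" not found");
    }
    return entries.get(info.number);
  }
}
```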
@@ -148,12 +148,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { // Used for cloning private Lucene90DocValuesProducer( - Map numerics, - Map binaries, - Map sorted, - Map sortedSets, - Map sortedNumerics, - Map skippers, + IntObjectHashMap numerics, + IntObjectHashMap binaries, + IntObjectHashMap sorted, + IntObjectHashMap sortedSets, + IntObjectHashMap sortedNumerics, + IntObjectHashMap skippers, IndexInput data, int maxDoc, int version, @@ -192,19 +192,19 @@ private void readFields(IndexInput meta, FieldInfos infos) throws IOException { throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); } byte type = meta.readByte(); - if (info.hasDocValuesSkipIndex()) { - skippers.put(info.name, readDocValueSkipperMeta(meta)); + if (info.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { + skippers.put(info.number, readDocValueSkipperMeta(meta)); } if (type == Lucene90DocValuesFormat.NUMERIC) { - numerics.put(info.name, readNumeric(meta)); + numerics.put(info.number, readNumeric(meta)); } else if (type == Lucene90DocValuesFormat.BINARY) { - binaries.put(info.name, readBinary(meta)); + binaries.put(info.number, readBinary(meta)); } else if (type == Lucene90DocValuesFormat.SORTED) { - sorted.put(info.name, readSorted(meta)); + sorted.put(info.number, readSorted(meta)); } else if (type == Lucene90DocValuesFormat.SORTED_SET) { - sortedSets.put(info.name, readSortedSet(meta)); + sortedSets.put(info.number, readSortedSet(meta)); } else if (type == Lucene90DocValuesFormat.SORTED_NUMERIC) { - sortedNumerics.put(info.name, readSortedNumeric(meta)); + sortedNumerics.put(info.number, readSortedNumeric(meta)); } else { throw new CorruptIndexException("invalid type: " + type, meta); } @@ -429,7 +429,7 @@ private static class SortedNumericEntry extends NumericEntry { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { - NumericEntry entry = numerics.get(field.name); + NumericEntry entry = numerics.get(field.number); return getNumeric(entry); } @@ -785,13 +785,13 @@ public boolean advanceExact(int target) throws IOException { @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - BinaryEntry entry = binaries.get(field.name); + BinaryEntry entry = binaries.get(field.number); if (entry.docsWithFieldOffset == -2) { return DocValues.emptyBinary(); } - final IndexInput bytesSlice = data.slice("fixed-binary", entry.dataOffset, entry.dataLength); + final RandomAccessInput bytesSlice = data.randomAccessSlice(entry.dataOffset, entry.dataLength); // Prefetch the first page of data. Following pages are expected to get prefetched through // read-ahead. 
if (bytesSlice.length() > 0) { @@ -808,8 +808,7 @@ public BinaryDocValues getBinary(FieldInfo field) throws IOException { @Override public BytesRef binaryValue() throws IOException { - bytesSlice.seek((long) doc * length); - bytesSlice.readBytes(bytes.bytes, 0, length); + bytesSlice.readBytes((long) doc * length, bytes.bytes, 0, length); return bytes; } }; @@ -831,8 +830,7 @@ public BytesRef binaryValue() throws IOException { public BytesRef binaryValue() throws IOException { long startOffset = addresses.get(doc); bytes.length = (int) (addresses.get(doc + 1L) - startOffset); - bytesSlice.seek(startOffset); - bytesSlice.readBytes(bytes.bytes, 0, bytes.length); + bytesSlice.readBytes(startOffset, bytes.bytes, 0, bytes.length); return bytes; } }; @@ -855,8 +853,7 @@ public BytesRef binaryValue() throws IOException { @Override public BytesRef binaryValue() throws IOException { - bytesSlice.seek((long) disi.index() * length); - bytesSlice.readBytes(bytes.bytes, 0, length); + bytesSlice.readBytes((long) disi.index() * length, bytes.bytes, 0, length); return bytes; } }; @@ -879,8 +876,7 @@ public BytesRef binaryValue() throws IOException { final int index = disi.index(); long startOffset = addresses.get(index); bytes.length = (int) (addresses.get(index + 1L) - startOffset); - bytesSlice.seek(startOffset); - bytesSlice.readBytes(bytes.bytes, 0, bytes.length); + bytesSlice.readBytes(startOffset, bytes.bytes, 0, bytes.length); return bytes; } }; @@ -890,7 +886,7 @@ public BytesRef binaryValue() throws IOException { @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { - SortedEntry entry = sorted.get(field.name); + SortedEntry entry = sorted.get(field.number); return getSorted(entry); } @@ -1124,7 +1120,7 @@ private class TermsDict extends BaseTermsEnum { final IndexInput bytes; final long blockMask; final LongValues indexAddresses; - final IndexInput indexBytes; + final RandomAccessInput indexBytes; final BytesRef term; long ord = -1; @@ -1146,7 +1142,7 @@ private class TermsDict extends BaseTermsEnum { indexAddresses = DirectMonotonicReader.getInstance( entry.termsIndexAddressesMeta, indexAddressesSlice, merging); - indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength); + indexBytes = data.randomAccessSlice(entry.termsIndexOffset, entry.termsIndexLength); term = new BytesRef(entry.maxTermLength); // add the max term length for the dictionary @@ -1204,8 +1200,7 @@ private BytesRef getTermFromIndex(long index) throws IOException { assert index >= 0 && index <= (entry.termsDictSize - 1) >>> entry.termsDictIndexShift; final long start = indexAddresses.get(index); term.length = (int) (indexAddresses.get(index + 1) - start); - indexBytes.seek(start); - indexBytes.readBytes(term.bytes, 0, term.length); + indexBytes.readBytes(start, term.bytes, 0, term.length); return term; } @@ -1367,7 +1362,7 @@ public int docFreq() throws IOException { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - SortedNumericEntry entry = sortedNumerics.get(field.name); + SortedNumericEntry entry = sortedNumerics.get(field.number); return getSortedNumeric(entry); } @@ -1512,7 +1507,7 @@ private void set() { @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - SortedSetEntry entry = sortedSets.get(field.name); + SortedSetEntry entry = sortedSets.get(field.number); if (entry.singleValueEntry != null) { return DocValues.singleton(getSorted(entry.singleValueEntry)); } @@ -1786,7 +1781,7 @@ 
long getLongValue(long index) throws IOException { @Override public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { - final DocValuesSkipperEntry entry = skippers.get(field.name); + final DocValuesSkipperEntry entry = skippers.get(field.number); final IndexInput input = data.slice("doc value skipper", entry.offset, entry.length); // Prefetch the first page of data. Following pages are expected to get prefetched through diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java index d3f256cbf00c..82910e23ab9e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java @@ -33,9 +33,9 @@ /** Reads point values previously written with {@link Lucene90PointsWriter} */ public class Lucene90PointsReader extends PointsReader { - final IndexInput indexIn, dataIn; - final SegmentReadState readState; - final IntObjectHashMap readers = new IntObjectHashMap<>(); + private final IndexInput indexIn, dataIn; + private final SegmentReadState readState; + private final IntObjectHashMap readers = new IntObjectHashMap<>(); /** Sole constructor */ public Lucene90PointsReader(SegmentReadState readState) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java index e50d6a0fdb5a..45a946e8ac40 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java @@ -253,7 +253,7 @@ public void merge(MergeState mergeState) throws IOException { FieldInfos readerFieldInfos = mergeState.fieldInfos[i]; FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) { - PointValues aPointValues = reader90.readers.get(readerFieldInfo.number); + PointValues aPointValues = reader90.getValues(readerFieldInfo.name); if (aPointValues != null) { pointValues.add(aPointValues); docMaps.add(mergeState.docMaps[i]); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java index ce0310d63967..9e367a3d9d82 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java @@ -49,9 +49,9 @@ * *
      *   // the default: for high performance
    - *   indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_SPEED));
    + *   indexWriterConfig.setCodec(new Lucene100Codec(Mode.BEST_SPEED));
      *   // instead for higher compression (but slower):
    - *   // indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_COMPRESSION));
    + *   // indexWriterConfig.setCodec(new Lucene100Codec(Mode.BEST_COMPRESSION));
      * 
    * *

    File formats diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index 9988c45bdf72..85d23a489fe9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -598,7 +598,7 @@ public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOEx startBytePos = suffixesReader.getPosition(); suffixesReader.skipBytes(suffixLength); - // Loop over bytes in the suffix, comparing to the target + // Compare suffix and target. final int cmp = Arrays.compareUnsigned( suffixBytes, @@ -686,7 +686,7 @@ public SeekStatus binarySearchTermLeaf(BytesRef target, boolean exactOnly) throw nextEnt = mid + 1; startBytePos = mid * suffixLength; - // Binary search bytes in the suffix, comparing to the target. + // Compare suffix and target. cmp = Arrays.compareUnsigned( suffixBytes, @@ -792,6 +792,7 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I lastSubFP = fp - subCode; } + // Compare suffix and target. final int cmp = Arrays.compareUnsigned( suffixBytes, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java index b06da63153c4..06af857c58c5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.NoSuchElementException; @@ -77,6 +78,9 @@ */ public final class Lucene90CompressingTermVectorsReader extends TermVectorsReader { + private static final int PREFETCH_CACHE_SIZE = 1 << 4; + private static final int PREFETCH_CACHE_MASK = PREFETCH_CACHE_SIZE - 1; + private final FieldInfos fieldInfos; final FieldsIndex indexReader; final IndexInput vectorsStream; @@ -93,6 +97,11 @@ public final class Lucene90CompressingTermVectorsReader extends TermVectorsReade private final long numDirtyDocs; // cumulative number of docs in incomplete chunks private final long maxPointer; // end of the data section private BlockState blockState = new BlockState(-1, -1, 0); + // Cache of recently prefetched block IDs. This helps reduce chances of prefetching the same block + // multiple times, which is otherwise likely due to index sorting or recursive graph bisection + // clustering similar documents together. NOTE: this cache must be small since it's fully scanned. 
+ private final long[] prefetchedBlockIDCache; + private int prefetchedBlockIDCacheIndex; // used by clone private Lucene90CompressingTermVectorsReader(Lucene90CompressingTermVectorsReader reader) { @@ -111,6 +120,8 @@ private Lucene90CompressingTermVectorsReader(Lucene90CompressingTermVectorsReade this.numDirtyChunks = reader.numDirtyChunks; this.numDirtyDocs = reader.numDirtyDocs; this.maxPointer = reader.maxPointer; + this.prefetchedBlockIDCache = new long[PREFETCH_CACHE_SIZE]; + Arrays.fill(prefetchedBlockIDCache, -1); this.closed = false; } @@ -212,6 +223,9 @@ public Lucene90CompressingTermVectorsReader( CodecUtil.checkFooter(metaIn, null); metaIn.close(); + this.prefetchedBlockIDCache = new long[PREFETCH_CACHE_SIZE]; + Arrays.fill(prefetchedBlockIDCache, -1); + success = true; } catch (Throwable t) { if (metaIn != null) { @@ -325,16 +339,23 @@ boolean isLoaded(int docID) { return blockState.docBase <= docID && docID < blockState.docBase + blockState.chunkDocs; } - private static class BlockState { - final long startPointer; - final int docBase; - final int chunkDocs; + private record BlockState(long startPointer, int docBase, int chunkDocs) {} + + @Override + public void prefetch(int docID) throws IOException { + final long blockID = indexReader.getBlockID(docID); - BlockState(long startPointer, int docBase, int chunkDocs) { - this.startPointer = startPointer; - this.docBase = docBase; - this.chunkDocs = chunkDocs; + for (long prefetchedBlockID : prefetchedBlockIDCache) { + if (prefetchedBlockID == blockID) { + return; + } } + + final long blockStartPointer = indexReader.getBlockStartPointer(blockID); + final long blockLength = indexReader.getBlockLength(blockID); + vectorsStream.prefetch(blockStartPointer, blockLength); + + prefetchedBlockIDCache[prefetchedBlockIDCacheIndex++ & PREFETCH_CACHE_MASK] = blockID; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java deleted file mode 100644 index 8b9aedcfb2bb..000000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.lucene912; - -import java.io.IOException; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.util.packed.PackedInts; - -/** Utility class to encode/decode increasing sequences of 128 integers. 
*/ -public class ForDeltaUtil { - - // IDENTITY_PLUS_ONE[i] == i+1 - private static final long[] IDENTITY_PLUS_ONE = new long[ForUtil.BLOCK_SIZE]; - - static { - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - IDENTITY_PLUS_ONE[i] = i + 1; - } - } - - private static void prefixSumOfOnes(long[] arr, long base) { - System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); - // This loop gets auto-vectorized - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - arr[i] += base; - } - } - - private final ForUtil forUtil; - - ForDeltaUtil(ForUtil forUtil) { - this.forUtil = forUtil; - } - - /** - * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code - * longs} are expected to be deltas between consecutive values. - */ - void encodeDeltas(long[] longs, DataOutput out) throws IOException { - if (longs[0] == 1 && PForUtil.allEqual(longs)) { // happens with very dense postings - out.writeByte((byte) 0); - } else { - long or = 0; - for (long l : longs) { - or |= l; - } - assert or != 0; - final int bitsPerValue = PackedInts.bitsRequired(or); - out.writeByte((byte) bitsPerValue); - forUtil.encode(longs, bitsPerValue, out); - } - } - - /** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */ - void decodeAndPrefixSum(DataInput in, long base, long[] longs) throws IOException { - final int bitsPerValue = Byte.toUnsignedInt(in.readByte()); - if (bitsPerValue == 0) { - prefixSumOfOnes(longs, base); - } else { - forUtil.decodeAndPrefixSum(bitsPerValue, in, base, longs); - } - } - - void skip(DataInput in) throws IOException { - final int bitsPerValue = Byte.toUnsignedInt(in.readByte()); - in.skipBytes(forUtil.numBytes(bitsPerValue)); - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index 481148cbd0db..c4cdd722c541 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -163,8 +164,6 @@ public FieldInfos read( boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; boolean isParentField = format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; - boolean hasDocValuesSkipIndex = - format >= FORMAT_DOCVALUE_SKIPPER ? 
(bits & DOCVALUES_SKIPPER) != 0 : false; if ((bits & 0xC0) != 0) { throw new CorruptIndexException( @@ -187,6 +186,12 @@ public FieldInfos read( // DV Types are packed in one byte final DocValuesType docValuesType = getDocValuesType(input, input.readByte()); + final DocValuesSkipIndexType docValuesSkipIndex; + if (format >= FORMAT_DOCVALUE_SKIPPER) { + docValuesSkipIndex = getDocValuesSkipIndexType(input, input.readByte()); + } else { + docValuesSkipIndex = DocValuesSkipIndexType.NONE; + } final long dvGen = input.readLong(); Map attributes = input.readMapOfStrings(); // just use the last field's map if its the same @@ -217,7 +222,7 @@ public FieldInfos read( storePayloads, indexOptions, docValuesType, - hasDocValuesSkipIndex, + docValuesSkipIndex, dvGen, attributes, pointDataDimensionCount, @@ -270,6 +275,18 @@ private static byte docValuesByte(DocValuesType type) { } } + private static byte docValuesSkipIndexByte(DocValuesSkipIndexType type) { + switch (type) { + case NONE: + return 0; + case RANGE: + return 1; + default: + // BUG + throw new AssertionError("unhandled DocValuesSkipIndexType: " + type); + } + } + private static DocValuesType getDocValuesType(IndexInput input, byte b) throws IOException { switch (b) { case 0: @@ -289,6 +306,18 @@ private static DocValuesType getDocValuesType(IndexInput input, byte b) throws I } } + private static DocValuesSkipIndexType getDocValuesSkipIndexType(IndexInput input, byte b) + throws IOException { + switch (b) { + case 0: + return DocValuesSkipIndexType.NONE; + case 1: + return DocValuesSkipIndexType.RANGE; + default: + throw new CorruptIndexException("invalid docvaluesskipindex byte: " + b, input); + } + } + private static VectorEncoding getVectorEncoding(IndexInput input, byte b) throws IOException { if (b < 0 || b >= VectorEncoding.values().length) { throw new CorruptIndexException("invalid vector encoding: " + b, input); @@ -399,18 +428,18 @@ public void write( output.writeVInt(fi.number); byte bits = 0x0; - if (fi.hasVectors()) bits |= STORE_TERMVECTOR; + if (fi.hasTermVectors()) bits |= STORE_TERMVECTOR; if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; if (fi.isParentField()) bits |= PARENT_FIELD_FIELD; - if (fi.hasDocValuesSkipIndex()) bits |= DOCVALUES_SKIPPER; output.writeByte(bits); output.writeByte(indexOptionsByte(fi.getIndexOptions())); // pack the DV type and hasNorms in one byte output.writeByte(docValuesByte(fi.getDocValuesType())); + output.writeByte(docValuesSkipIndexByte(fi.docValuesSkipIndexType())); output.writeLong(fi.getDocValuesGen()); output.writeMapOfStrings(fi.attributes()); output.writeVInt(fi.getPointDimensionCount()); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/HasIndexSlice.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/HasIndexSlice.java new file mode 100644 index 000000000000..2bfe72386a05 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/HasIndexSlice.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene95; + +import org.apache.lucene.store.IndexInput; + +/** + * Implementors can return the IndexInput from which their values are read. For use by vector + * quantizers. + */ +public interface HasIndexSlice { + + /** Returns an IndexInput from which to read this instance's values. */ + IndexInput getSlice(); +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java index f45158eadac7..1e78c8ea7aa2 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java @@ -29,13 +29,11 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. */ -public abstract class OffHeapByteVectorValues extends ByteVectorValues - implements RandomAccessVectorValues.Bytes { +public abstract class OffHeapByteVectorValues extends ByteVectorValues implements HasIndexSlice { protected final int dimension; protected final int size; @@ -132,9 +130,6 @@ public static OffHeapByteVectorValues load( * vector. 
*/ public static class DenseOffHeapVectorValues extends OffHeapByteVectorValues { - - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -145,36 +140,17 @@ public DenseOffHeapVectorValues( super(dimension, size, slice, byteSize, flatVectorsScorer, vectorSimilarityFunction); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( dimension, size, slice.clone(), byteSize, flatVectorsScorer, similarityFunction); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -183,17 +159,18 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(byte[] query) throws IOException { DenseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer scorer = flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); return new VectorScorer() { @Override public float score() throws IOException { - return scorer.score(copy.doc); + return scorer.score(iterator.docID()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -238,27 +215,6 @@ public SparseOffHeapVectorValues( configuration.size); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( @@ -276,6 +232,11 @@ public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); } + @Override + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { if (acceptDocs == null) { @@ -307,7 +268,7 @@ public float score() throws IOException { @Override public DocIdSetIterator iterator() { - return copy; + return copy.disi; } }; } @@ -322,8 +283,6 @@ public EmptyOffHeapVectorValues( super(dimension, 0, null, 0, flatVectorsScorer, vectorSimilarityFunction); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -335,23 +294,13 @@ public int size() { } @Override - public byte[] vectorValue() throws IOException { + public byte[] vectorValue(int ord) throws IOException { throw new UnsupportedOperationException(); } @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - return doc = NO_MORE_DOCS; + public DocIndexIterator iterator() { + return createDenseIterator(); } @Override @@ -359,11 +308,6 @@ public EmptyOffHeapVectorValues copy() throws IOException { throw new UnsupportedOperationException(); } - @Override - public byte[] 
vectorValue(int targetOrd) throws IOException { - throw new UnsupportedOperationException(); - } - @Override public int ordToDoc(int ord) { throw new UnsupportedOperationException(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index 1f61283b5002..2384657e93e1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -28,13 +28,11 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. */ -public abstract class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { +public abstract class OffHeapFloatVectorValues extends FloatVectorValues implements HasIndexSlice { protected final int dimension; protected final int size; @@ -128,8 +126,6 @@ public static OffHeapFloatVectorValues load( */ public static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -141,54 +137,41 @@ public DenseOffHeapVectorValues( } @Override - public float[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); + public DenseOffHeapVectorValues copy() throws IOException { + return new DenseOffHeapVectorValues( + dimension, size, slice.clone(), byteSize, flatVectorsScorer, similarityFunction); } @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; + public int ordToDoc(int ord) { + return ord; } @Override - public DenseOffHeapVectorValues copy() throws IOException { - return new DenseOffHeapVectorValues( - dimension, size, slice.clone(), byteSize, flatVectorsScorer, similarityFunction); + public Bits getAcceptOrds(Bits acceptDocs) { + return acceptDocs; } @Override - public Bits getAcceptOrds(Bits acceptDocs) { - return acceptDocs; + public DocIndexIterator iterator() { + return createDenseIterator(); } @Override public VectorScorer scorer(float[] query) throws IOException { DenseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer randomVectorScorer = flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); return new VectorScorer() { @Override public float score() throws IOException { - return randomVectorScorer.score(copy.doc); + return randomVectorScorer.score(iterator.docID()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -227,27 +210,6 @@ public SparseOffHeapVectorValues( configuration.size); } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int 
target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( @@ -283,20 +245,26 @@ public int length() { }; } + @Override + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); + } + @Override public VectorScorer scorer(float[] query) throws IOException { SparseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer randomVectorScorer = flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); return new VectorScorer() { @Override public float score() throws IOException { - return randomVectorScorer.score(copy.disi.index()); + return randomVectorScorer.score(iterator.index()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -311,8 +279,6 @@ public EmptyOffHeapVectorValues( super(dimension, 0, null, 0, flatVectorsScorer, similarityFunction); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -323,26 +289,6 @@ public int size() { return 0; } - @Override - public float[] vectorValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) { - return doc = NO_MORE_DOCS; - } - @Override public EmptyOffHeapVectorValues copy() { throw new UnsupportedOperationException(); @@ -354,8 +300,8 @@ public float[] vectorValue(int targetOrd) { } @Override - public int ordToDoc(int ord) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return createDenseIterator(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsReader.java index 0613c9c82b84..9b42ddd0f267 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsReader.java @@ -21,8 +21,6 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; @@ -38,6 +36,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -56,13 +55,15 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader { private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(Lucene99FlatVectorsFormat.class); - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final IndexInput vectorData; + private final FieldInfos fieldInfos; public Lucene99FlatVectorsReader(SegmentReadState state, FlatVectorsScorer scorer) throws IOException { super(scorer); int versionMeta = readMetadata(state); + this.fieldInfos = 
state.fieldInfos; boolean success = false; try { vectorData = @@ -155,15 +156,13 @@ private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOExce throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); } FieldEntry fieldEntry = FieldEntry.create(meta, info); - fields.put(info.name, fieldEntry); + fields.put(info.number, fieldEntry); } } @Override public long ramBytesUsed() { - return Lucene99FlatVectorsReader.SHALLOW_SIZE - + RamUsageEstimator.sizeOfMap( - fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class)); + return Lucene99FlatVectorsReader.SHALLOW_SIZE + fields.ramBytesUsed(); } @Override @@ -171,18 +170,27 @@ public void checkIntegrity() throws IOException { CodecUtil.checksumEntireFile(vectorData); } - @Override - public FloatVectorValues getFloatVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { + private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + if (fieldEntry.vectorEncoding != expectedEncoding) { throw new IllegalArgumentException( "field=\"" + field + "\" is encoded as: " + fieldEntry.vectorEncoding + " expected: " - + VectorEncoding.FLOAT32); + + expectedEncoding); } + return fieldEntry; + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); return OffHeapFloatVectorValues.load( fieldEntry.similarityFunction, vectorScorer, @@ -196,16 +204,7 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException { @Override public ByteVectorValues getByteVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) { - throw new IllegalArgumentException( - "field=\"" - + field - + "\" is encoded as: " - + fieldEntry.vectorEncoding - + " expected: " - + VectorEncoding.BYTE); - } + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); return OffHeapByteVectorValues.load( fieldEntry.similarityFunction, vectorScorer, @@ -219,10 +218,7 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { @Override public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { - return null; - } + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); return vectorScorer.getRandomVectorScorer( fieldEntry.similarityFunction, OffHeapFloatVectorValues.load( @@ -239,10 +235,7 @@ public RandomVectorScorer getRandomVectorScorer(String field, float[] target) th @Override public RandomVectorScorer getRandomVectorScorer(String field, byte[] target) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.BYTE) { - return null; - } + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); return vectorScorer.getRandomVectorScorer( fieldEntry.similarityFunction, OffHeapByteVectorValues.load( diff --git 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index 5643752796c2..b731e758b7a8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -39,6 +39,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; @@ -230,12 +231,15 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE // No need to use temporary file as we don't have to re-open for reading DocsWithFieldSet docsWithField = switch (fieldInfo.getVectorEncoding()) { - case BYTE -> writeByteVectorData( - vectorData, - KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); - case FLOAT32 -> writeVectorData( - vectorData, - KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); + case BYTE -> + writeByteVectorData( + vectorData, + KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); + case FLOAT32 -> + writeVectorData( + vectorData, + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues( + fieldInfo, mergeState)); }; long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; writeMeta( @@ -259,12 +263,16 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex( // write the vector data to a temporary file DocsWithFieldSet docsWithField = switch (fieldInfo.getVectorEncoding()) { - case BYTE -> writeByteVectorData( - tempVectorData, - KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); - case FLOAT32 -> writeVectorData( - tempVectorData, - KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); + case BYTE -> + writeByteVectorData( + tempVectorData, + KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues( + fieldInfo, mergeState)); + case FLOAT32 -> + writeVectorData( + tempVectorData, + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues( + fieldInfo, mergeState)); }; CodecUtil.writeFooter(tempVectorData); IOUtils.close(tempVectorData); @@ -289,24 +297,26 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex( final IndexInput finalVectorDataInput = vectorDataInput; final RandomVectorScorerSupplier randomVectorScorerSupplier = switch (fieldInfo.getVectorEncoding()) { - case BYTE -> vectorsScorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - new OffHeapByteVectorValues.DenseOffHeapVectorValues( - fieldInfo.getVectorDimension(), - docsWithField.cardinality(), - finalVectorDataInput, - fieldInfo.getVectorDimension() * Byte.BYTES, - vectorsScorer, - fieldInfo.getVectorSimilarityFunction())); - case FLOAT32 -> vectorsScorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - new OffHeapFloatVectorValues.DenseOffHeapVectorValues( - fieldInfo.getVectorDimension(), - docsWithField.cardinality(), - finalVectorDataInput, - fieldInfo.getVectorDimension() * Float.BYTES, - vectorsScorer, - fieldInfo.getVectorSimilarityFunction())); + case BYTE -> + vectorsScorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + new 
OffHeapByteVectorValues.DenseOffHeapVectorValues( + fieldInfo.getVectorDimension(), + docsWithField.cardinality(), + finalVectorDataInput, + fieldInfo.getVectorDimension() * Byte.BYTES, + vectorsScorer, + fieldInfo.getVectorSimilarityFunction())); + case FLOAT32 -> + vectorsScorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + new OffHeapFloatVectorValues.DenseOffHeapVectorValues( + fieldInfo.getVectorDimension(), + docsWithField.cardinality(), + finalVectorDataInput, + fieldInfo.getVectorDimension() * Float.BYTES, + vectorsScorer, + fieldInfo.getVectorSimilarityFunction())); }; return new FlatCloseableRandomVectorScorerSupplier( () -> { @@ -352,11 +362,10 @@ private void writeMeta( private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); - for (int docV = byteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = byteVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(); + byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); docsWithField.add(docV); @@ -373,11 +382,10 @@ private static DocsWithFieldSet writeVectorData( ByteBuffer buffer = ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); - for (int docV = floatVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = floatVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] value = floatVectorValues.vectorValue(); + float[] value = floatVectorValues.vectorValue(iter.index()); buffer.asFloatBuffer().put(value); output.writeBytes(buffer.array(), buffer.limit()); docsWithField.add(docV); @@ -404,18 +412,20 @@ private abstract static class FieldWriter extends FlatFieldVectorsWriter { static FieldWriter create(FieldInfo fieldInfo) { int dim = fieldInfo.getVectorDimension(); return switch (fieldInfo.getVectorEncoding()) { - case BYTE -> new Lucene99FlatVectorsWriter.FieldWriter(fieldInfo) { - @Override - public byte[] copyValue(byte[] value) { - return ArrayUtil.copyOfSubArray(value, 0, dim); - } - }; - case FLOAT32 -> new Lucene99FlatVectorsWriter.FieldWriter(fieldInfo) { - @Override - public float[] copyValue(float[] value) { - return ArrayUtil.copyOfSubArray(value, 0, dim); - } - }; + case BYTE -> + new Lucene99FlatVectorsWriter.FieldWriter(fieldInfo) { + @Override + public byte[] copyValue(byte[] value) { + return ArrayUtil.copyOfSubArray(value, 0, dim); + } + }; + case FLOAT32 -> + new Lucene99FlatVectorsWriter.FieldWriter(fieldInfo) { + @Override + public float[] copyValue(float[] value) { + return ArrayUtil.copyOfSubArray(value, 0, dim); + } + }; }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java index 5083b05c82dc..1966ed21d654 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java @@ -65,19 +65,19 @@ public class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFormat { private final int numMergeWorkers; private final TaskExecutor mergeExec; - /** Constructs a format using default graph construction parameters */ + /** Constructs a format using default graph construction parameters with 7 bit quantization */ public Lucene99HnswScalarQuantizedVectorsFormat() { - this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, 7, true, null, null); + this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, 7, false, null, null); } /** - * Constructs a format using the given graph construction parameters. + * Constructs a format using the given graph construction parameters with 7 bit quantization * * @param maxConn the maximum number of connections to a node in the HNSW graph * @param beamWidth the size of the queue maintained during graph construction. */ public Lucene99HnswScalarQuantizedVectorsFormat(int maxConn, int beamWidth) { - this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, 7, true, null, null); + this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, 7, false, null, null); } /** @@ -87,11 +87,11 @@ public Lucene99HnswScalarQuantizedVectorsFormat(int maxConn, int beamWidth) { * @param beamWidth the size of the queue maintained during graph construction. * @param numMergeWorkers number of workers (threads) that will be used when doing merge. If * larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec - * @param bits the number of bits to use for scalar quantization (must be between 1 and 8, - * inclusive) - * @param compress whether to compress the vectors, if true, the vectors that are quantized with - * lte 4 bits will be compressed into a single byte. If false, the vectors will be stored as - * is. This provides a trade-off of memory usage and speed. + * @param bits the number of bits to use for scalar quantization (must be 4 or 7) + * @param compress whether to compress the quantized vectors by another 50% when bits=4. If + * `true`, pairs of (4 bit quantized) dimensions are packed into a single byte. This must be + * `false` when bits=7. This provides a trade-off of 50% reduction in hot vector memory usage + * during searching, at some decode speed penalty. * @param confidenceInterval the confidenceInterval for scalar quantizing the vectors, when `null` * it is calculated based on the vector field dimensions. When `0`, the quantiles are * dynamically determined by sampling many confidence intervals and determining the most diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 117393706dbd..04ad32d8aa9c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.util.hnsw.HnswGraph; +import org.apache.lucene.util.hnsw.HnswGraphBuilder; /** * Lucene 9.9 vector format, which encodes numeric vector values into an associated graph connecting @@ -50,9 +51,9 @@ *

 *       • array[vint] the delta encoded neighbor ordinals
 *
- *     • After all levels are encoded memory offsets for each node's neighbor nodes encoded by
- *       {@link org.apache.lucene.util.packed.DirectMonotonicWriter} are appended to the end of the
- *       file.
+ *     • After all levels are encoded, memory offsets for each node's neighbor nodes are appended to
+ *       the end of the file. The offsets are encoded by {@link
+ *       org.apache.lucene.util.packed.DirectMonotonicWriter}.
 *

    .vem (vector metadata) file

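[Editor's note] The updated `compress` javadoc above describes packing pairs of 4-bit quantized dimensions into a single byte, which is also why the per-vector size check later in this patch uses `((dimension + 1) >> 1) + Float.BYTES`. The following is only an editorial sketch of that layout, not the patch's actual packing code; the class name, method name, and the low/high nibble ordering are illustrative assumptions.

```java
// Sketch only: shows the "two 4-bit dimensions per byte" idea behind compress=true.
// The real pairing/order used by the Lucene99 quantized format may differ.
class NibblePackingSketch {
  static byte[] packNibbles(byte[] quantized) {
    // quantized[i] is assumed to hold a 4-bit scalar-quantized value in [0, 15]
    byte[] packed = new byte[(quantized.length + 1) >> 1]; // matches ((dimension + 1) >> 1)
    for (int i = 0; i < quantized.length; i++) {
      int nibble = quantized[i] & 0x0F;
      // illustrative choice: even dimension -> low nibble, odd dimension -> high nibble
      packed[i >> 1] |= (i & 1) == 0 ? nibble : nibble << 4;
    }
    return packed;
  }
}
```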
    @@ -97,19 +98,19 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { public static final int MAXIMUM_MAX_CONN = 512; /** Default number of maximum connections per node */ - public static final int DEFAULT_MAX_CONN = 16; + public static final int DEFAULT_MAX_CONN = HnswGraphBuilder.DEFAULT_MAX_CONN; /** - * The maximum size of the queue to maintain while searching during graph construction This - * maximum value preserves the ratio of the DEFAULT_BEAM_WIDTH/DEFAULT_MAX_CONN i.e. `6.25 * 16 = - * 3200` + * The maximum size of the queue to maintain while searching during graph construction. This + * maximum value preserves the ratio of the `DEFAULT_BEAM_WIDTH`/`DEFAULT_MAX_CONN` (i.e. `6.25 * + * 16 = 3200`). */ public static final int MAXIMUM_BEAM_WIDTH = 3200; /** * Default number of the size of the queue maintained while searching during a graph construction. */ - public static final int DEFAULT_BEAM_WIDTH = 100; + public static final int DEFAULT_BEAM_WIDTH = HnswGraphBuilder.DEFAULT_BEAM_WIDTH; /** Default to use single thread merge */ public static final int DEFAULT_NUM_MERGE_WORKER = 1; @@ -129,7 +130,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { */ private final int beamWidth; - /** The format for storing, reading, merging vectors on disk */ + /** The format for storing, reading, and merging vectors on disk. */ private static final FlatVectorsFormat flatVectorsFormat = new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java index 35bc38571a6a..2a3088527f5f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java @@ -21,9 +21,7 @@ import java.io.IOException; import java.util.Arrays; -import java.util.HashMap; import java.util.List; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; @@ -37,6 +35,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -70,7 +69,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader private final FlatVectorsReader flatVectorsReader; private final FieldInfos fieldInfos; - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final IndexInput vectorIndex; public Lucene99HnswVectorsReader(SegmentReadState state, FlatVectorsReader flatVectorsReader) @@ -162,7 +161,7 @@ private void readFields(ChecksumIndexInput meta) throws IOException { } FieldEntry fieldEntry = readField(meta, info); validateFieldEntry(info, fieldEntry); - fields.put(info.name, fieldEntry); + fields.put(info.number, fieldEntry); } } @@ -225,8 +224,7 @@ private FieldEntry readField(IndexInput input, FieldInfo info) throws IOExceptio @Override public long ramBytesUsed() { return Lucene99HnswVectorsReader.SHALLOW_SIZE - + RamUsageEstimator.sizeOfMap( - fields, 
RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class)) + + fields.ramBytesUsed() + flatVectorsReader.ramBytesUsed(); } @@ -246,25 +244,43 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { return flatVectorsReader.getByteVectorValues(field); } + private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + if (fieldEntry.vectorEncoding != expectedEncoding) { + throw new IllegalArgumentException( + "field=\"" + + field + + "\" is encoded as: " + + fieldEntry.vectorEncoding + + " expected: " + + expectedEncoding); + } + return fieldEntry; + } + @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); search( - fields.get(field), + fieldEntry, knnCollector, acceptDocs, - VectorEncoding.FLOAT32, () -> flatVectorsReader.getRandomVectorScorer(field, target)); } @Override public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); search( - fields.get(field), + fieldEntry, knnCollector, acceptDocs, - VectorEncoding.BYTE, () -> flatVectorsReader.getRandomVectorScorer(field, target)); } @@ -272,13 +288,10 @@ private void search( FieldEntry fieldEntry, KnnCollector knnCollector, Bits acceptDocs, - VectorEncoding vectorEncoding, IOSupplier scorerSupplier) throws IOException { - if (fieldEntry.size() == 0 - || knnCollector.k() == 0 - || fieldEntry.vectorEncoding != vectorEncoding) { + if (fieldEntry.size() == 0 || knnCollector.k() == 0) { return; } final RandomVectorScorer scorer = scorerSupplier.get(); @@ -304,12 +317,12 @@ private void search( @Override public HnswGraph getGraph(String field) throws IOException { - FieldInfo info = fieldInfos.fieldInfo(field); - if (info == null) { - throw new IllegalArgumentException("No such field '" + field + "'"); + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry entry; + if (info == null || (entry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); } - FieldEntry entry = fields.get(field); - if (entry != null && entry.vectorIndexLength > 0) { + if (entry.vectorIndexLength > 0) { return getGraph(entry); } else { return HnswGraph.EMPTY; @@ -463,6 +476,7 @@ public void seek(int level, int targetOrd) throws IOException { // unsafe; no bounds checking dataIn.seek(graphLevelNodeOffsets.get(targetIndex + graphLevelNodeIndexOffsets[level])); arcCount = dataIn.readVInt(); + assert arcCount <= currentNeighborsBuffer.length : "too many neighbors: " + arcCount; if (arcCount > 0) { currentNeighborsBuffer[0] = dataIn.readVInt(); for (int i = 1; i < arcCount; i++) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index ce7d88f9dfd9..0f4e8196d52d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene99; +import 
static org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues.hasVectorValues; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS; @@ -31,14 +32,16 @@ import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; +import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; @@ -53,7 +56,6 @@ import org.apache.lucene.util.hnsw.IncrementalHnswGraphMerger; import org.apache.lucene.util.hnsw.NeighborArray; import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.apache.lucene.util.packed.DirectMonotonicWriter; @@ -353,19 +355,23 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE : new TaskExecutor(mergeState.intraMergeTaskExecutor), numMergeWorkers); for (int i = 0; i < mergeState.liveDocs.length; i++) { - merger.addReader( - mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]); + if (hasVectorValues(mergeState.fieldInfos[i], fieldInfo.name)) { + merger.addReader( + mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]); + } } - DocIdSetIterator mergedVectorIterator = null; + KnnVectorValues mergedVectorValues = null; switch (fieldInfo.getVectorEncoding()) { - case BYTE -> mergedVectorIterator = - KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); - case FLOAT32 -> mergedVectorIterator = - KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + case BYTE -> + mergedVectorValues = + KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); + case FLOAT32 -> + mergedVectorValues = + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); } graph = merger.merge( - mergedVectorIterator, + mergedVectorValues, segmentWriteState.infoStream, scorerSupplier.totalVectorCount()); vectorIndexNodeOffsets = writeGraph(graph); @@ -543,20 +549,22 @@ static FieldWriter create( InfoStream infoStream) throws IOException { return switch (fieldInfo.getVectorEncoding()) { - case BYTE -> new FieldWriter<>( - scorer, - (FlatFieldVectorsWriter) flatFieldVectorsWriter, - fieldInfo, - M, - beamWidth, - infoStream); - case FLOAT32 -> new FieldWriter<>( - scorer, - (FlatFieldVectorsWriter) flatFieldVectorsWriter, - fieldInfo, - M, - beamWidth, - infoStream); + case BYTE -> + new FieldWriter<>( + scorer, + (FlatFieldVectorsWriter) flatFieldVectorsWriter, + fieldInfo, + M, + beamWidth, + infoStream); + case FLOAT32 -> + new FieldWriter<>( + scorer, + (FlatFieldVectorsWriter) flatFieldVectorsWriter, + fieldInfo, + M, + beamWidth, + infoStream); }; } @@ -572,16 
+580,18 @@ static FieldWriter create( this.fieldInfo = fieldInfo; RandomVectorScorerSupplier scorerSupplier = switch (fieldInfo.getVectorEncoding()) { - case BYTE -> scorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromBytes( - (List) flatFieldVectorsWriter.getVectors(), - fieldInfo.getVectorDimension())); - case FLOAT32 -> scorer.getRandomVectorScorerSupplier( - fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromFloats( - (List) flatFieldVectorsWriter.getVectors(), - fieldInfo.getVectorDimension())); + case BYTE -> + scorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + ByteVectorValues.fromBytes( + (List) flatFieldVectorsWriter.getVectors(), + fieldInfo.getVectorDimension())); + case FLOAT32 -> + scorer.getRandomVectorScorerSupplier( + fieldInfo.getVectorSimilarityFunction(), + FloatVectorValues.fromFloats( + (List) flatFieldVectorsWriter.getVectors(), + fieldInfo.getVectorDimension())); }; hnswGraphBuilder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed); @@ -612,7 +622,7 @@ public T copyValue(T vectorValue) { throw new UnsupportedOperationException(); } - OnHeapHnswGraph getGraph() { + OnHeapHnswGraph getGraph() throws IOException { assert flatFieldVectorsWriter.isFinished(); if (node > 0) { return hnswGraphBuilder.getCompletedGraph(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java index 96c9358cc907..a4770f01f46d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java @@ -21,12 +21,12 @@ import java.io.IOException; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.VectorUtil; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizer; /** @@ -45,9 +45,9 @@ public Lucene99ScalarQuantizedVectorScorer(FlatVectorsScorer flatVectorsScorer) @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException { - if (vectorValues instanceof RandomAccessQuantizedByteVectorValues quantizedByteVectorValues) { + if (vectorValues instanceof QuantizedByteVectorValues quantizedByteVectorValues) { return new ScalarQuantizedRandomVectorScorerSupplier( quantizedByteVectorValues, similarityFunction); } @@ -57,11 +57,9 @@ public RandomVectorScorerSupplier getRandomVectorScorerSupplier( @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException { - if (vectorValues instanceof 
RandomAccessQuantizedByteVectorValues quantizedByteVectorValues) { + if (vectorValues instanceof QuantizedByteVectorValues quantizedByteVectorValues) { ScalarQuantizer scalarQuantizer = quantizedByteVectorValues.getScalarQuantizer(); byte[] targetBytes = new byte[target.length]; float offsetCorrection = @@ -79,9 +77,7 @@ public RandomVectorScorer getRandomVectorScorer( @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException { return nonQuantizedDelegate.getRandomVectorScorer(similarityFunction, vectorValues, target); } @@ -96,17 +92,23 @@ static RandomVectorScorer fromVectorSimilarity( float offsetCorrection, VectorSimilarityFunction sim, float constMultiplier, - RandomAccessQuantizedByteVectorValues values) { + QuantizedByteVectorValues values) { return switch (sim) { case EUCLIDEAN -> new Euclidean(values, constMultiplier, targetBytes); - case COSINE, DOT_PRODUCT -> dotProductFactory( - targetBytes, offsetCorrection, constMultiplier, values, f -> Math.max((1 + f) / 2, 0)); - case MAXIMUM_INNER_PRODUCT -> dotProductFactory( - targetBytes, - offsetCorrection, - constMultiplier, - values, - VectorUtil::scaleMaxInnerProductScore); + case COSINE, DOT_PRODUCT -> + dotProductFactory( + targetBytes, + offsetCorrection, + constMultiplier, + values, + f -> Math.max((1 + f) / 2, 0)); + case MAXIMUM_INNER_PRODUCT -> + dotProductFactory( + targetBytes, + offsetCorrection, + constMultiplier, + values, + VectorUtil::scaleMaxInnerProductScore); }; } @@ -114,7 +116,7 @@ private static RandomVectorScorer.AbstractRandomVectorScorer dotProductFactory( byte[] targetBytes, float offsetCorrection, float constMultiplier, - RandomAccessQuantizedByteVectorValues values, + QuantizedByteVectorValues values, FloatToFloatFunction scoreAdjustmentFunction) { if (values.getScalarQuantizer().getBits() <= 4) { if (values.getVectorByteLength() != values.dimension() && values.getSlice() != null) { @@ -131,10 +133,9 @@ private static RandomVectorScorer.AbstractRandomVectorScorer dotProductFactory( private static class Euclidean extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; private final byte[] targetBytes; - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; - private Euclidean( - RandomAccessQuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes) { + private Euclidean(QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes) { super(values); this.values = values; this.constMultiplier = constMultiplier; @@ -153,13 +154,13 @@ public float score(int node) throws IOException { /** Calculates dot product on quantized vectors, applying the appropriate corrections */ private static class DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; private final byte[] targetBytes; private final float offsetCorrection; private final FloatToFloatFunction scoreAdjustmentFunction; public DotProduct( - RandomAccessQuantizedByteVectorValues values, + QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes, float offsetCorrection, @@ -187,14 +188,14 @@ public float score(int vectorOrdinal) throws IOException 
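[Editor's note] For readers skimming the `fromVectorSimilarity` hunk above: for COSINE and DOT_PRODUCT the raw dot product `f` (roughly in [-1, 1] for unit-length vectors) is remapped into a non-negative score by the lambda `f -> Math.max((1 + f) / 2, 0)`. A tiny standalone restatement of that adjustment, purely for illustration (the class and method names are not from the patch):

```java
// Sketch of the COSINE/DOT_PRODUCT score adjustment used above:
// shifts a similarity in [-1, 1] into [0, 1] and clamps negative values to 0.
class DotProductScoreSketch {
  static float adjust(float f) {
    return Math.max((1 + f) / 2, 0);
  }
}
```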
{ private static class CompressedInt4DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; private final byte[] compressedVector; private final byte[] targetBytes; private final float offsetCorrection; private final FloatToFloatFunction scoreAdjustmentFunction; private CompressedInt4DotProduct( - RandomAccessQuantizedByteVectorValues values, + QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes, float offsetCorrection, @@ -225,13 +226,13 @@ public float score(int vectorOrdinal) throws IOException { private static class Int4DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; private final byte[] targetBytes; private final float offsetCorrection; private final FloatToFloatFunction scoreAdjustmentFunction; public Int4DotProduct( - RandomAccessQuantizedByteVectorValues values, + QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes, float offsetCorrection, @@ -265,13 +266,12 @@ private static final class ScalarQuantizedRandomVectorScorerSupplier implements RandomVectorScorerSupplier { private final VectorSimilarityFunction vectorSimilarityFunction; - private final RandomAccessQuantizedByteVectorValues values; - private final RandomAccessQuantizedByteVectorValues values1; - private final RandomAccessQuantizedByteVectorValues values2; + private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues values1; + private final QuantizedByteVectorValues values2; public ScalarQuantizedRandomVectorScorerSupplier( - RandomAccessQuantizedByteVectorValues values, - VectorSimilarityFunction vectorSimilarityFunction) + QuantizedByteVectorValues values, VectorSimilarityFunction vectorSimilarityFunction) throws IOException { this.values = values; this.values1 = values.copy(); @@ -295,5 +295,12 @@ public RandomVectorScorer scorer(int ord) throws IOException { public ScalarQuantizedRandomVectorScorerSupplier copy() throws IOException { return new ScalarQuantizedRandomVectorScorerSupplier(values.copy(), vectorSimilarityFunction); } + + @Override + public String toString() { + return "ScalarQuantizedRandomVectorScorerSupplier(vectorSimilarityFunction=" + + vectorSimilarityFunction + + ")"; + } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java index 552260894a8d..0b3c6d19af83 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java @@ -34,8 +34,10 @@ public class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsFormat { // The bits that are allowed for scalar quantization - // We only allow unsigned byte (8), signed byte (7), and half-byte (4) - private static final int ALLOWED_BITS = (1 << 8) | (1 << 7) | (1 << 4); + // We only allow signed byte (7), and half-byte (4) + // NOTE: we used to allow 8 bits as well, but it was broken so we removed it + // (https://github.com/apache/lucene/issues/13519) + private static final int ALLOWED_BITS = (1 << 7) | (1 << 4); public static final String 
QUANTIZED_VECTOR_COMPONENT = "QVEC"; public static final String NAME = "Lucene99ScalarQuantizedVectorsFormat"; @@ -72,7 +74,7 @@ public class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsFormat { /** Constructs a format using default graph construction parameters */ public Lucene99ScalarQuantizedVectorsFormat() { - this(null, 7, true); + this(null, 7, false); } /** @@ -83,9 +85,10 @@ public Lucene99ScalarQuantizedVectorsFormat() { * determined by sampling many confidence intervals and determining the most accurate pair. * @param bits the number of bits to use for scalar quantization (must be between 1 and 8, * inclusive) - * @param compress whether to compress the vectors, if true, the vectors that are quantized with - * lte 4 bits will be compressed into a single byte. If false, the vectors will be stored as - * is. This provides a trade-off of memory usage and speed. + * @param compress whether to compress the quantized vectors by another 50% when bits=4. If + * `true`, pairs of (4 bit quantized) dimensions are packed into a single byte. This must be + * `false` when bits=7. This provides a trade-off of 50% reduction in hot vector memory usage + * during searching, at some decode speed penalty. */ public Lucene99ScalarQuantizedVectorsFormat( Float confidenceInterval, int bits, boolean compress) { @@ -104,7 +107,12 @@ public Lucene99ScalarQuantizedVectorsFormat( + confidenceInterval); } if (bits < 1 || bits > 8 || (ALLOWED_BITS & (1 << bits)) == 0) { - throw new IllegalArgumentException("bits must be one of: 4, 7, 8; bits=" + bits); + throw new IllegalArgumentException("bits must be one of: 4, 7; bits=" + bits); + } + + if (bits > 4 && compress) { + // compress=true otherwise silently does nothing when bits=7? + throw new IllegalArgumentException("compress=true only applies when bits=4"); } this.bits = (byte) bits; this.confidenceInterval = confidenceInterval; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java index c1a5c706549d..712e9b91f9d2 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java @@ -21,8 +21,6 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; @@ -36,6 +34,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.IOContext; @@ -59,15 +58,17 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(Lucene99ScalarQuantizedVectorsReader.class); - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final IndexInput quantizedVectorData; private final FlatVectorsReader rawVectorsReader; + private final FieldInfos fieldInfos; 
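[Editor's note] The reader changes in this patch repeatedly swap name-keyed `HashMap` fields for `IntObjectHashMap` keyed by `FieldInfo.number`, with a `getFieldEntry` helper resolving the field name through `FieldInfos` first. A minimal sketch of that lookup idiom follows; `FieldEntryLookupSketch` and the generic `E` stand in for the reader-specific `FieldEntry` types and are not classes from the patch.

```java
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.internal.hppc.IntObjectHashMap;

// Sketch of the field-number-keyed lookup introduced by these reader hunks.
class FieldEntryLookupSketch {
  static <E> E lookup(FieldInfos fieldInfos, IntObjectHashMap<E> entries, String field) {
    FieldInfo info = fieldInfos.fieldInfo(field);
    E entry;
    if (info == null || (entry = entries.get(info.number)) == null) {
      throw new IllegalArgumentException("field=\"" + field + "\" not found");
    }
    return entry;
  }
}
```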
public Lucene99ScalarQuantizedVectorsReader( SegmentReadState state, FlatVectorsReader rawVectorsReader, FlatVectorsScorer scorer) throws IOException { super(scorer); this.rawVectorsReader = rawVectorsReader; + this.fieldInfos = state.fieldInfos; int versionMeta = -1; String metaFileName = IndexFileNames.segmentFileName( @@ -118,7 +119,7 @@ private void readFields(ChecksumIndexInput meta, int versionMeta, FieldInfos inf } FieldEntry fieldEntry = readField(meta, versionMeta, info); validateFieldEntry(info, fieldEntry); - fields.put(info.name, fieldEntry); + fields.put(info.number, fieldEntry); } } @@ -136,9 +137,10 @@ static void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { final long quantizedVectorBytes; if (fieldEntry.bits <= 4 && fieldEntry.compress) { + // two dimensions -> one byte quantizedVectorBytes = ((dimension + 1) >> 1) + Float.BYTES; } else { - // int8 quantized and calculated stored offset. + // one dimension -> one byte quantizedVectorBytes = dimension + Float.BYTES; } long numQuantizedVectorBytes = Math.multiplyExact(quantizedVectorBytes, fieldEntry.size); @@ -162,12 +164,27 @@ public void checkIntegrity() throws IOException { CodecUtil.checksumEntireFile(quantizedVectorData); } + private FieldEntry getFieldEntry(String field) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { + throw new IllegalArgumentException( + "field=\"" + + field + + "\" is encoded as: " + + fieldEntry.vectorEncoding + + " expected: " + + VectorEncoding.FLOAT32); + } + return fieldEntry; + } + @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { - return null; - } + final FieldEntry fieldEntry = getFieldEntry(field); final FloatVectorValues rawVectorValues = rawVectorsReader.getFloatVectorValues(field); OffHeapQuantizedByteVectorValues quantizedByteVectorValues = OffHeapQuantizedByteVectorValues.load( @@ -231,10 +248,7 @@ private static IndexInput openDataInput( @Override public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException { - FieldEntry fieldEntry = fields.get(field); - if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { - return null; - } + final FieldEntry fieldEntry = getFieldEntry(field); if (fieldEntry.scalarQuantizer == null) { return rawVectorsReader.getRandomVectorScorer(field, target); } @@ -265,12 +279,7 @@ public void close() throws IOException { @Override public long ramBytesUsed() { - long size = SHALLOW_SIZE; - size += - RamUsageEstimator.sizeOfMap( - fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class)); - size += rawVectorsReader.ramBytesUsed(); - return size; + return SHALLOW_SIZE + fields.ramBytesUsed() + rawVectorsReader.ramBytesUsed(); } private FieldEntry readField(IndexInput input, int versionMeta, FieldInfo info) @@ -291,11 +300,8 @@ private FieldEntry readField(IndexInput input, int versionMeta, FieldInfo info) } @Override - public QuantizedByteVectorValues getQuantizedVectorValues(String fieldName) throws IOException { - FieldEntry fieldEntry = fields.get(fieldName); - if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { - return null; - } + 
public QuantizedByteVectorValues getQuantizedVectorValues(String field) throws IOException { + final FieldEntry fieldEntry = getFieldEntry(field); return OffHeapQuantizedByteVectorValues.load( fieldEntry.ordToDoc, fieldEntry.dimension, @@ -310,11 +316,8 @@ public QuantizedByteVectorValues getQuantizedVectorValues(String fieldName) thro } @Override - public ScalarQuantizer getQuantizationState(String fieldName) { - FieldEntry fieldEntry = fields.get(fieldName); - if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { - return null; - } + public ScalarQuantizer getQuantizationState(String field) { + final FieldEntry fieldEntry = getFieldEntry(field); return fieldEntry.scalarQuantizer; } @@ -392,10 +395,10 @@ static FieldEntry create( private static final class QuantizedVectorValues extends FloatVectorValues { private final FloatVectorValues rawVectorValues; - private final OffHeapQuantizedByteVectorValues quantizedVectorValues; + private final QuantizedByteVectorValues quantizedVectorValues; QuantizedVectorValues( - FloatVectorValues rawVectorValues, OffHeapQuantizedByteVectorValues quantizedVectorValues) { + FloatVectorValues rawVectorValues, QuantizedByteVectorValues quantizedVectorValues) { this.rawVectorValues = rawVectorValues; this.quantizedVectorValues = quantizedVectorValues; } @@ -411,34 +414,28 @@ public int size() { } @Override - public float[] vectorValue() throws IOException { - return rawVectorValues.vectorValue(); + public float[] vectorValue(int ord) throws IOException { + return rawVectorValues.vectorValue(ord); } @Override - public int docID() { - return rawVectorValues.docID(); + public int ordToDoc(int ord) { + return rawVectorValues.ordToDoc(ord); } @Override - public int nextDoc() throws IOException { - int rawDocId = rawVectorValues.nextDoc(); - int quantizedDocId = quantizedVectorValues.nextDoc(); - assert rawDocId == quantizedDocId; - return quantizedDocId; + public QuantizedVectorValues copy() throws IOException { + return new QuantizedVectorValues(rawVectorValues.copy(), quantizedVectorValues.copy()); } @Override - public int advance(int target) throws IOException { - int rawDocId = rawVectorValues.advance(target); - int quantizedDocId = quantizedVectorValues.advance(target); - assert rawDocId == quantizedDocId; - return quantizedDocId; + public VectorScorer scorer(float[] query) throws IOException { + return quantizedVectorValues.scorer(query); } @Override - public VectorScorer scorer(float[] query) throws IOException { - return quantizedVectorValues.scorer(query); + public DocIndexIterator iterator() { + return rawVectorValues.iterator(); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index e477fec75e58..1a30b5271cd7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -17,10 +17,9 @@ package org.apache.lucene.codecs.lucene99; +import static org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues.hasVectorValues; import static org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; -import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL; -import static 
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT; -import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultConfidenceInterval; +import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.*; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; @@ -44,6 +43,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; @@ -630,7 +630,7 @@ public static ScalarQuantizer mergeAndRecalculateQuantiles( IntArrayList segmentSizes = new IntArrayList(mergeState.liveDocs.length); for (int i = 0; i < mergeState.liveDocs.length; i++) { FloatVectorValues fvv; - if (mergeState.knnVectorsReaders[i] != null + if (hasVectorValues(mergeState.fieldInfos[i], fieldInfo.name) && (fvv = mergeState.knnVectorsReaders[i].getFloatVectorValues(fieldInfo.name)) != null && fvv.size() > 0) { ScalarQuantizer quantizationState = @@ -652,12 +652,11 @@ public static ScalarQuantizer mergeAndRecalculateQuantiles( || bits <= 4 || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) { int numVectors = 0; - FloatVectorValues vectorValues = - KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + DocIdSetIterator iter = + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState) + .iterator(); // iterate vectorValues and increment numVectors - for (int doc = vectorValues.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = vectorValues.nextDoc()) { + for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { numVectors++; } return buildScalarQuantizer( @@ -729,11 +728,10 @@ public static DocsWithFieldSet writeQuantizedVectorData( ? 
OffHeapQuantizedByteVectorValues.compressedArray( quantizedByteVectorValues.dimension(), bits) : null; - for (int docV = quantizedByteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = quantizedByteVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = quantizedByteVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = quantizedByteVectorValues.vectorValue(); + byte[] binaryValue = quantizedByteVectorValues.vectorValue(iter.index()); assert binaryValue.length == quantizedByteVectorValues.dimension() : "dim=" + quantizedByteVectorValues.dimension() + " len=" + binaryValue.length; if (compressedVector != null) { @@ -742,7 +740,8 @@ public static DocsWithFieldSet writeQuantizedVectorData( } else { output.writeBytes(binaryValue, binaryValue.length); } - output.writeInt(Float.floatToIntBits(quantizedByteVectorValues.getScoreCorrectionConstant())); + output.writeInt( + Float.floatToIntBits(quantizedByteVectorValues.getScoreCorrectionConstant(iter.index()))); docsWithField.add(docV); } return docsWithField; @@ -854,7 +853,6 @@ public DocsWithFieldSet getDocsWithFieldSet() { static class FloatVectorWrapper extends FloatVectorValues { private final List vectorList; - protected int curDoc = -1; FloatVectorWrapper(List vectorList) { this.vectorList = vectorList; @@ -871,51 +869,42 @@ public int size() { } @Override - public float[] vectorValue() throws IOException { - if (curDoc == -1 || curDoc >= vectorList.size()) { - throw new IOException("Current doc not set or too many iterations"); - } - return vectorList.get(curDoc); + public FloatVectorValues copy() throws IOException { + return this; } @Override - public int docID() { - if (curDoc >= vectorList.size()) { - return NO_MORE_DOCS; + public float[] vectorValue(int ord) throws IOException { + if (ord < 0 || ord >= vectorList.size()) { + throw new IOException("vector ord " + ord + " out of bounds"); } - return curDoc; - } - - @Override - public int nextDoc() throws IOException { - curDoc++; - return docID(); + return vectorList.get(ord); } @Override - public int advance(int target) throws IOException { - curDoc = target; - return docID(); - } - - @Override - public VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return createDenseIterator(); } } static class QuantizedByteVectorValueSub extends DocIDMerger.Sub { private final QuantizedByteVectorValues values; + private final KnnVectorValues.DocIndexIterator iterator; QuantizedByteVectorValueSub(MergeState.DocMap docMap, QuantizedByteVectorValues values) { super(docMap); this.values = values; - assert values.docID() == -1; + iterator = values.iterator(); + assert iterator.docID() == -1; } @Override public int nextDoc() throws IOException { - return values.nextDoc(); + return iterator.nextDoc(); + } + + public int index() { + return iterator.index(); } } @@ -928,8 +917,7 @@ public static MergedQuantizedVectorValues mergeQuantizedByteVectorValues( List subs = new ArrayList<>(); for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - if (mergeState.knnVectorsReaders[i] != null - && mergeState.knnVectorsReaders[i].getFloatVectorValues(fieldInfo.name) != null) { + if (hasVectorValues(mergeState.fieldInfos[i], fieldInfo.name)) { QuantizedVectorsReader reader = getQuantizedKnnVectorsReader(mergeState.knnVectorsReaders[i], fieldInfo.name); assert scalarQuantizer != null; @@ -973,7 +961,6 @@ public static 
MergedQuantizedVectorValues mergeQuantizedByteVectorValues( private final DocIDMerger docIdMerger; private final int size; - private int docId; private QuantizedByteVectorValueSub current; private MergedQuantizedVectorValues( @@ -985,33 +972,16 @@ private MergedQuantizedVectorValues( totalSize += sub.values.size(); } size = totalSize; - docId = -1; - } - - @Override - public byte[] vectorValue() throws IOException { - return current.values.vectorValue(); - } - - @Override - public int docID() { - return docId; } @Override - public int nextDoc() throws IOException { - current = docIdMerger.next(); - if (current == null) { - docId = NO_MORE_DOCS; - } else { - docId = current.mappedDocID; - } - return docId; + public byte[] vectorValue(int ord) throws IOException { + return current.values.vectorValue(current.index()); } @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return new CompositeIterator(); } @Override @@ -1025,13 +995,51 @@ public int dimension() { } @Override - public float getScoreCorrectionConstant() throws IOException { - return current.values.getScoreCorrectionConstant(); + public float getScoreCorrectionConstant(int ord) throws IOException { + return current.values.getScoreCorrectionConstant(current.index()); } - @Override - public VectorScorer scorer(float[] target) throws IOException { - throw new UnsupportedOperationException(); + private class CompositeIterator extends DocIndexIterator { + private int docId; + private int ord; + + public CompositeIterator() { + docId = -1; + ord = -1; + } + + @Override + public int index() { + return ord; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + ord = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ++ord; + } + return docId; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return size; + } } } @@ -1039,6 +1047,7 @@ static class QuantizedFloatVectorValues extends QuantizedByteVectorValues { private final FloatVectorValues values; private final ScalarQuantizer quantizer; private final byte[] quantizedVector; + private int lastOrd = -1; private float offsetValue = 0f; private final VectorSimilarityFunction vectorSimilarityFunction; @@ -1054,7 +1063,14 @@ public QuantizedFloatVectorValues( } @Override - public float getScoreCorrectionConstant() { + public float getScoreCorrectionConstant(int ord) { + if (ord != lastOrd) { + throw new IllegalStateException( + "attempt to retrieve score correction for different ord " + + ord + + " than the quantization was done for: " + + lastOrd); + } return offsetValue; } @@ -1069,41 +1085,31 @@ public int size() { } @Override - public byte[] vectorValue() throws IOException { + public byte[] vectorValue(int ord) throws IOException { + if (ord != lastOrd) { + offsetValue = quantize(ord); + lastOrd = ord; + } return quantizedVector; } @Override - public int docID() { - return values.docID(); + public VectorScorer scorer(float[] target) throws IOException { + throw new UnsupportedOperationException(); } - @Override - public int nextDoc() throws IOException { - int doc = values.nextDoc(); - if (doc != NO_MORE_DOCS) { - quantize(); - } - return doc; + private float quantize(int ord) throws IOException { + return quantizer.quantize(values.vectorValue(ord), 
quantizedVector, vectorSimilarityFunction); } @Override - public int advance(int target) throws IOException { - int doc = values.advance(target); - if (doc != NO_MORE_DOCS) { - quantize(); - } - return doc; + public int ordToDoc(int ord) { + return values.ordToDoc(ord); } @Override - public VectorScorer scorer(float[] target) throws IOException { - throw new UnsupportedOperationException(); - } - - private void quantize() throws IOException { - offsetValue = - quantizer.quantize(values.vectorValue(), quantizedVector, vectorSimilarityFunction); + public DocIndexIterator iterator() { + return values.iterator(); } } @@ -1160,9 +1166,9 @@ static final class OffsetCorrectedQuantizedByteVectorValues extends QuantizedByt } @Override - public float getScoreCorrectionConstant() throws IOException { + public float getScoreCorrectionConstant(int ord) throws IOException { return scalarQuantizer.recalculateCorrectiveOffset( - in.vectorValue(), oldScalarQuantizer, vectorSimilarityFunction); + in.vectorValue(ord), oldScalarQuantizer, vectorSimilarityFunction); } @Override @@ -1176,35 +1182,24 @@ public int size() { } @Override - public byte[] vectorValue() throws IOException { - return in.vectorValue(); + public byte[] vectorValue(int ord) throws IOException { + return in.vectorValue(ord); } @Override - public int docID() { - return in.docID(); + public int ordToDoc(int ord) { + return in.ordToDoc(ord); } @Override - public int nextDoc() throws IOException { - return in.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return in.advance(target); - } - - @Override - public VectorScorer scorer(float[] target) throws IOException { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return in.iterator(); } } static final class NormalizedFloatVectorValues extends FloatVectorValues { private final FloatVectorValues values; private final float[] normalizedVector; - int curDoc = -1; public NormalizedFloatVectorValues(FloatVectorValues values) { this.values = values; @@ -1222,38 +1217,25 @@ public int size() { } @Override - public float[] vectorValue() throws IOException { - return normalizedVector; - } - - @Override - public VectorScorer scorer(float[] query) throws IOException { - throw new UnsupportedOperationException(); + public int ordToDoc(int ord) { + return values.ordToDoc(ord); } @Override - public int docID() { - return values.docID(); + public float[] vectorValue(int ord) throws IOException { + System.arraycopy(values.vectorValue(ord), 0, normalizedVector, 0, normalizedVector.length); + VectorUtil.l2normalize(normalizedVector); + return normalizedVector; } @Override - public int nextDoc() throws IOException { - curDoc = values.nextDoc(); - if (curDoc != NO_MORE_DOCS) { - System.arraycopy(values.vectorValue(), 0, normalizedVector, 0, normalizedVector.length); - VectorUtil.l2normalize(normalizedVector); - } - return curDoc; + public DocIndexIterator iterator() { + return values.iterator(); } @Override - public int advance(int target) throws IOException { - curDoc = values.advance(target); - if (curDoc != NO_MORE_DOCS) { - System.arraycopy(values.vectorValue(), 0, normalizedVector, 0, normalizedVector.length); - VectorUtil.l2normalize(normalizedVector); - } - return curDoc; + public NormalizedFloatVectorValues copy() throws IOException { + return new NormalizedFloatVectorValues(values.copy()); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java index 655dcca11667..051c926a679e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java @@ -30,15 +30,13 @@ import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.packed.DirectMonotonicReader; import org.apache.lucene.util.quantization.QuantizedByteVectorValues; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizer; /** * Read the quantized vector values and their score correction values from the index input. This * supports both iterated and random access. */ -public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVectorValues - implements RandomAccessQuantizedByteVectorValues { +public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVectorValues { protected final int dimension; protected final int size; @@ -141,11 +139,6 @@ public byte[] vectorValue(int targetOrd) throws IOException { return binaryValue; } - @Override - public float getScoreCorrectionConstant() { - return scoreCorrectionConstant[0]; - } - @Override public float getScoreCorrectionConstant(int targetOrd) throws IOException { if (lastOrd == targetOrd) { @@ -213,8 +206,6 @@ public static OffHeapQuantizedByteVectorValues load( */ public static class DenseOffHeapVectorValues extends OffHeapQuantizedByteVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -226,30 +217,6 @@ public DenseOffHeapVectorValues( super(dimension, size, scalarQuantizer, similarityFunction, vectorsScorer, compress, slice); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( @@ -270,20 +237,26 @@ public Bits getAcceptOrds(Bits acceptDocs) { @Override public VectorScorer scorer(float[] target) throws IOException { DenseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer vectorScorer = vectorsScorer.getRandomVectorScorer(similarityFunction, copy, target); return new VectorScorer() { @Override public float score() throws IOException { - return vectorScorer.score(copy.doc); + return vectorScorer.score(iterator.index()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } } private static class SparseOffHeapVectorValues extends OffHeapQuantizedByteVectorValues { @@ -312,24 +285,8 @@ public SparseOffHeapVectorValues( } @Override - public byte[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return 
disi.advance(target); + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); } @Override @@ -372,17 +329,18 @@ public int length() { @Override public VectorScorer scorer(float[] target) throws IOException { SparseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer vectorScorer = vectorsScorer.getRandomVectorScorer(similarityFunction, copy, target); return new VectorScorer() { @Override public float score() throws IOException { - return vectorScorer.score(copy.disi.index()); + return vectorScorer.score(iterator.index()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -404,8 +362,6 @@ public EmptyOffHeapVectorValues( null); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -417,23 +373,8 @@ public int size() { } @Override - public byte[] vectorValue() { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) { - return doc = NO_MORE_DOCS; + public DocIndexIterator iterator() { + return createDenseIterator(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java index 2496278fe7a7..9350c016f672 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java @@ -38,6 +38,7 @@ import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.util.IOUtils; /** @@ -77,10 +78,7 @@ public final DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IO return new FieldsWriter(state); } - static class ConsumerAndSuffix implements Closeable { - DocValuesConsumer consumer; - int suffix; - + record ConsumerAndSuffix(DocValuesConsumer consumer, int suffix) implements Closeable { @Override public void close() throws IOException { consumer.close(); @@ -221,10 +219,10 @@ private DocValuesConsumer getInstance(FieldInfo field, boolean ignoreCurrentForm final String segmentSuffix = getFullSegmentSuffix( segmentWriteState.segmentSuffix, getSuffix(formatName, Integer.toString(suffix))); - consumer = new ConsumerAndSuffix(); - consumer.consumer = - format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix)); - consumer.suffix = suffix; + consumer = + new ConsumerAndSuffix( + format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix)), + suffix); formats.put(format, consumer); } else { // we've already seen this format, so just grab its suffix @@ -257,9 +255,9 @@ static String getFullSegmentSuffix(String outerSegmentSuffix, String segmentSuff } } - private class FieldsReader extends DocValuesProducer { + private static class FieldsReader extends DocValuesProducer { - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); private final Map formats = new HashMap<>(); // clone for merge @@ -273,10 +271,10 @@ private class FieldsReader extends DocValuesProducer { } // Then rebuild fields: - for (Map.Entry ent : other.fields.entrySet()) { - 
DocValuesProducer producer = oldToNew.get(ent.getValue()); + for (IntObjectHashMap.IntObjectCursor ent : other.fields) { + DocValuesProducer producer = oldToNew.get(ent.value); assert producer != null; - fields.put(ent.getKey(), producer); + fields.put(ent.key, producer); } } @@ -305,7 +303,7 @@ public FieldsReader(final SegmentReadState readState) throws IOException { segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix))); } - fields.put(fieldName, formats.get(segmentSuffix)); + fields.put(fi.number, formats.get(segmentSuffix)); } } } @@ -319,37 +317,37 @@ public FieldsReader(final SegmentReadState readState) throws IOException { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.name); + DocValuesProducer producer = fields.get(field.number); return producer == null ? null : producer.getNumeric(field); } @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.name); + DocValuesProducer producer = fields.get(field.number); return producer == null ? null : producer.getBinary(field); } @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.name); + DocValuesProducer producer = fields.get(field.number); return producer == null ? null : producer.getSorted(field); } @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.name); + DocValuesProducer producer = fields.get(field.number); return producer == null ? null : producer.getSortedNumeric(field); } @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.name); + DocValuesProducer producer = fields.get(field.number); return producer == null ? null : producer.getSortedSet(field); } @Override public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.name); + DocValuesProducer producer = fields.get(field.number); return producer == null ? 
null : producer.getSkipper(field); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldKnnVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldKnnVectorsFormat.java index e665528652ca..63bad6d48dad 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldKnnVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldKnnVectorsFormat.java @@ -19,7 +19,9 @@ import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.ServiceLoader; import org.apache.lucene.codecs.KnnFieldVectorsWriter; @@ -28,11 +30,14 @@ import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; +import org.apache.lucene.internal.hppc.IntObjectHashMap; +import org.apache.lucene.internal.hppc.ObjectCursor; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; @@ -186,7 +191,8 @@ public long ramBytesUsed() { /** VectorReader that can wrap multiple delegate readers, selected by field. */ public static class FieldsReader extends KnnVectorsReader { - private final Map fields = new HashMap<>(); + private final IntObjectHashMap fields = new IntObjectHashMap<>(); + private final FieldInfos fieldInfos; /** * Create a FieldsReader over a segment, opening VectorReaders for each KnnVectorsFormat @@ -196,7 +202,7 @@ public static class FieldsReader extends KnnVectorsReader { * @throws IOException if one of the delegate readers throws */ public FieldsReader(final SegmentReadState readState) throws IOException { - + this.fieldInfos = readState.fieldInfos; // Init each unique format: boolean success = false; Map formats = new HashMap<>(); @@ -221,7 +227,7 @@ public FieldsReader(final SegmentReadState readState) throws IOException { segmentSuffix, format.fieldsReader(new SegmentReadState(readState, segmentSuffix))); } - fields.put(fieldName, formats.get(segmentSuffix)); + fields.put(fi.number, formats.get(segmentSuffix)); } } } @@ -239,51 +245,69 @@ public FieldsReader(final SegmentReadState readState) throws IOException { * @param field the name of a numeric vector field */ public KnnVectorsReader getFieldReader(String field) { - return fields.get(field); + final FieldInfo info = fieldInfos.fieldInfo(field); + if (info == null) { + return null; + } + return fields.get(info.number); } @Override public void checkIntegrity() throws IOException { - for (KnnVectorsReader reader : fields.values()) { - reader.checkIntegrity(); + for (ObjectCursor cursor : fields.values()) { + cursor.value.checkIntegrity(); } } @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { - KnnVectorsReader knnVectorsReader = fields.get(field); - if (knnVectorsReader == null) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final KnnVectorsReader reader; + if (info == null || (reader = fields.get(info.number)) == null) { return null; - } else { - return knnVectorsReader.getFloatVectorValues(field); } + return reader.getFloatVectorValues(field); } @Override public ByteVectorValues 
getByteVectorValues(String field) throws IOException { - KnnVectorsReader knnVectorsReader = fields.get(field); - if (knnVectorsReader == null) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final KnnVectorsReader reader; + if (info == null || (reader = fields.get(info.number)) == null) { return null; - } else { - return knnVectorsReader.getByteVectorValues(field); } + return reader.getByteVectorValues(field); } @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - fields.get(field).search(field, target, knnCollector, acceptDocs); + final FieldInfo info = fieldInfos.fieldInfo(field); + final KnnVectorsReader reader; + if (info == null || (reader = fields.get(info.number)) == null) { + return; + } + reader.search(field, target, knnCollector, acceptDocs); } @Override public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - fields.get(field).search(field, target, knnCollector, acceptDocs); + final FieldInfo info = fieldInfos.fieldInfo(field); + final KnnVectorsReader reader; + if (info == null || (reader = fields.get(info.number)) == null) { + return; + } + reader.search(field, target, knnCollector, acceptDocs); } @Override public void close() throws IOException { - IOUtils.close(fields.values()); + List readers = new ArrayList<>(fields.size()); + for (ObjectCursor cursor : fields.values()) { + readers.add(cursor.value); + } + IOUtils.close(readers); } } @@ -299,14 +323,7 @@ static String getFullSegmentSuffix(String outerSegmentSuffix, String segmentSuff } } - private static class WriterAndSuffix implements Closeable { - final KnnVectorsWriter writer; - final int suffix; - - WriterAndSuffix(KnnVectorsWriter writer, int suffix) { - this.writer = writer; - this.suffix = suffix; - } + private record WriterAndSuffix(KnnVectorsWriter writer, int suffix) implements Closeable { @Override public void close() throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java index 08c0a465859f..15e65091aed0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java @@ -107,7 +107,7 @@ private static class FilterFieldInfos extends FieldInfos { for (FieldInfo fi : src) { if (this.filteredNames.contains(fi.name)) { this.filtered.add(fi); - hasVectors |= fi.hasVectors(); + hasVectors |= fi.hasTermVectors(); hasPostings |= fi.getIndexOptions() != IndexOptions.NONE; hasProx |= fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; hasFreq |= fi.getIndexOptions() != IndexOptions.DOCS; @@ -172,7 +172,7 @@ public boolean hasOffsets() { } @Override - public boolean hasVectors() { + public boolean hasTermVectors() { return filteredHasVectors; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java index 61749a09284e..182bed5797cb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java @@ -79,22 +79,13 @@ protected PerFieldPostingsFormat() { super(PER_FIELD_NAME); } - /** Group of fields written by one PostingsFormat */ - static class FieldsGroup { - final 
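Editor's note: the per-field formats above swap a name-keyed `HashMap` for the internal hppc `IntObjectHashMap` keyed by `FieldInfo.number`, which avoids `String` hashing on every lookup but means callers first resolve the name through `FieldInfos`. A minimal sketch of the cursor-based map API as it is used in this patch (the keys and values below are illustrative, not real codec names):

```java
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.internal.hppc.ObjectCursor;

public class IntKeyedMapExample {
  public static void main(String[] args) {
    IntObjectHashMap<String> byFieldNumber = new IntObjectHashMap<>();
    byFieldNumber.put(0, "delegate-for-field-0"); // field number -> per-field delegate
    byFieldNumber.put(3, "delegate-for-field-3");

    // Lookups are by primitive int: no boxing, no String.hashCode().
    System.out.println(byFieldNumber.get(3));

    // Iteration goes through cursors rather than a Collection view,
    // which is why FieldsReader#close first copies values into a List.
    for (ObjectCursor<String> cursor : byFieldNumber.values()) {
      System.out.println(cursor.value);
    }
  }
}
```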
List fields; - final int suffix; - - /** - * Custom SegmentWriteState for this group of fields, with the segmentSuffix uniqueified for - * this PostingsFormat - */ - final SegmentWriteState state; - - private FieldsGroup(List fields, int suffix, SegmentWriteState state) { - this.fields = fields; - this.suffix = suffix; - this.state = state; - } + /** + * Group of fields written by one PostingsFormat + * + * @param state Custom SegmentWriteState for this group of fields, with the segmentSuffix + * uniqueified for this PostingsFormat + */ + record FieldsGroup(List fields, int suffix, SegmentWriteState state) { static class Builder { final Set fields; diff --git a/lucene/core/src/java/org/apache/lucene/document/BinaryPoint.java b/lucene/core/src/java/org/apache/lucene/document/BinaryPoint.java index cede07c4739e..b329c4bfb457 100644 --- a/lucene/core/src/java/org/apache/lucene/document/BinaryPoint.java +++ b/lucene/core/src/java/org/apache/lucene/document/BinaryPoint.java @@ -17,7 +17,6 @@ package org.apache.lucene.document; import java.util.Arrays; -import java.util.Comparator; import org.apache.lucene.index.IndexableFieldType; import org.apache.lucene.index.PointValues; import org.apache.lucene.search.MatchNoDocsQuery; @@ -238,14 +237,7 @@ public static Query newSetQuery(String field, byte[]... values) { // Don't unexpectedly change the user's incoming values array: byte[][] sortedValues = values.clone(); - Arrays.sort( - sortedValues, - new Comparator() { - @Override - public int compare(byte[] a, byte[] b) { - return Arrays.compareUnsigned(a, 0, a.length, b, 0, b.length); - } - }); + Arrays.sort(sortedValues, (a, b) -> Arrays.compareUnsigned(a, 0, a.length, b, 0, b.length)); final BytesRef encoded = new BytesRef(new byte[bytesPerDim]); diff --git a/lucene/core/src/java/org/apache/lucene/document/DateTools.java b/lucene/core/src/java/org/apache/lucene/document/DateTools.java index b38601850d8d..cf539e048057 100644 --- a/lucene/core/src/java/org/apache/lucene/document/DateTools.java +++ b/lucene/core/src/java/org/apache/lucene/document/DateTools.java @@ -148,7 +148,7 @@ public static long round(long time, Resolution resolution) { calInstance.setTimeInMillis(time); switch (resolution) { - // NOTE: switch statement fall-through is deliberate + // NOTE: switch statement fall-through is deliberate case YEAR: calInstance.set(Calendar.MONTH, 0); case MONTH: diff --git a/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java b/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java index 8dbb21d8c199..4413b921cd08 100644 --- a/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java +++ b/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java @@ -66,7 +66,7 @@ public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException { @Override public void stringField(FieldInfo fieldInfo, String value) throws IOException { final FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setStoreTermVectors(fieldInfo.hasVectors()); + ft.setStoreTermVectors(fieldInfo.hasTermVectors()); ft.setOmitNorms(fieldInfo.omitsNorms()); ft.setIndexOptions(fieldInfo.getIndexOptions()); doc.add( diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java index ad78b375f884..bedd95cf8a52 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java +++ 
b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java @@ -475,7 +475,7 @@ Explanation explain(String field, String feature, float weight, int freq) { Explanation.match( pivot, "k, pivot feature value that would give a score contribution equal to w/2"), Explanation.match( - pivot, + a, "a, exponent, higher values make the function grow slower before k and faster after k"), Explanation.match(featureValue, "S, feature value")); } diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java index ce530815345b..255895705ca7 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java @@ -27,7 +27,6 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; @@ -120,7 +119,6 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio @Override public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { - final Weight thisWeight = this; Terms terms = Terms.getTerms(context.reader(), fieldName); TermsEnum termsEnum = terms.iterator(); if (termsEnum.seekExact(new BytesRef(featureName)) == false) { @@ -135,10 +133,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti @Override public Scorer get(long leadCost) throws IOException { final SimScorer scorer = function.scorer(boost); - final LeafSimScorer simScorer = - new LeafSimScorer(scorer, context.reader(), fieldName, false); final ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS); - return new TermScorer(thisWeight, impacts, simScorer, topLevelScoringClause); + return new TermScorer(impacts, scorer, null, topLevelScoringClause); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/document/FieldType.java b/lucene/core/src/java/org/apache/lucene/document/FieldType.java index db4b37f6711c..632c3b24e0dd 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FieldType.java +++ b/lucene/core/src/java/org/apache/lucene/document/FieldType.java @@ -20,6 +20,7 @@ import java.util.Map; import java.util.Objects; import org.apache.lucene.analysis.Analyzer; // javadocs +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriterConfig; @@ -41,7 +42,7 @@ public class FieldType implements IndexableFieldType { private IndexOptions indexOptions = IndexOptions.NONE; private boolean frozen; private DocValuesType docValuesType = DocValuesType.NONE; - private boolean docValuesSkipIndex; + private DocValuesSkipIndexType docValuesSkipIndex = DocValuesSkipIndexType.NONE; private int dimensionCount; private int indexDimensionCount; private int dimensionNumBytes; @@ -61,7 +62,7 @@ public FieldType(IndexableFieldType ref) { this.omitNorms = ref.omitNorms(); this.indexOptions = ref.indexOptions(); this.docValuesType = ref.docValuesType(); - this.docValuesSkipIndex = ref.hasDocValuesSkipIndex(); + this.docValuesSkipIndex = ref.docValuesSkipIndexType(); this.dimensionCount = ref.pointDimensionCount(); this.indexDimensionCount = ref.pointIndexDimensionCount(); this.dimensionNumBytes = ref.pointNumBytes(); @@ 
-508,7 +509,7 @@ public void setDocValuesType(DocValuesType type) { } @Override - public boolean hasDocValuesSkipIndex() { + public DocValuesSkipIndexType docValuesSkipIndexType() { return docValuesSkipIndex; } @@ -518,7 +519,7 @@ public boolean hasDocValuesSkipIndex() { * correlate with fields that are part of the index sort, so that values can be expected to be * clustered in the doc ID space. */ - public void setDocValuesSkipIndex(boolean docValuesSkipIndex) { + public void setDocValuesSkipIndexType(DocValuesSkipIndexType docValuesSkipIndex) { checkIfFrozen(); this.docValuesSkipIndex = docValuesSkipIndex; } @@ -531,7 +532,7 @@ public int hashCode() { result = prime * result + indexDimensionCount; result = prime * result + dimensionNumBytes; result = prime * result + ((docValuesType == null) ? 0 : docValuesType.hashCode()); - result = prime * result + Boolean.hashCode(docValuesSkipIndex); + result = prime * result + (docValuesSkipIndex == null ? 0 : docValuesSkipIndex.hashCode()); result = prime * result + indexOptions.hashCode(); result = prime * result + (omitNorms ? 1231 : 1237); result = prime * result + (storeTermVectorOffsets ? 1231 : 1237); diff --git a/lucene/core/src/java/org/apache/lucene/document/InetAddressPoint.java b/lucene/core/src/java/org/apache/lucene/document/InetAddressPoint.java index dcfd65a8a087..dec9f71de879 100644 --- a/lucene/core/src/java/org/apache/lucene/document/InetAddressPoint.java +++ b/lucene/core/src/java/org/apache/lucene/document/InetAddressPoint.java @@ -19,7 +19,6 @@ import java.net.InetAddress; import java.net.UnknownHostException; import java.util.Arrays; -import java.util.Comparator; import org.apache.lucene.index.PointValues; import org.apache.lucene.search.PointInSetQuery; import org.apache.lucene.search.PointRangeQuery; @@ -288,14 +287,7 @@ public static Query newSetQuery(String field, InetAddress... values) { sortedValues[i] = encode(values[i]); } - Arrays.sort( - sortedValues, - new Comparator() { - @Override - public int compare(byte[] a, byte[] b) { - return Arrays.compareUnsigned(a, 0, BYTES, b, 0, BYTES); - } - }); + Arrays.sort(sortedValues, (a, b) -> Arrays.compareUnsigned(a, 0, BYTES, b, 0, BYTES)); final BytesRef encoded = new BytesRef(new byte[BYTES]); diff --git a/lucene/core/src/java/org/apache/lucene/document/NearestNeighbor.java b/lucene/core/src/java/org/apache/lucene/document/NearestNeighbor.java index 12f4afffd0d5..d1431e590752 100644 --- a/lucene/core/src/java/org/apache/lucene/document/NearestNeighbor.java +++ b/lucene/core/src/java/org/apache/lucene/document/NearestNeighbor.java @@ -20,7 +20,6 @@ import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude; import java.io.IOException; -import java.util.Comparator; import java.util.List; import java.util.PriorityQueue; import org.apache.lucene.geo.Rectangle; @@ -35,30 +34,18 @@ /** KNN search on top of 2D lat/lon indexed points. */ class NearestNeighbor { - static class Cell implements Comparable { - final int readerIndex; - final byte[] minPacked; - final byte[] maxPacked; - final PointTree index; - - /** - * The closest distance from a point in this cell to the query point, computed as a sort key - * through {@link SloppyMath#haversinSortKey}. Note that this is an approximation to the closest - * distance, and there could be a point in the cell that is closer. 
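Editor's note: the `FieldType` change above replaces the boolean `setDocValuesSkipIndex(boolean)` with `setDocValuesSkipIndexType(DocValuesSkipIndexType)`. A minimal migration sketch, assuming a hand-built field type (field configuration is illustrative):

```java
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesSkipIndexType;
import org.apache.lucene.index.DocValuesType;

public class SkipIndexFieldTypeExample {
  public static void main(String[] args) {
    FieldType type = new FieldType();
    type.setDocValuesType(DocValuesType.SORTED_NUMERIC);
    // Before this change: type.setDocValuesSkipIndex(true);
    // After: request a min/max range skip index explicitly.
    type.setDocValuesSkipIndexType(DocValuesSkipIndexType.RANGE);
    type.freeze();
    System.out.println(type.docValuesSkipIndexType()); // RANGE
  }
}
```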
- */ - final double distanceSortKey; - - public Cell( - PointTree index, - int readerIndex, - byte[] minPacked, - byte[] maxPacked, - double distanceSortKey) { - this.index = index; - this.readerIndex = readerIndex; - this.minPacked = minPacked.clone(); - this.maxPacked = maxPacked.clone(); - this.distanceSortKey = distanceSortKey; + /** + * @param distanceSortKey The closest distance from a point in this cell to the query point, + * computed as a sort key through {@link SloppyMath#haversinSortKey}. Note that this is an + * approximation to the closest distance, and there could be a point in the cell that is + * closer. + */ + record Cell( + PointTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceSortKey) + implements Comparable { + Cell { + minPacked = minPacked.clone(); + maxPacked = maxPacked.clone(); } @Override @@ -252,18 +239,15 @@ public static NearestHit[] nearest( final PriorityQueue hitQueue = new PriorityQueue<>( n, - new Comparator() { - @Override - public int compare(NearestHit a, NearestHit b) { - // sort by opposite distanceSortKey natural order - int cmp = Double.compare(a.distanceSortKey, b.distanceSortKey); - if (cmp != 0) { - return -cmp; - } - - // tie-break by higher docID: - return b.docID - a.docID; + (a, b) -> { + // sort by opposite distanceSortKey natural order + int cmp = Double.compare(a.distanceSortKey, b.distanceSortKey); + if (cmp != 0) { + return -cmp; } + + // tie-break by higher docID: + return b.docID - a.docID; }); // Holds all cells, sorted by closest to the point: diff --git a/lucene/core/src/java/org/apache/lucene/document/NumericDocValuesField.java b/lucene/core/src/java/org/apache/lucene/document/NumericDocValuesField.java index 95ed6eb07115..6f1fb78e70c7 100644 --- a/lucene/core/src/java/org/apache/lucene/document/NumericDocValuesField.java +++ b/lucene/core/src/java/org/apache/lucene/document/NumericDocValuesField.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.document; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.Query; @@ -42,13 +43,13 @@ public class NumericDocValuesField extends Field { TYPE.freeze(); INDEXED_TYPE = new FieldType(TYPE); - INDEXED_TYPE.setDocValuesSkipIndex(true); + INDEXED_TYPE.setDocValuesSkipIndexType(DocValuesSkipIndexType.RANGE); INDEXED_TYPE.freeze(); } /** * Creates a new {@link NumericDocValuesField} with the specified 64-bit long value that also - * creates a {@link FieldType#hasDocValuesSkipIndex() skip index}. + * creates a {@link FieldType#docValuesSkipIndexType() skip index}. 
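Editor's note: the `Cell` class above becomes a record but keeps its defensive copies by cloning the packed byte arrays in the record's compact constructor. A standalone sketch of that pattern (illustrative names, not Lucene API):

```java
import java.util.Arrays;

/** Illustrative record: the compact constructor normalizes components before they are stored. */
record PackedRange(byte[] minPacked, byte[] maxPacked, double sortKey)
    implements Comparable<PackedRange> {
  PackedRange {
    // Clone incoming arrays so later mutation by the caller cannot change this record.
    minPacked = minPacked.clone();
    maxPacked = maxPacked.clone();
  }

  @Override
  public int compareTo(PackedRange other) {
    return Double.compare(sortKey, other.sortKey);
  }

  public static void main(String[] args) {
    byte[] min = {1, 2};
    PackedRange r = new PackedRange(min, new byte[] {3, 4}, 0.5);
    min[0] = 9; // does not affect the record's private copy
    System.out.println(Arrays.toString(r.minPacked())); // [1, 2]
  }
}
```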
* * @param name field name * @param value 64-bit long value diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedDocValuesField.java b/lucene/core/src/java/org/apache/lucene/document/SortedDocValuesField.java index 2ed6956b7170..746f65ae5647 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedDocValuesField.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedDocValuesField.java @@ -17,6 +17,7 @@ package org.apache.lucene.document; import java.util.Collection; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.MultiTermQuery; @@ -48,13 +49,13 @@ public class SortedDocValuesField extends Field { TYPE.freeze(); INDEXED_TYPE = new FieldType(TYPE); - INDEXED_TYPE.setDocValuesSkipIndex(true); + INDEXED_TYPE.setDocValuesSkipIndexType(DocValuesSkipIndexType.RANGE); INDEXED_TYPE.freeze(); } /** * Creates a new {@link SortedDocValuesField} with the specified 64-bit long value that also - * creates a {@link FieldType#hasDocValuesSkipIndex() skip index}. + * creates a {@link FieldType#docValuesSkipIndexType() skip index}. * * @param name field name * @param bytes binary content diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesField.java b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesField.java index 2d635462a226..0efb6f463939 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesField.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesField.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.document; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.Query; @@ -50,13 +51,13 @@ public class SortedNumericDocValuesField extends Field { TYPE.freeze(); INDEXED_TYPE = new FieldType(TYPE); - INDEXED_TYPE.setDocValuesSkipIndex(true); + INDEXED_TYPE.setDocValuesSkipIndexType(DocValuesSkipIndexType.RANGE); INDEXED_TYPE.freeze(); } /** * Creates a new {@link SortedNumericDocValuesField} with the specified 64-bit long value that - * also creates a {@link FieldType#hasDocValuesSkipIndex() skip index}. + * also creates a {@link FieldType#docValuesSkipIndexType() skip index}. 
* * @param name field name * @param value 64-bit long value diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java index 31ec0319db89..3d339b91a0e0 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java @@ -18,13 +18,17 @@ import java.io.IOException; import java.util.Objects; +import java.util.function.LongPredicate; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DocValuesRangeIterator; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchNoDocsQuery; @@ -32,6 +36,7 @@ import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.Sort; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; @@ -116,12 +121,28 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti if (skipper.minValue() > upperValue || skipper.maxValue() < lowerValue) { return null; } + if (skipper.docCount() == context.reader().maxDoc() + && skipper.minValue() >= lowerValue + && skipper.maxValue() <= upperValue) { + final var scorer = + new ConstantScoreScorer( + score(), scoreMode, DocIdSetIterator.all(skipper.docCount())); + return new DefaultScorerSupplier(scorer); + } } SortedNumericDocValues values = DocValues.getSortedNumeric(context.reader(), field); final NumericDocValues singleton = DocValues.unwrapSingleton(values); TwoPhaseIterator iterator; if (singleton != null) { + if (skipper != null) { + final DocIdSetIterator psIterator = + getDocIdSetIteratorOrNullForPrimarySort(context.reader(), singleton, skipper); + if (psIterator != null) { + return new DefaultScorerSupplier( + new ConstantScoreScorer(score(), scoreMode, psIterator)); + } + } iterator = new TwoPhaseIterator(singleton) { @Override @@ -159,11 +180,72 @@ public float matchCost() { }; } if (skipper != null) { - iterator = new DocValuesRangeIterator(iterator, skipper, lowerValue, upperValue); + iterator = new DocValuesRangeIterator(iterator, skipper, lowerValue, upperValue, false); } final var scorer = new ConstantScoreScorer(score(), scoreMode, iterator); return new DefaultScorerSupplier(scorer); } }; } + + private DocIdSetIterator getDocIdSetIteratorOrNullForPrimarySort( + LeafReader reader, NumericDocValues numericDocValues, DocValuesSkipper skipper) + throws IOException { + if (skipper.docCount() != reader.maxDoc()) { + return null; + } + final Sort indexSort = reader.getMetaData().sort(); + if (indexSort == null + || indexSort.getSort().length == 0 + || indexSort.getSort()[0].getField().equals(field) == false) { + return null; + } + + final int minDocID; + final int maxDocID; + if (indexSort.getSort()[0].getReverse()) { + if (skipper.maxValue() <= upperValue) { + minDocID = 0; + } else { + 
skipper.advance(Long.MIN_VALUE, upperValue); + minDocID = nextDoc(skipper.minDocID(0), numericDocValues, l -> l <= upperValue); + } + if (skipper.minValue() >= lowerValue) { + maxDocID = skipper.docCount(); + } else { + skipper.advance(Long.MIN_VALUE, lowerValue); + maxDocID = nextDoc(skipper.minDocID(0), numericDocValues, l -> l < lowerValue); + } + } else { + if (skipper.minValue() >= lowerValue) { + minDocID = 0; + } else { + skipper.advance(lowerValue, Long.MAX_VALUE); + minDocID = nextDoc(skipper.minDocID(0), numericDocValues, l -> l >= lowerValue); + } + if (skipper.maxValue() <= upperValue) { + maxDocID = skipper.docCount(); + } else { + skipper.advance(upperValue, Long.MAX_VALUE); + maxDocID = nextDoc(skipper.minDocID(0), numericDocValues, l -> l > upperValue); + } + } + return minDocID == maxDocID + ? DocIdSetIterator.empty() + : DocIdSetIterator.range(minDocID, maxDocID); + } + + private static int nextDoc(int startDoc, NumericDocValues docValues, LongPredicate predicate) + throws IOException { + int doc = docValues.docID(); + if (startDoc > doc) { + doc = docValues.advance(startDoc); + } + for (; doc < DocIdSetIterator.NO_MORE_DOCS; doc = docValues.nextDoc()) { + if (predicate.test(docValues.longValue())) { + break; + } + } + return doc; + } } diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesField.java b/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesField.java index 74ae5dc80432..02e5a82d6a6f 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesField.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesField.java @@ -17,6 +17,7 @@ package org.apache.lucene.document; import java.util.Collection; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.MultiTermQuery; @@ -49,13 +50,13 @@ public class SortedSetDocValuesField extends Field { TYPE.freeze(); INDEXED_TYPE = new FieldType(TYPE); - INDEXED_TYPE.setDocValuesSkipIndex(true); + INDEXED_TYPE.setDocValuesSkipIndexType(DocValuesSkipIndexType.RANGE); INDEXED_TYPE.freeze(); } /** * Creates a new {@link SortedSetDocValuesField} with the specified 64-bit long value that also - * creates a {@link FieldType#hasDocValuesSkipIndex() skip index}. + * creates a {@link FieldType#docValuesSkipIndexType() skip index}. 
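Editor's note: the new `getDocIdSetIteratorOrNullForPrimarySort` path only applies when the segment is sorted by the queried field and every document has a value, so the matching documents form one contiguous docID range. A hedged usage sketch of the index-sort setup that makes this possible; note the shortcut additionally requires the field to carry a doc-values skip index, so the plain field used below would still fall back to the regular iterator (field name and values are illustrative):

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class PrimarySortRangeExample {
  public static void main(String[] args) throws Exception {
    Directory dir = new ByteBuffersDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig();
    // Sort the index by "price": a range over it then maps to a contiguous docID range.
    iwc.setIndexSort(new Sort(new SortedNumericSortField("price", SortField.Type.LONG)));
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
      for (long price : new long[] {5, 10, 20, 40}) {
        Document doc = new Document();
        doc.add(new SortedNumericDocValuesField("price", price));
        writer.addDocument(doc);
      }
    }
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      Query range = SortedNumericDocValuesField.newSlowRangeQuery("price", 10, 30);
      System.out.println(searcher.count(range)); // 2 (prices 10 and 20)
    }
  }
}
```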
* * @param name field name * @param bytes binary content diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesRangeQuery.java b/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesRangeQuery.java index d03daac06891..0ad8b1593c9b 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesRangeQuery.java @@ -18,14 +18,17 @@ import java.io.IOException; import java.util.Objects; +import java.util.function.LongPredicate; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DocValuesRangeIterator; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -33,6 +36,7 @@ import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.Sort; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; import org.apache.lucene.util.BytesRef; @@ -150,7 +154,6 @@ public Scorer get(long leadCost) throws IOException { } } - // no terms matched in this segment // no terms matched in this segment if (minOrd > maxOrd || (skipper != null @@ -158,9 +161,26 @@ public Scorer get(long leadCost) throws IOException { return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty()); } + // all terms matched in this segment + if (skipper != null + && skipper.docCount() == context.reader().maxDoc() + && skipper.minValue() >= minOrd + && skipper.maxValue() <= maxOrd) { + return new ConstantScoreScorer( + score(), scoreMode, DocIdSetIterator.all(skipper.docCount())); + } + final SortedDocValues singleton = DocValues.unwrapSingleton(values); TwoPhaseIterator iterator; if (singleton != null) { + if (skipper != null) { + final DocIdSetIterator psIterator = + getDocIdSetIteratorOrNullForPrimarySort( + context.reader(), singleton, skipper, minOrd, maxOrd); + if (psIterator != null) { + return new ConstantScoreScorer(score(), scoreMode, psIterator); + } + } iterator = new TwoPhaseIterator(singleton) { @Override @@ -198,7 +218,7 @@ public float matchCost() { }; } if (skipper != null) { - iterator = new DocValuesRangeIterator(iterator, skipper, minOrd, maxOrd); + iterator = new DocValuesRangeIterator(iterator, skipper, minOrd, maxOrd, false); } return new ConstantScoreScorer(score(), scoreMode, iterator); } @@ -216,4 +236,69 @@ public boolean isCacheable(LeafReaderContext ctx) { } }; } + + private DocIdSetIterator getDocIdSetIteratorOrNullForPrimarySort( + LeafReader reader, + SortedDocValues sortedDocValues, + DocValuesSkipper skipper, + long minOrd, + long maxOrd) + throws IOException { + if (skipper.docCount() != reader.maxDoc()) { + return null; + } + final Sort indexSort = reader.getMetaData().sort(); + if (indexSort == null + || indexSort.getSort().length == 0 + || indexSort.getSort()[0].getField().equals(field) == false) { + return null; + } + + final int minDocID; + final int maxDocID; + if (indexSort.getSort()[0].getReverse()) { + if 
(skipper.maxValue() <= maxOrd) { + minDocID = 0; + } else { + skipper.advance(Long.MIN_VALUE, maxOrd); + minDocID = nextDoc(skipper.minDocID(0), sortedDocValues, l -> l <= maxOrd); + } + if (skipper.minValue() >= minOrd) { + maxDocID = skipper.docCount(); + } else { + skipper.advance(Long.MIN_VALUE, minOrd); + maxDocID = nextDoc(skipper.minDocID(0), sortedDocValues, l -> l < minOrd); + } + } else { + if (skipper.minValue() >= minOrd) { + minDocID = 0; + } else { + skipper.advance(minOrd, Long.MAX_VALUE); + minDocID = nextDoc(skipper.minDocID(0), sortedDocValues, l -> l >= minOrd); + } + if (skipper.maxValue() <= maxOrd) { + maxDocID = skipper.docCount(); + } else { + skipper.advance(maxOrd, Long.MAX_VALUE); + maxDocID = nextDoc(skipper.minDocID(0), sortedDocValues, l -> l > maxOrd); + } + } + return minDocID == maxDocID + ? DocIdSetIterator.empty() + : DocIdSetIterator.range(minDocID, maxDocID); + } + + private static int nextDoc(int startDoc, SortedDocValues docValues, LongPredicate predicate) + throws IOException { + int doc = docValues.docID(); + if (startDoc > doc) { + doc = docValues.advance(startDoc); + } + for (; doc < DocIdSetIterator.NO_MORE_DOCS; doc = docValues.nextDoc()) { + if (predicate.test(docValues.ordValue())) { + break; + } + } + return doc; + } } diff --git a/lucene/core/src/java/org/apache/lucene/geo/GeoEncodingUtils.java b/lucene/core/src/java/org/apache/lucene/geo/GeoEncodingUtils.java index d7ff62ba3bed..818639bee8f9 100644 --- a/lucene/core/src/java/org/apache/lucene/geo/GeoEncodingUtils.java +++ b/lucene/core/src/java/org/apache/lucene/geo/GeoEncodingUtils.java @@ -363,7 +363,7 @@ private DistancePredicate( */ public boolean test(int lat, int lon) { final int lat2 = ((lat - Integer.MIN_VALUE) >>> latShift); - if (lat2 < latBase || lat2 >= latBase + maxLatDelta) { + if (lat2 < latBase || lat2 - latBase >= maxLatDelta) { return false; } int lon2 = ((lon - Integer.MIN_VALUE) >>> lonShift); @@ -411,7 +411,7 @@ private Component2DPredicate( */ public boolean test(int lat, int lon) { final int lat2 = ((lat - Integer.MIN_VALUE) >>> latShift); - if (lat2 < latBase || lat2 >= latBase + maxLatDelta) { + if (lat2 < latBase || lat2 - latBase >= maxLatDelta) { return false; } int lon2 = ((lon - Integer.MIN_VALUE) >>> lonShift); diff --git a/lucene/core/src/java/org/apache/lucene/geo/Tessellator.java b/lucene/core/src/java/org/apache/lucene/geo/Tessellator.java index f1093c3e8b04..a2b8cad84ff9 100644 --- a/lucene/core/src/java/org/apache/lucene/geo/Tessellator.java +++ b/lucene/core/src/java/org/apache/lucene/geo/Tessellator.java @@ -20,7 +20,6 @@ import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitude; import static org.apache.lucene.geo.GeoUtils.lineCrossesLine; import static org.apache.lucene.geo.GeoUtils.lineOverlapLine; -import static org.apache.lucene.geo.GeoUtils.orient; import java.util.ArrayList; import java.util.HashMap; @@ -215,7 +214,7 @@ public static List tessellate( * Creates a circular doubly linked list using polygon points. 
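Editor's note: the GeoEncodingUtils change above rewrites `lat2 >= latBase + maxLatDelta` as `lat2 - latBase >= maxLatDelta` because the addition can overflow `int` when `latBase` is large, making the bounds check reject points that are inside the window. A small self-contained illustration of the hazard (values are synthetic):

```java
public class OverflowCheckExample {
  public static void main(String[] args) {
    int base = Integer.MAX_VALUE - 10;
    int maxDelta = 20;
    int candidate = Integer.MAX_VALUE; // conceptually inside [base, base + maxDelta)

    // Addition overflows: base + maxDelta wraps to a large negative value,
    // so the "out of range" test incorrectly reports true.
    boolean brokenOutOfRange = candidate >= base + maxDelta;

    // Subtracting first keeps the comparison on a small non-negative delta.
    boolean fixedOutOfRange = candidate - base >= maxDelta;

    System.out.println(brokenOutOfRange); // true  (wrong: candidate is inside the window)
    System.out.println(fixedOutOfRange);  // false (correct)
  }
}
```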
The order is governed by the * specified winding order */ - private static final Node createDoublyLinkedList( + private static Node createDoublyLinkedList( final double[] x, final double[] y, final WindingOrder polyWindingOrder, @@ -243,7 +242,7 @@ private static final Node createDoublyLinkedList( return filterPoints(lastNode, null); } - private static final Node eliminateHoles(final XYPolygon polygon, Node outerNode) { + private static Node eliminateHoles(final XYPolygon polygon, Node outerNode) { // Define a list to hole a reference to each filtered hole list. final List holeList = new ArrayList<>(); // keep a reference to the hole @@ -273,8 +272,8 @@ private static final Node eliminateHoles(final XYPolygon polygon, Node outerNode return eliminateHoles(holeList, holeListPolygons, outerNode); } - /** Links every hole into the outer loop, producing a single-ring polygon without holes. * */ - private static final Node eliminateHoles(final Polygon polygon, Node outerNode) { + /** Links every hole into the outer loop, producing a single-ring polygon without holes. */ + private static Node eliminateHoles(final Polygon polygon, Node outerNode) { // Define a list to hole a reference to each filtered hole list. final List holeList = new ArrayList<>(); // keep a reference to the hole @@ -304,7 +303,7 @@ private static final Node eliminateHoles(final Polygon polygon, Node outerNode) return eliminateHoles(holeList, holeListPolygons, outerNode); } - private static final Node eliminateHoles( + private static Node eliminateHoles( List holeList, final Map holeListPolygons, Node outerNode) { // Sort the hole vertices by x coordinate holeList.sort( @@ -350,30 +349,19 @@ private static final Node eliminateHoles( } /** Finds a bridge between vertices that connects a hole with an outer ring, and links it */ - private static final void eliminateHole( + private static void eliminateHole( final Node holeNode, Node outerNode, double holeMinX, double holeMaxX, double holeMinY, double holeMaxY) { - // Attempt to find a common point between the HoleNode and OuterNode. - Node next = outerNode; - do { - if (Rectangle.containsPoint( - next.getY(), next.getX(), holeMinY, holeMaxY, holeMinX, holeMaxX)) { - Node sharedVertex = getSharedVertex(holeNode, next); - if (sharedVertex != null) { - // Split the resulting polygon. - Node node = splitPolygon(next, sharedVertex, true); - // Filter the split nodes. - filterPoints(node, node.next); - return; - } - } - next = next.next; - } while (next != outerNode); + // Attempt to merge the hole using a common point between if it exists. + if (maybeMergeHoleWithSharedVertices( + holeNode, outerNode, holeMinX, holeMaxX, holeMinY, holeMaxY)) { + return; + } // Attempt to find a logical bridge between the HoleNode and OuterNode. outerNode = fetchHoleBridge(holeNode, outerNode); @@ -390,12 +378,112 @@ private static final void eliminateHole( } } + /** + * Choose a common vertex between the polygon and the hole if it exists and return true, otherwise + * return false + */ + private static boolean maybeMergeHoleWithSharedVertices( + final Node holeNode, + Node outerNode, + double holeMinX, + double holeMaxX, + double holeMinY, + double holeMaxY) { + // Attempt to find a common point between the HoleNode and OuterNode. 
+ Node sharedVertex = null; + Node sharedVertexConnection = null; + Node next = outerNode; + do { + if (Rectangle.containsPoint( + next.getY(), next.getX(), holeMinY, holeMaxY, holeMinX, holeMaxX)) { + Node newSharedVertex = getSharedVertex(holeNode, next); + if (newSharedVertex != null) { + if (sharedVertex == null) { + sharedVertex = newSharedVertex; + sharedVertexConnection = next; + } else if (newSharedVertex.equals(sharedVertex)) { + // This can only happen if this vertex has been already used for a bridge. We need to + // choose the right one. + sharedVertexConnection = + getSharedInsideVertex(sharedVertex, sharedVertexConnection, next); + } + } + } + next = next.next; + } while (next != outerNode); + if (sharedVertex != null) { + // Split the resulting polygon. + Node node = splitPolygon(sharedVertexConnection, sharedVertex, true); + // Filter the split nodes. + filterPoints(node, node.next); + return true; + } + return false; + } + + /** Check if the provided vertex is in the polygon and return it */ + private static Node getSharedVertex(final Node polygon, final Node vertex) { + Node next = polygon; + do { + if (isVertexEquals(next, vertex)) { + return next; + } + next = next.next; + } while (next != polygon); + return null; + } + + /** Choose the vertex that has a smaller angle with the hole vertex */ + static Node getSharedInsideVertex(Node holeVertex, Node candidateA, Node candidateB) { + assert isVertexEquals(holeVertex, candidateA) && isVertexEquals(holeVertex, candidateB); + // we are joining candidate.prevNode -> holeVertex.node -> holeVertex.nextNode. + // A negative area means a convex angle. if both are convex/reflex choose the point of + // minimum angle + final double a1 = + area( + candidateA.previous.getX(), + candidateA.previous.getY(), + holeVertex.getX(), + holeVertex.getY(), + holeVertex.next.getX(), + holeVertex.next.getY()); + final double a2 = + area( + candidateB.previous.getX(), + candidateB.previous.getY(), + holeVertex.getX(), + holeVertex.getY(), + holeVertex.next.getX(), + holeVertex.next.getY()); + + if (a1 < 0 != a2 < 0) { + // one is convex, the other reflex, get the convex one + return a1 < a2 ? candidateA : candidateB; + } else { + // both are convex / reflex, choose the smallest angle + final double angle1 = angle(candidateA.previous, candidateA, holeVertex.next); + final double angle2 = angle(candidateB.previous, candidateB, holeVertex.next); + return angle1 < angle2 ? candidateA : candidateB; + } + } + + private static double angle(Node a, Node b, Node c) { + final double ax = a.getX() - b.getX(); + final double ay = a.getY() - b.getY(); + final double cx = c.getX() - b.getX(); + final double cy = c.getY() - b.getY(); + final double dotProduct = ax * cx + ay * cy; + final double aLength = Math.sqrt(ax * ax + ay * ay); + final double bLength = Math.sqrt(cx * cx + cy * cy); + return Math.acos(dotProduct / (aLength * bLength)); + } + /** * David Eberly's algorithm for finding a bridge between a hole and outer polygon * *
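Editor's note: the new `getSharedInsideVertex` tie-break relies on the `angle` helper added above, which is the standard dot-product formula `acos((u · v) / (|u||v|))` evaluated around the shared vertex. A standalone sketch of that computation on plain coordinates (not Tessellator nodes):

```java
public class AngleExample {
  /** Angle at vertex b, in radians, formed by points a-b-c. */
  static double angle(double ax, double ay, double bx, double by, double cx, double cy) {
    final double ux = ax - bx, uy = ay - by; // vector b -> a
    final double vx = cx - bx, vy = cy - by; // vector b -> c
    final double dot = ux * vx + uy * vy;
    final double lenU = Math.sqrt(ux * ux + uy * uy);
    final double lenV = Math.sqrt(vx * vx + vy * vy);
    return Math.acos(dot / (lenU * lenV));
  }

  public static void main(String[] args) {
    // Right angle at the origin between (1, 0) and (0, 1):
    System.out.println(Math.toDegrees(angle(1, 0, 0, 0, 0, 1))); // 90.0
    // The smaller angle is the one the tie-break prefers:
    System.out.println(Math.toDegrees(angle(1, 0, 0, 0, 1, 1))); // 45.0
  }
}
```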

    see: http://www.geometrictools.com/Documentation/TriangulationByEarClipping.pdf */ - private static final Node fetchHoleBridge(final Node holeNode, final Node outerNode) { + private static Node fetchHoleBridge(final Node holeNode, final Node outerNode) { Node p = outerNode; double qx = Double.NEGATIVE_INFINITY; final double hx = holeNode.getX(); @@ -453,34 +541,8 @@ && isLocallyInside(p, holeNode)) { return connection; } - /** Check if the provided vertex is in the polygon and return it * */ - private static Node getSharedVertex(final Node polygon, final Node vertex) { - Node next = polygon; - do { - if (isVertexEquals(next, vertex)) { - // make sure we are not crossing the polygon. This might happen when several holes share the - // same polygon vertex. - boolean crosses = - GeoUtils.lineCrossesLine( - next.previous.getX(), - next.previous.getY(), - vertex.next.getX(), - vertex.next.getY(), - next.next.getX(), - next.next.getY(), - vertex.previous.getX(), - vertex.previous.getY()); - if (crosses == false) { - return next; - } - } - next = next.next; - } while (next != polygon); - return null; - } - /** Finds the left-most hole of a polygon ring. * */ - private static final Node fetchLeftmost(final Node start) { + private static Node fetchLeftmost(final Node start) { Node node = start; Node leftMost = start; do { @@ -502,7 +564,7 @@ private static final Node fetchLeftmost(final Node start) { * Main ear slicing loop which triangulates the vertices of a polygon, provided as a doubly-linked * list. * */ - private static final List earcutLinkedList( + private static List earcutLinkedList( Object polygon, Node currEar, final List tessellation, @@ -587,7 +649,7 @@ private static final List earcutLinkedList( } /** Determines whether a polygon node forms a valid ear with adjacent nodes. * */ - private static final boolean isEar(final Node ear, final boolean mortonOptimized) { + private static boolean isEar(final Node ear, final boolean mortonOptimized) { if (mortonOptimized == true) { return mortonIsEar(ear); } @@ -623,7 +685,7 @@ && area( * Uses morton code for speed to determine whether or a polygon node forms a valid ear w/ adjacent * nodes */ - private static final boolean mortonIsEar(final Node ear) { + private static boolean mortonIsEar(final Node ear) { // triangle bbox (flip the bits so negative encoded values are < positive encoded values) int minTX = StrictMath.min(StrictMath.min(ear.previous.x, ear.x), ear.next.x) ^ 0x80000000; int minTY = StrictMath.min(StrictMath.min(ear.previous.y, ear.y), ear.next.y) ^ 0x80000000; @@ -740,7 +802,7 @@ && area( } /** Iterate through all polygon nodes and remove small local self-intersections * */ - private static final Node cureLocalIntersections( + private static Node cureLocalIntersections( Node startNode, final List tessellation, final boolean mortonOptimized) { Node node = startNode; Node nextNode; @@ -794,7 +856,7 @@ && isIntersectingPolygon(a, a.getX(), a.getY(), b.getX(), b.getY()) == false) { * Attempt to split a polygon and independently triangulate each side. 
Return true if the polygon * was splitted * */ - private static final boolean splitEarcut( + private static boolean splitEarcut( final Object polygon, final Node start, final List tessellation, @@ -858,7 +920,7 @@ private static void checkIntersection(Node a, boolean isMorton) { * Uses morton code for speed to determine whether or not and edge defined by a and b overlaps * with a polygon edge */ - private static final void mortonCheckIntersection(final Node a, final Node b) { + private static void mortonCheckIntersection(final Node a, final Node b) { // edge bbox (flip the bits so negative encoded values are < positive encoded values) int minTX = StrictMath.min(a.x, a.next.x) ^ 0x80000000; int minTY = StrictMath.min(a.y, a.next.y) ^ 0x80000000; @@ -974,7 +1036,7 @@ private static boolean isEdgeFromPolygon(final Node a, final Node b, final boole * Uses morton code for speed to determine whether or not and edge defined by a and b overlaps * with a polygon edge */ - private static final boolean isMortonEdgeFromPolygon(final Node a, final Node b) { + private static boolean isMortonEdgeFromPolygon(final Node a, final Node b) { // edge bbox (flip the bits so negative encoded values are < positive encoded values) final int minTX = StrictMath.min(a.x, b.x) ^ 0x80000000; final int minTY = StrictMath.min(a.y, b.y) ^ 0x80000000; @@ -1060,7 +1122,7 @@ private static boolean isPointInLine( } /** Links two polygon vertices using a bridge. * */ - private static final Node splitPolygon(final Node a, final Node b, boolean edgeFromPolygon) { + private static Node splitPolygon(final Node a, final Node b, boolean edgeFromPolygon) { final Node a2 = new Node(a); final Node b2 = new Node(b); final Node an = a.next; @@ -1136,7 +1198,7 @@ private static double signedArea(final Node start, final Node end) { return windingSum; } - private static final boolean isLocallyInside(final Node a, final Node b) { + private static boolean isLocallyInside(final Node a, final Node b) { double area = area( a.previous.getX(), a.previous.getY(), a.getX(), a.getY(), a.next.getX(), a.next.getY()); @@ -1156,7 +1218,7 @@ && area(a.getX(), a.getY(), a.previous.getX(), a.previous.getY(), b.getX(), b.ge } /** Determine whether the middle point of a polygon diagonal is contained within the polygon */ - private static final boolean middleInsert( + private static boolean middleInsert( final Node start, final double x0, final double y0, final double x1, final double y1) { Node node = start; Node nextNode; @@ -1179,7 +1241,7 @@ private static final boolean middleInsert( } /** Determines if the diagonal of a polygon is intersecting with any polygon elements. * */ - private static final boolean isIntersectingPolygon( + private static boolean isIntersectingPolygon( final Node start, final double x0, final double y0, final double x1, final double y1) { Node node = start; Node nextNode; @@ -1198,7 +1260,7 @@ private static final boolean isIntersectingPolygon( } /** Determines whether two line segments intersect. * */ - public static final boolean linesIntersect( + public static boolean linesIntersect( final double aX0, final double aY0, final double aX1, @@ -1212,7 +1274,7 @@ public static final boolean linesIntersect( } /** Interlinks polygon nodes in Z-Order. 
It reset the values on the z values* */ - private static final void sortByMortonWithReset(Node start) { + private static void sortByMortonWithReset(Node start) { Node next = start; do { next.previousZ = next.previous; @@ -1223,7 +1285,7 @@ private static final void sortByMortonWithReset(Node start) { } /** Interlinks polygon nodes in Z-Order. * */ - private static final void sortByMorton(Node start) { + private static void sortByMorton(Node start) { start.previousZ.nextZ = null; start.previousZ = null; // Sort the generated ring using Z ordering. @@ -1234,7 +1296,7 @@ private static final void sortByMorton(Node start) { * Simon Tatham's doubly-linked list O(n log n) mergesort see: * http://www.chiark.greenend.org.uk/~sgtatham/algorithms/listsort.html */ - private static final void tathamSort(Node list) { + private static void tathamSort(Node list) { Node p, q, e, tail; int i, numMerges, pSize, qSize; int inSize = 1; @@ -1290,7 +1352,7 @@ private static final void tathamSort(Node list) { } /** Eliminate colinear/duplicate points from the doubly linked list */ - private static final Node filterPoints(final Node start, Node end) { + private static Node filterPoints(final Node start, Node end) { if (start == null) { return start; } @@ -1343,7 +1405,7 @@ && area( /** * Creates a node and optionally links it with a previous node in a circular doubly-linked list */ - private static final Node insertNode( + private static Node insertNode( final double[] x, final double[] y, int index, @@ -1370,7 +1432,7 @@ private static final Node insertNode( } /** Removes a node from the doubly linked list */ - private static final void removeNode(Node node, boolean edgeFromPolygon) { + private static void removeNode(Node node, boolean edgeFromPolygon) { node.next.previous = node.previous; node.previous.next = node.next; node.previous.isNextEdgeFromPolygon = edgeFromPolygon; @@ -1384,16 +1446,16 @@ private static final void removeNode(Node node, boolean edgeFromPolygon) { } /** Determines if two point vertices are equal. * */ - private static final boolean isVertexEquals(final Node a, final Node b) { + private static boolean isVertexEquals(final Node a, final Node b) { return isVertexEquals(a, b.getX(), b.getY()); } /** Determines if two point vertices are equal. * */ - private static final boolean isVertexEquals(final Node a, final double x, final double y) { + private static boolean isVertexEquals(final Node a, final double x, final double y) { return a.getX() == x && a.getY() == y; } - /** Compute signed area of triangle */ + /** Compute signed area of triangle, negative means convex angle and positive reflex angle. */ private static double area( final double aX, final double aY, @@ -1419,29 +1481,6 @@ private static boolean pointInEar( && (bx - x) * (cy - y) - (cx - x) * (by - y) >= 0; } - /** compute whether the given x, y point is in a triangle; uses the winding order method */ - public static boolean pointInTriangle( - double x, double y, double ax, double ay, double bx, double by, double cx, double cy) { - double minX = StrictMath.min(ax, StrictMath.min(bx, cx)); - double minY = StrictMath.min(ay, StrictMath.min(by, cy)); - double maxX = StrictMath.max(ax, StrictMath.max(bx, cx)); - double maxY = StrictMath.max(ay, StrictMath.max(by, cy)); - // check the bounding box because if the triangle is degenerated, e.g points and lines, we need - // to filter out - // coplanar points that are not part of the triangle. 
- if (x >= minX && x <= maxX && y >= minY && y <= maxY) { - int a = orient(x, y, ax, ay, bx, by); - int b = orient(x, y, bx, by, cx, cy); - if (a == 0 || b == 0 || a < 0 == b < 0) { - int c = orient(x, y, cx, cy, ax, ay); - return c == 0 || (c < 0 == (b < 0 || a < 0)); - } - return false; - } else { - return false; - } - } - /** * Implementation of this interface will receive calls with internal data at each step of the * triangulation algorithm. This is of use for debugging complex cases, as well as gaining insight @@ -1508,7 +1547,7 @@ private static void notifyMonitor( } /** Circular Doubly-linked list used for polygon coordinates */ - protected static class Node { + static class Node { // node index in the linked list private final int idx; // vertex index in the polygon @@ -1524,9 +1563,9 @@ protected static class Node { private final long morton; // previous node - private Node previous; + Node previous; // next node - private Node next; + Node next; // previous z node private Node previousZ; // next z node @@ -1534,7 +1573,7 @@ protected static class Node { // if the edge from this node to the next node is part of the polygon edges private boolean isNextEdgeFromPolygon; - protected Node( + Node( final double[] x, final double[] y, final int index, @@ -1600,7 +1639,7 @@ public static final class Triangle { Node[] vertex; boolean[] edgeFromPolygon; - protected Triangle( + private Triangle( Node a, boolean isABfromPolygon, Node b, @@ -1636,19 +1675,6 @@ public boolean isEdgefromPolygon(int startVertex) { return edgeFromPolygon[startVertex]; } - /** utility method to compute whether the point is in the triangle */ - protected boolean containsPoint(double lat, double lon) { - return pointInTriangle( - lon, - lat, - vertex[0].getX(), - vertex[0].getY(), - vertex[1].getX(), - vertex[1].getY(), - vertex[2].getX(), - vertex[2].getY()); - } - /** pretty print the triangle vertices */ @Override public String toString() { diff --git a/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java b/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java index 68fc36120a6c..dcda91a4c5d0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java @@ -117,6 +117,15 @@ public final TermVectors termVectors() throws IOException { ensureOpen(); TermVectors[] subVectors = new TermVectors[subReaders.length]; return new TermVectors() { + @Override + public void prefetch(int docID) throws IOException { + final int i = readerIndex(docID); // find subreader num + if (subVectors[i] == null) { + subVectors[i] = subReaders[i].termVectors(); + } + subVectors[i].prefetch(docID - starts[i]); + } + @Override public Fields get(int docID) throws IOException { final int i = readerIndex(docID); // find subreader num diff --git a/lucene/core/src/java/org/apache/lucene/index/BitsSlice.java b/lucene/core/src/java/org/apache/lucene/index/BitsSlice.java index cf09c058140d..ec8ccdaeb6dd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BitsSlice.java +++ b/lucene/core/src/java/org/apache/lucene/index/BitsSlice.java @@ -32,8 +32,8 @@ final class BitsSlice implements Bits { // start is inclusive; end is exclusive (length = end-start) public BitsSlice(Bits parent, ReaderSlice slice) { this.parent = parent; - this.start = slice.start; - this.length = slice.length; + this.start = slice.start(); + this.length = slice.length(); assert length >= 0 : "length=" + length; } diff --git 
a/lucene/core/src/java/org/apache/lucene/index/BufferedUpdatesStream.java b/lucene/core/src/java/org/apache/lucene/index/BufferedUpdatesStream.java index d0ed8df6c099..817b0d057d7e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BufferedUpdatesStream.java +++ b/lucene/core/src/java/org/apache/lucene/index/BufferedUpdatesStream.java @@ -114,19 +114,11 @@ public long ramBytesUsed() { return bytesUsed.get(); } - static class ApplyDeletesResult { - - // True if any actual deletes took place: - final boolean anyDeletes; - - // If non-null, contains segments that are 100% deleted - final List allDeleted; - - ApplyDeletesResult(boolean anyDeletes, List allDeleted) { - this.anyDeletes = anyDeletes; - this.allDeleted = allDeleted; - } - } + /** + * @param anyDeletes True if any actual deletes took place: + * @param allDeleted If non-null, contains segments that are 100% deleted + */ + record ApplyDeletesResult(boolean anyDeletes, List allDeleted) {} /** * Waits for all in-flight packets, which are already being resolved concurrently by indexing diff --git a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java index d33ca1ca3544..e9be3423c181 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java @@ -17,8 +17,8 @@ package org.apache.lucene.index; import java.io.IOException; +import java.util.List; import org.apache.lucene.document.KnnByteVectorField; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; /** @@ -27,34 +27,21 @@ * * @lucene.experimental */ -public abstract class ByteVectorValues extends DocIdSetIterator { +public abstract class ByteVectorValues extends KnnVectorValues { /** Sole constructor */ protected ByteVectorValues() {} - /** Return the dimension of the vectors */ - public abstract int dimension(); - /** - * Return the number of vectors for this field. + * Return the vector value for the given vector ordinal which must be in [0, size() - 1], + * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. * - * @return the number of vectors returned by this iterator + * @return the vector value */ - public abstract int size(); + public abstract byte[] vectorValue(int ord) throws IOException; @Override - public final long cost() { - return size(); - } - - /** - * Return the vector value for the current document ID. It is illegal to call this method when the - * iterator is not positioned: before advancing, or after failing to advance. The returned array - * may be shared across calls, re-used, and modified as the iterator advances. - * - * @return the vector value - */ - public abstract byte[] vectorValue() throws IOException; + public abstract ByteVectorValues copy() throws IOException; /** * Checks the Vector Encoding of a field @@ -78,12 +65,53 @@ public static void checkField(LeafReader in, String field) { } /** - * Return a {@link VectorScorer} for the given query vector. The iterator for the scorer is not - * the same instance as the iterator for this {@link ByteVectorValues}. It is a copy, and - * iteration over the scorer will not affect the iteration of this {@link ByteVectorValues}. + * Return a {@link VectorScorer} for the given query vector. 
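Editor's note: with `ByteVectorValues` now extending `KnnVectorValues`, vectors are addressed by ordinal rather than by iterator position, and docIDs are recovered through `ordToDoc` (the same pattern the CheckIndex changes in this patch use). A hedged sketch using the `fromBytes` factory introduced in this change; the vectors below are illustrative:

```java
import java.util.List;
import org.apache.lucene.index.ByteVectorValues;

public class OrdBasedVectorAccess {
  public static void main(String[] args) throws Exception {
    ByteVectorValues values =
        ByteVectorValues.fromBytes(
            List.of(new byte[] {1, 2}, new byte[] {3, 4}, new byte[] {5, 6}), 2);

    // Random access by ordinal replaces the old nextDoc()/vectorValue() iteration.
    for (int ord = 0; ord < values.size(); ord++) {
      byte[] vector = values.vectorValue(ord);
      int docId = values.ordToDoc(ord); // identity mapping for this dense in-memory case
      System.out.println("doc=" + docId + " ord=" + ord + " dim0=" + vector[0]);
    }
  }
}
```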
* * @param query the query vector * @return a {@link VectorScorer} instance or null */ - public abstract VectorScorer scorer(byte[] query) throws IOException; + public VectorScorer scorer(byte[] query) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public VectorEncoding getEncoding() { + return VectorEncoding.BYTE; + } + + /** + * Creates a {@link ByteVectorValues} from a list of byte arrays. + * + * @param vectors the list of byte arrays + * @param dim the dimension of the vectors + * @return a {@link ByteVectorValues} instancec + */ + public static ByteVectorValues fromBytes(List vectors, int dim) { + return new ByteVectorValues() { + @Override + public int size() { + return vectors.size(); + } + + @Override + public int dimension() { + return dim; + } + + @Override + public byte[] vectorValue(int targetOrd) { + return vectors.get(targetOrd); + } + + @Override + public ByteVectorValues copy() { + return this; + } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + }; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 65ac2fcd2607..d957af01d0a2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1188,10 +1188,10 @@ public static Status.IndexSortStatus testSort( FieldInfos fieldInfos = reader.getFieldInfos(); if (metaData.hasBlocks() && fieldInfos.getParentField() == null - && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { + && metaData.createdVersionMajor() >= Version.LUCENE_10_0_0.major) { throw new IllegalStateException( "parent field is not set but the index has document blocks and was created with version: " - + metaData.getCreatedVersionMajor()); + + metaData.createdVersionMajor()); } final DocIdSetIterator iter; if (metaData.hasBlocks() && fieldInfos.getParentField() != null) { @@ -2760,16 +2760,16 @@ private static void checkFloatVectorValues( CheckIndex.Status.VectorValuesStatus status, CodecReader codecReader) throws IOException { - int docCount = 0; + int count = 0; int everyNdoc = Math.max(values.size() / 64, 1); - while (values.nextDoc() != NO_MORE_DOCS) { + while (count < values.size()) { // search the first maxNumSearches vectors to exercise the graph - if (values.docID() % everyNdoc == 0) { + if (values.ordToDoc(count) % everyNdoc == 0) { KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); if (vectorsReaderSupportsSearch(codecReader, fieldInfo.name)) { codecReader .getVectorReader() - .search(fieldInfo.name, values.vectorValue(), collector, null); + .search(fieldInfo.name, values.vectorValue(count), collector, null); TopDocs docs = collector.topDocs(); if (docs.scoreDocs.length == 0) { throw new CheckIndexException( @@ -2777,7 +2777,7 @@ private static void checkFloatVectorValues( } } } - int valueLength = values.vectorValue().length; + int valueLength = values.vectorValue(count).length; if (valueLength != fieldInfo.getVectorDimension()) { throw new CheckIndexException( "Field \"" @@ -2787,19 +2787,19 @@ private static void checkFloatVectorValues( + " not matching the field's dimension=" + fieldInfo.getVectorDimension()); } - ++docCount; + ++count; } - if (docCount != values.size()) { + if (count != values.size()) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" has size=" + values.size() + " but when iterated, returns " - + docCount + + count + " docs 
with values"); } - status.totalVectorValues += docCount; + status.totalVectorValues += count; } private static void checkByteVectorValues( @@ -2808,21 +2808,23 @@ private static void checkByteVectorValues( CheckIndex.Status.VectorValuesStatus status, CodecReader codecReader) throws IOException { - int docCount = 0; + int count = 0; int everyNdoc = Math.max(values.size() / 64, 1); boolean supportsSearch = vectorsReaderSupportsSearch(codecReader, fieldInfo.name); - while (values.nextDoc() != NO_MORE_DOCS) { + while (count < values.size()) { // search the first maxNumSearches vectors to exercise the graph - if (supportsSearch && values.docID() % everyNdoc == 0) { + if (supportsSearch && values.ordToDoc(count) % everyNdoc == 0) { KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); - codecReader.getVectorReader().search(fieldInfo.name, values.vectorValue(), collector, null); + codecReader + .getVectorReader() + .search(fieldInfo.name, values.vectorValue(count), collector, null); TopDocs docs = collector.topDocs(); if (docs.scoreDocs.length == 0) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" failed to search k nearest neighbors"); } } - int valueLength = values.vectorValue().length; + int valueLength = values.vectorValue(count).length; if (valueLength != fieldInfo.getVectorDimension()) { throw new CheckIndexException( "Field \"" @@ -2832,19 +2834,19 @@ private static void checkByteVectorValues( + " not matching the field's dimension=" + fieldInfo.getVectorDimension()); } - ++docCount; + ++count; } - if (docCount != values.size()) { + if (count != values.size()) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" has size=" + values.size() + " but when iterated, returns " - + docCount + + count + " docs with values"); } - status.totalVectorValues += docCount; + status.totalVectorValues += count; } /** @@ -3143,12 +3145,7 @@ private void checkPackedValue(String desc, byte[] packedValue, int docID) { } } - private static class ConstantRelationIntersectVisitor implements IntersectVisitor { - private final Relation relation; - - ConstantRelationIntersectVisitor(Relation relation) { - this.relation = relation; - } + private record ConstantRelationIntersectVisitor(Relation relation) implements IntersectVisitor { @Override public void visit(int docID) throws IOException { @@ -3736,7 +3733,7 @@ private static void checkNumericDocValues( private static void checkDocValues( FieldInfo fi, DocValuesProducer dvReader, DocValuesStatus status) throws Exception { - if (fi.hasDocValuesSkipIndex()) { + if (fi.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { status.totalSkippingIndex++; checkDocValueSkipper(fi, dvReader.getSkipper(fi)); } @@ -3815,6 +3812,9 @@ public static Status.TermVectorStatus testTermVectors( if (vectorsReader != null) { vectorsReader = vectorsReader.getMergeInstance(); for (int j = 0; j < reader.maxDoc(); ++j) { + if ((j & 0x03) == 0) { + vectorsReader.prefetch(j); + } // Intentionally pull/visit (but don't count in // stats) deleted documents to make sure they too // are not corrupt: @@ -3841,7 +3841,7 @@ public static Status.TermVectorStatus testTermVectors( // Make sure FieldInfo thinks this field is vector'd: final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); - if (fieldInfo.hasVectors() == false) { + if (fieldInfo.hasTermVectors() == false) { throw new CheckIndexException( "docID=" + j @@ -4284,21 +4284,8 @@ public static Options parseOptions(String[] args) { int level = Integer.parseInt(args[i]); 
Level.checkIfLevelInBounds(level); opts.level = level; - } else if ("-fast".equals(arg)) { - // Deprecated. Remove in Lucene 11. - System.err.println( - "-fast is deprecated, use '-level 1' for explicitly verifying file checksums only. This is also now the default " - + "behaviour!"); - } else if ("-slow".equals(arg)) { - // Deprecated. Remove in Lucene 11. - System.err.println("-slow is deprecated, use '-level 3' instead for slow checks"); - opts.level = Level.MIN_LEVEL_FOR_SLOW_CHECKS; } else if ("-exorcise".equals(arg)) { opts.doExorcise = true; - } else if ("-crossCheckTermVectors".equals(arg)) { - // Deprecated. Remove in Lucene 11. - System.err.println("-crossCheckTermVectors is deprecated, use '-level 3' instead"); - opts.level = Level.MAX_VALUE; } else if (arg.equals("-verbose")) { opts.verbose = true; } else if (arg.equals("-segment")) { diff --git a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java index bec27c5176e8..20be7e1a45a8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java @@ -200,7 +200,7 @@ public final SortedSetDocValues getSortedSetDocValues(String field) throws IOExc public final DocValuesSkipper getDocValuesSkipper(String field) throws IOException { ensureOpen(); FieldInfo fi = getFieldInfos().fieldInfo(field); - if (fi == null || fi.hasDocValuesSkipIndex() == false) { + if (fi == null || fi.docValuesSkipIndexType() == DocValuesSkipIndexType.NONE) { return null; } return getDocValuesReader().getSkipper(fi); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipIndexType.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipIndexType.java new file mode 100644 index 000000000000..fcdb735b3174 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipIndexType.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +/** Options for skip indexes on doc values. */ +public enum DocValuesSkipIndexType { + /** No skip index should be created. */ + NONE { + @Override + boolean isCompatibleWith(DocValuesType dvType) { + return true; + } + }, + /** + * Record range of values. This is suitable for {@link DocValuesType#NUMERIC}, {@link + * DocValuesType#SORTED_NUMERIC}, {@link DocValuesType#SORTED} and {@link + * DocValuesType#SORTED_SET} doc values, and will record the min/max values per range of doc IDs. 
+ */ + RANGE { + @Override + boolean isCompatibleWith(DocValuesType dvType) { + return dvType == DocValuesType.NUMERIC + || dvType == DocValuesType.SORTED_NUMERIC + || dvType == DocValuesType.SORTED + || dvType == DocValuesType.SORTED_SET; + } + }; + + // TODO: add support for pre-aggregated integer/float/double + + abstract boolean isCompatibleWith(DocValuesType dvType); +} diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipper.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipper.java index 15d3c67c34d6..591bfacd1411 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipper.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipper.java @@ -98,4 +98,28 @@ public abstract class DocValuesSkipper { /** Return the global number of documents with a value for the field. */ public abstract int docCount(); + + /** + * Advance this skipper so that all levels intersects the range given by {@code minValue} and + * {@code maxValue}. If there are no intersecting levels, the skipper is exhausted. + */ + public final void advance(long minValue, long maxValue) throws IOException { + if (minDocID(0) == -1) { + // #advance has not been called yet + advance(0); + } + // check if the current interval intersects the provided range + while (minDocID(0) != DocIdSetIterator.NO_MORE_DOCS + && ((minValue(0) > maxValue || maxValue(0) < minValue))) { + int maxDocID = maxDocID(0); + int nextLevel = 1; + // check if the next levels intersects to skip as many docs as possible + while (nextLevel < numLevels() + && (minValue(nextLevel) > maxValue || maxValue(nextLevel) < minValue)) { + maxDocID = maxDocID(nextLevel); + nextLevel++; + } + advance(maxDocID + 1); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesType.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesType.java index f40132907a63..5c4f76fdd6fc 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesType.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesType.java @@ -22,37 +22,31 @@ */ public enum DocValuesType { /** No doc values for this field. */ - NONE(false), + NONE, /** A per-document Number */ - NUMERIC(true), + NUMERIC, /** * A per-document byte[]. Values may be larger than 32766 bytes, but different codecs may enforce * their own limits. */ - BINARY(false), + BINARY, /** * A pre-sorted byte[]. Fields with this type only store distinct byte values and store an * additional offset pointer per document to dereference the shared byte[]. The stored byte[] is * presorted and allows access via document id, ordinal and by-value. Values must be {@code <= * 32766} bytes. */ - SORTED(true), + SORTED, /** * A pre-sorted Number[]. Fields with this type store numeric values in sorted order according to * {@link Long#compare(long, long)}. */ - SORTED_NUMERIC(true), + SORTED_NUMERIC, /** * A pre-sorted Set<byte[]>. Fields with this type only store distinct byte values and store * additional offset pointers per document to dereference the shared byte[]s. The stored byte[] is * presorted and allows access via document id, ordinal and by-value. Values must be {@code <= * 32766} bytes. 
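Aside (illustrative sketch, not part of the patch): the new DocValuesSkipper#advance(long, long) helper above can be used to enumerate candidate doc ranges for a value range. The sketch assumes a skipper obtained from a reader for a field indexed with DocValuesSkipIndexType.RANGE; the helper class is hypothetical.

import java.io.IOException;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.search.DocIdSetIterator;

class SkipperRangeScanSketch {
  // Print the level-0 doc intervals whose recorded min/max values may intersect [lo, hi].
  static void scan(DocValuesSkipper skipper, long lo, long hi) throws IOException {
    skipper.advance(lo, hi); // position on the first intersecting interval, or exhaust
    while (skipper.minDocID(0) != DocIdSetIterator.NO_MORE_DOCS) {
      System.out.println("candidate docs [" + skipper.minDocID(0) + ", " + skipper.maxDocID(0) + "]");
      skipper.advance(skipper.maxDocID(0) + 1); // step past the current interval...
      skipper.advance(lo, hi); // ...and re-align to the next intersecting one
    }
  }
}

Docs inside a reported interval still need to be checked against the actual doc values; the skipper only guarantees that the intervals it skips cannot contain matching values.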
*/ - SORTED_SET(true); - - final boolean supportsSkipIndex; // pkg-private for use in FieldInfo - - DocValuesType(boolean supportsSkipIndex) { - this.supportsSkipIndex = supportsSkipIndex; - } + SORTED_SET; } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java index 7955df5630e4..e32c8b20c047 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -430,10 +430,16 @@ long updateDocuments( } flushingDWPT = flushControl.doAfterDocument(dwpt); } finally { - if (dwpt.isFlushPending() || dwpt.isAborted()) { - dwpt.unlock(); - } else { - perThreadPool.marksAsFreeAndUnlock(dwpt); + // If a flush is occurring, we don't want to allow this dwpt to be reused + // If it is aborted, we shouldn't allow it to be reused + // If the deleteQueue is advanced, this means the maximum seqNo has been set and it cannot be + // reused + synchronized (flushControl) { + if (dwpt.isFlushPending() || dwpt.isAborted() || dwpt.isQueueAdvanced()) { + dwpt.unlock(); + } else { + perThreadPool.marksAsFreeAndUnlock(dwpt); + } } assert dwpt.isHeldByCurrentThread() == false : "we didn't release the dwpt even on abort"; } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterDeleteQueue.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterDeleteQueue.java index 2cea3a6db0d7..34c858eaf5d9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterDeleteQueue.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterDeleteQueue.java @@ -636,7 +636,7 @@ long getMaxSeqNo() { } /** Returns true if it was advanced. */ - boolean isAdvanced() { + synchronized boolean isAdvanced() { return advanced; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 3fe76a6338f3..fd6ed22bd4dd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -487,7 +487,7 @@ FlushedSegment flush(DocumentsWriter.FlushNotifications flushNotifications) thro infoStream.message( "DWPT", "new segment has " - + (flushState.fieldInfos.hasVectors() ? "vectors" : "no vectors") + + (flushState.fieldInfos.hasTermVectors() ? "vectors" : "no vectors") + "; " + (flushState.fieldInfos.hasNorms() ? "norms" : "no norms") + "; " @@ -718,6 +718,10 @@ boolean isFlushPending() { return flushPending.get() == Boolean.TRUE; } + boolean isQueueAdvanced() { + return deleteQueue.isAdvanced(); + } + /** Sets this DWPT as flush pending. This can only be set once. 
*/ void setFlushPending() { flushPending.set(Boolean.TRUE); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThreadPool.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThreadPool.java index d69a71bfea57..93276c7ced44 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThreadPool.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThreadPool.java @@ -138,6 +138,15 @@ private synchronized boolean contains(DocumentsWriterPerThread state) { void marksAsFreeAndUnlock(DocumentsWriterPerThread state) { final long ramBytesUsed = state.ramBytesUsed(); + assert state.isFlushPending() == false + && state.isAborted() == false + && state.isQueueAdvanced() == false + : "DWPT has pending flush: " + + state.isFlushPending() + + " aborted=" + + state.isAborted() + + " queueAdvanced=" + + state.isQueueAdvanced(); assert contains(state) : "we tried to add a DWPT back to the pool but the pool doesn't know about this DWPT"; freeList.addAndUnlock(state, ramBytesUsed); diff --git a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java index ca2cb1a27d45..614a652cd35a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java @@ -429,52 +429,35 @@ private void checkAndThrow(DocIdSetIterator in) { } private class ExitableFloatVectorValues extends FloatVectorValues { - private int docToCheck; private final FloatVectorValues vectorValues; public ExitableFloatVectorValues(FloatVectorValues vectorValues) { this.vectorValues = vectorValues; - docToCheck = 0; } @Override - public int advance(int target) throws IOException { - final int advance = vectorValues.advance(target); - if (advance >= docToCheck) { - checkAndThrow(); - docToCheck = advance + DOCS_BETWEEN_TIMEOUT_CHECK; - } - return advance; + public int dimension() { + return vectorValues.dimension(); } @Override - public int docID() { - return vectorValues.docID(); + public float[] vectorValue(int ord) throws IOException { + return vectorValues.vectorValue(ord); } @Override - public int nextDoc() throws IOException { - final int nextDoc = vectorValues.nextDoc(); - if (nextDoc >= docToCheck) { - checkAndThrow(); - docToCheck = nextDoc + DOCS_BETWEEN_TIMEOUT_CHECK; - } - return nextDoc; + public int ordToDoc(int ord) { + return vectorValues.ordToDoc(ord); } @Override - public int dimension() { - return vectorValues.dimension(); - } - - @Override - public float[] vectorValue() throws IOException { - return vectorValues.vectorValue(); + public int size() { + return vectorValues.size(); } @Override - public int size() { - return vectorValues.size(); + public DocIndexIterator iterator() { + return createExitableIterator(vectorValues.iterator(), queryTimeout); } @Override @@ -482,95 +465,109 @@ public VectorScorer scorer(float[] target) throws IOException { return vectorValues.scorer(target); } - /** - * Throws {@link ExitingReaderException} if {@link QueryTimeout#shouldExit()} returns true, or - * if {@link Thread#interrupted()} returns true. - */ - private void checkAndThrow() { - if (queryTimeout.shouldExit()) { - throw new ExitingReaderException( - "The request took too long to iterate over vector values. 
Timeout: " - + queryTimeout.toString() - + ", FloatVectorValues=" - + in); - } else if (Thread.interrupted()) { - throw new ExitingReaderException( - "Interrupted while iterating over vector values. FloatVectorValues=" + in); - } + @Override + public FloatVectorValues copy() { + throw new UnsupportedOperationException(); } } private class ExitableByteVectorValues extends ByteVectorValues { - private int docToCheck; private final ByteVectorValues vectorValues; public ExitableByteVectorValues(ByteVectorValues vectorValues) { this.vectorValues = vectorValues; - docToCheck = 0; } @Override - public int advance(int target) throws IOException { - final int advance = vectorValues.advance(target); - if (advance >= docToCheck) { - checkAndThrow(); - docToCheck = advance + DOCS_BETWEEN_TIMEOUT_CHECK; - } - return advance; + public int dimension() { + return vectorValues.dimension(); } @Override - public int docID() { - return vectorValues.docID(); + public int size() { + return vectorValues.size(); } @Override - public int nextDoc() throws IOException { - final int nextDoc = vectorValues.nextDoc(); - if (nextDoc >= docToCheck) { - checkAndThrow(); - docToCheck = nextDoc + DOCS_BETWEEN_TIMEOUT_CHECK; - } - return nextDoc; + public byte[] vectorValue(int ord) throws IOException { + return vectorValues.vectorValue(ord); } @Override - public int dimension() { - return vectorValues.dimension(); + public int ordToDoc(int ord) { + return vectorValues.ordToDoc(ord); } @Override - public int size() { - return vectorValues.size(); + public DocIndexIterator iterator() { + return createExitableIterator(vectorValues.iterator(), queryTimeout); } @Override - public byte[] vectorValue() throws IOException { - return vectorValues.vectorValue(); + public VectorScorer scorer(byte[] target) throws IOException { + return vectorValues.scorer(target); } @Override - public VectorScorer scorer(byte[] target) throws IOException { - return vectorValues.scorer(target); + public ByteVectorValues copy() { + throw new UnsupportedOperationException(); + } + } + } + + private static KnnVectorValues.DocIndexIterator createExitableIterator( + KnnVectorValues.DocIndexIterator delegate, QueryTimeout queryTimeout) { + return new KnnVectorValues.DocIndexIterator() { + private int nextCheck; + + @Override + public int index() { + return delegate.index(); + } + + @Override + public int docID() { + return delegate.docID(); + } + + @Override + public int nextDoc() throws IOException { + int doc = delegate.nextDoc(); + if (doc >= nextCheck) { + checkAndThrow(); + nextCheck = doc + ExitableFilterAtomicReader.DOCS_BETWEEN_TIMEOUT_CHECK; + } + return doc; + } + + @Override + public long cost() { + return delegate.cost(); + } + + @Override + public int advance(int target) throws IOException { + int doc = delegate.advance(target); + if (doc >= nextCheck) { + checkAndThrow(); + nextCheck = doc + ExitableFilterAtomicReader.DOCS_BETWEEN_TIMEOUT_CHECK; + } + return doc; } - /** - * Throws {@link ExitingReaderException} if {@link QueryTimeout#shouldExit()} returns true, or - * if {@link Thread#interrupted()} returns true. - */ private void checkAndThrow() { if (queryTimeout.shouldExit()) { throw new ExitingReaderException( - "The request took too long to iterate over vector values. Timeout: " + "The request took too long to iterate over knn vector values. 
Timeout: " + queryTimeout.toString() - + ", ByteVectorValues=" - + in); + + ", KnnVectorValues=" + + delegate); } else if (Thread.interrupted()) { throw new ExitingReaderException( - "Interrupted while iterating over vector values. ByteVectorValues=" + in); + "Interrupted while iterating over knn vector values. KnnVectorValues=" + delegate); } } - } + }; } /** Wrapper class for another PointValues implementation that is used by ExitableFields. */ @@ -683,7 +680,7 @@ private void checkAndThrow() { if (queryTimeout.shouldExit()) { throw new ExitingReaderException( "The request took too long to intersect point values. Timeout: " - + queryTimeout.toString() + + queryTimeout + ", PointValues=" + pointValues); } else if (Thread.interrupted()) { @@ -815,7 +812,7 @@ public void grow(int count) { /** Wrapper class for another Terms implementation that is used by ExitableFields. */ public static class ExitableTerms extends FilterTerms { - private QueryTimeout queryTimeout; + private final QueryTimeout queryTimeout; /** Constructor * */ public ExitableTerms(Terms terms, QueryTimeout queryTimeout) { diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 862c6b37993a..e6bdddc9239d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -37,7 +37,7 @@ public final class FieldInfo { private DocValuesType docValuesType = DocValuesType.NONE; - private final boolean docValuesSkipIndex; + private final DocValuesSkipIndexType docValuesSkipIndex; // True if any document indexed term vectors private boolean storeTermVector; @@ -83,7 +83,7 @@ public FieldInfo( boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, - boolean hasDocValuesSkipIndex, + DocValuesSkipIndexType docValuesSkipIndex, long dvGen, Map attributes, int pointDimensionCount, @@ -99,7 +99,7 @@ public FieldInfo( this.docValuesType = Objects.requireNonNull( docValues, "DocValuesType must not be null (field: \"" + name + "\")"); - this.docValuesSkipIndex = hasDocValuesSkipIndex; + this.docValuesSkipIndex = docValuesSkipIndex; this.indexOptions = Objects.requireNonNull( indexOptions, "IndexOptions must not be null (field: \"" + name + "\")"); @@ -157,11 +157,13 @@ public void checkConsistency() { if (docValuesType == null) { throw new IllegalArgumentException("DocValuesType must not be null (field: '" + name + "')"); } - if (docValuesType.supportsSkipIndex == false && docValuesSkipIndex) { + if (docValuesSkipIndex.isCompatibleWith(docValuesType) == false) { throw new IllegalArgumentException( "field '" + name - + "' cannot have docValuesSkipIndex set to true with doc values type " + + "' cannot have docValuesSkipIndexType=" + + docValuesSkipIndex + + " with doc values type " + docValuesType); } if (dvGen != -1 && docValuesType == DocValuesType.NONE) { @@ -308,14 +310,16 @@ static void verifySameDocValuesType( * @throws IllegalArgumentException if they are not the same */ static void verifySameDocValuesSkipIndex( - String fieldName, boolean hasDocValuesSkipIndex1, boolean hasDocValuesSkipIndex2) { + String fieldName, + DocValuesSkipIndexType hasDocValuesSkipIndex1, + DocValuesSkipIndexType hasDocValuesSkipIndex2) { if (hasDocValuesSkipIndex1 != hasDocValuesSkipIndex2) { throw new IllegalArgumentException( "cannot change field \"" + fieldName - + "\" from docValuesSkipIndex=" + + "\" from docValuesSkipIndexType=" + hasDocValuesSkipIndex1 - + " to inconsistent 
docValuesSkipIndex=" + + " to inconsistent docValuesSkipIndexType=" + hasDocValuesSkipIndex2); } } @@ -589,7 +593,7 @@ public DocValuesType getDocValuesType() { } /** Returns true if, and only if, this field has a skip index. */ - public boolean hasDocValuesSkipIndex() { + public DocValuesSkipIndexType docValuesSkipIndexType() { return docValuesSkipIndex; } @@ -641,7 +645,7 @@ public boolean hasPayloads() { } /** Returns true if any term vectors exist for this field. */ - public boolean hasVectors() { + public boolean hasTermVectors() { return storeTermVector; } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index fd0338bfe689..5392c102ca91 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -52,7 +52,7 @@ public class FieldInfos implements Iterable { private final boolean hasProx; private final boolean hasPayloads; private final boolean hasOffsets; - private final boolean hasVectors; + private final boolean hasTermVectors; private final boolean hasNorms; private final boolean hasDocValues; private final boolean hasPointValues; @@ -73,7 +73,7 @@ public class FieldInfos implements Iterable { * as the backing structure. */ public FieldInfos(FieldInfo[] infos) { - boolean hasVectors = false; + boolean hasTermVectors = false; boolean hasPostings = false; boolean hasProx = false; boolean hasPayloads = false; @@ -111,7 +111,7 @@ public FieldInfos(FieldInfo[] infos) { + info.name); } - hasVectors |= info.hasVectors(); + hasTermVectors |= info.hasTermVectors(); hasPostings |= info.getIndexOptions() != IndexOptions.NONE; hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; hasFreq |= info.getIndexOptions() != IndexOptions.DOCS; @@ -139,7 +139,7 @@ public FieldInfos(FieldInfo[] infos) { } } - this.hasVectors = hasVectors; + this.hasTermVectors = hasTermVectors; this.hasPostings = hasPostings; this.hasProx = hasProx; this.hasPayloads = hasPayloads; @@ -275,9 +275,9 @@ public boolean hasOffsets() { return hasOffsets; } - /** Returns true if any fields have vectors */ - public boolean hasVectors() { - return hasVectors; + /** Returns true if any fields have term vectors */ + public boolean hasTermVectors() { + return hasTermVectors; } /** Returns true if any fields have norms */ @@ -365,7 +365,7 @@ private record FieldProperties( IndexOptions indexOptions, IndexOptionsProperties indexOptionsProperties, DocValuesType docValuesType, - boolean docValuesSkipIndex, + DocValuesSkipIndexType docValuesSkipIndex, FieldDimensions fieldDimensions, FieldVectorProperties fieldVectorProperties) {} @@ -441,10 +441,10 @@ synchronized int addOrGet(FieldInfo fi) { fieldNumber, fi.getIndexOptions(), fi.getIndexOptions() != IndexOptions.NONE - ? new IndexOptionsProperties(fi.hasVectors(), fi.omitsNorms()) + ? 
new IndexOptionsProperties(fi.hasTermVectors(), fi.omitsNorms()) : null, fi.getDocValuesType(), - fi.hasDocValuesSkipIndex(), + fi.docValuesSkipIndexType(), new FieldDimensions( fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), @@ -517,16 +517,16 @@ private void verifySameSchema(FieldInfo fi) { verifySameIndexOptions(fieldName, currentOpts, fi.getIndexOptions()); if (currentOpts != IndexOptions.NONE) { boolean curStoreTermVector = fieldProperties.indexOptionsProperties.storeTermVectors; - verifySameStoreTermVectors(fieldName, curStoreTermVector, fi.hasVectors()); + verifySameStoreTermVectors(fieldName, curStoreTermVector, fi.hasTermVectors()); boolean curOmitNorms = fieldProperties.indexOptionsProperties.omitNorms; verifySameOmitNorms(fieldName, curOmitNorms, fi.omitsNorms()); } DocValuesType currentDVType = fieldProperties.docValuesType; verifySameDocValuesType(fieldName, currentDVType, fi.getDocValuesType()); - boolean currentDocValuesSkipIndex = fieldProperties.docValuesSkipIndex; + DocValuesSkipIndexType currentDocValuesSkipIndex = fieldProperties.docValuesSkipIndex; verifySameDocValuesSkipIndex( - fieldName, currentDocValuesSkipIndex, fi.hasDocValuesSkipIndex()); + fieldName, currentDocValuesSkipIndex, fi.docValuesSkipIndexType()); FieldDimensions dims = fieldProperties.fieldDimensions; verifySamePointsOptions( @@ -582,7 +582,7 @@ synchronized void verifyOrCreateDvOnlyField( false, IndexOptions.NONE, dvType, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, @@ -609,8 +609,8 @@ synchronized void verifyOrCreateDvOnlyField( + fieldDvType + "]."); } - boolean hasDocValuesSkipIndex = fieldProperties.docValuesSkipIndex; - if (hasDocValuesSkipIndex) { + DocValuesSkipIndexType hasDocValuesSkipIndex = fieldProperties.docValuesSkipIndex; + if (hasDocValuesSkipIndex != DocValuesSkipIndexType.NONE) { throw new IllegalArgumentException( "Can't update [" + dvType @@ -676,7 +676,7 @@ FieldInfo constructFieldInfo(String fieldName, DocValuesType dvType, int newFiel false, IndexOptions.NONE, dvType, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, @@ -792,12 +792,12 @@ FieldInfo add(FieldInfo fi, long dvGen) { new FieldInfo( fi.getName(), fieldNumber, - fi.hasVectors(), + fi.hasTermVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), - fi.hasDocValuesSkipIndex(), + fi.docValuesSkipIndexType(), dvGen, // original attributes is UnmodifiableMap new HashMap<>(fi.attributes()), diff --git a/lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java index c8354cd881f2..8099aef459a0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java @@ -252,7 +252,7 @@ public BytesRef next() throws IOException { switch (accept(actualTerm)) { case YES_AND_SEEK: doSeek = true; - // term accepted, but we need to seek so fall-through + // term accepted, but we need to seek so fall-through case YES: // term accepted return actualTerm; diff --git a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java index e5dbc620f5c3..aa840fc39319 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java @@ -17,8 +17,8 @@ package org.apache.lucene.index; import java.io.IOException; +import java.util.List; 
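Aside (migration sketch, not part of the patch): callers of the renamed FieldInfo accessors above would adapt roughly as follows; the helper class is hypothetical.

import org.apache.lucene.index.DocValuesSkipIndexType;
import org.apache.lucene.index.FieldInfo;

class FieldInfoMigrationSketch {
  static boolean hasSkipIndex(FieldInfo fi) {
    // was: fi.hasDocValuesSkipIndex()
    return fi.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE;
  }

  static boolean storesTermVectors(FieldInfo fi) {
    // was: fi.hasVectors()
    return fi.hasTermVectors();
  }
}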
import org.apache.lucene.document.KnnFloatVectorField; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; /** @@ -27,34 +27,21 @@ * * @lucene.experimental */ -public abstract class FloatVectorValues extends DocIdSetIterator { +public abstract class FloatVectorValues extends KnnVectorValues { /** Sole constructor */ protected FloatVectorValues() {} - /** Return the dimension of the vectors */ - public abstract int dimension(); - /** - * Return the number of vectors for this field. + * Return the vector value for the given vector ordinal which must be in [0, size() - 1], + * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. * - * @return the number of vectors returned by this iterator + * @return the vector value */ - public abstract int size(); + public abstract float[] vectorValue(int ord) throws IOException; @Override - public final long cost() { - return size(); - } - - /** - * Return the vector value for the current document ID. It is illegal to call this method when the - * iterator is not positioned: before advancing, or after failing to advance. The returned array - * may be shared across calls, re-used, and modified as the iterator advances. - * - * @return the vector value - */ - public abstract float[] vectorValue() throws IOException; + public abstract FloatVectorValues copy() throws IOException; /** * Checks the Vector Encoding of a field @@ -79,12 +66,53 @@ public static void checkField(LeafReader in, String field) { /** * Return a {@link VectorScorer} for the given query vector and the current {@link - * FloatVectorValues}. The iterator for the scorer is not the same instance as the iterator for - * this {@link FloatVectorValues}. It is a copy, and iteration over the scorer will not affect the - * iteration of this {@link FloatVectorValues}. + * FloatVectorValues}. * - * @param query the query vector + * @param target the query vector * @return a {@link VectorScorer} instance or null */ - public abstract VectorScorer scorer(float[] query) throws IOException; + public VectorScorer scorer(float[] target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public VectorEncoding getEncoding() { + return VectorEncoding.FLOAT32; + } + + /** + * Creates a {@link FloatVectorValues} from a list of float arrays. + * + * @param vectors the list of float arrays + * @param dim the dimension of the vectors + * @return a {@link FloatVectorValues} instance + */ + public static FloatVectorValues fromFloats(List vectors, int dim) { + return new FloatVectorValues() { + @Override + public int size() { + return vectors.size(); + } + + @Override + public int dimension() { + return dim; + } + + @Override + public float[] vectorValue(int targetOrd) { + return vectors.get(targetOrd); + } + + @Override + public FloatVectorValues copy() { + return this; + } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + }; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/Impacts.java b/lucene/core/src/java/org/apache/lucene/index/Impacts.java index 35e8cca5c70f..e366b6f6b0ba 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Impacts.java +++ b/lucene/core/src/java/org/apache/lucene/index/Impacts.java @@ -40,7 +40,8 @@ protected Impacts() {} /** * Return impacts on the given level. 
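Aside (illustrative sketch, not part of the patch): the new FloatVectorValues.fromFloats factory above builds a dense, in-memory instance, which is convenient in tests. The demo class is hypothetical; the list element type is assumed to be float[] (type parameters are not visible in this rendering).

import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.FloatVectorValues;

class FromFloatsSketch {
  static void demo() throws IOException {
    List<float[]> vectors = List.of(new float[] {1f, 0f}, new float[] {0f, 1f});
    FloatVectorValues values = FloatVectorValues.fromFloats(vectors, 2);
    for (int ord = 0; ord < values.size(); ord++) {
      System.out.println("ord=" + ord + " first component=" + values.vectorValue(ord)[0]);
    }
  }
}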
These impacts are sorted by increasing frequency and * increasing unsigned norm, and only valid until the doc ID returned by {@link - * #getDocIdUpTo(int)} for the same level, included. The returned list is never empty. NOTE: There + * #getDocIdUpTo(int)} for the same level, included. The returned list is never empty and should + * implement {@link java.util.RandomAccess} if it contains more than a single element. NOTE: There * is no guarantee that these impacts actually appear in postings, only that they trigger scores * that are greater than or equal to the impacts that actually appear in postings. */ diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 83c2cbdaf1fa..346da8a907ec 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -37,6 +37,7 @@ import java.util.Queue; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.Executor; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; @@ -1254,8 +1255,7 @@ static FieldInfos readFieldInfos(SegmentCommitInfo si) throws IOException { return reader.read(si.info.dir, si.info, segmentSuffix, IOContext.READONCE); } else if (si.info.getUseCompoundFile()) { // cfs - try (Directory cfs = - codec.compoundFormat().getCompoundReader(si.info.dir, si.info, IOContext.DEFAULT)) { + try (Directory cfs = codec.compoundFormat().getCompoundReader(si.info.dir, si.info)) { return reader.read(cfs, si.info, "", IOContext.READONCE); } } else { @@ -3118,22 +3118,22 @@ public long addIndexes(Directory... dirs) throws IOException { private void validateMergeReader(CodecReader leaf) { LeafMetaData segmentMeta = leaf.getMetaData(); - if (segmentInfos.getIndexCreatedVersionMajor() != segmentMeta.getCreatedVersionMajor()) { + if (segmentInfos.getIndexCreatedVersionMajor() != segmentMeta.createdVersionMajor()) { throw new IllegalArgumentException( "Cannot merge a segment that has been created with major version " - + segmentMeta.getCreatedVersionMajor() + + segmentMeta.createdVersionMajor() + " into this index which has been created by major version " + segmentInfos.getIndexCreatedVersionMajor()); } - if (segmentInfos.getIndexCreatedVersionMajor() >= 7 && segmentMeta.getMinVersion() == null) { + if (segmentInfos.getIndexCreatedVersionMajor() >= 7 && segmentMeta.minVersion() == null) { throw new IllegalStateException( "Indexes created on or after Lucene 7 must record the created version major, but " + leaf + " hides it"); } - Sort leafIndexSort = segmentMeta.getSort(); + Sort leafIndexSort = segmentMeta.sort(); if (config.getIndexSort() != null && (leafIndexSort == null || isCongruentSort(config.getIndexSort(), leafIndexSort) == false)) { @@ -3434,9 +3434,11 @@ public void addIndexesReaderMerge(MergePolicy.OneMerge merge) throws IOException .map(FieldInfos::getParentField) .anyMatch(Objects::isNull); + final Executor intraMergeExecutor = mergeScheduler.getIntraMergeExecutor(merge); + if (hasIndexSort == false && hasBlocksButNoParentField == false && readers.isEmpty() == false) { CodecReader mergedReader = SlowCompositeCodecReaderWrapper.wrap(readers); - DocMap docMap = merge.reorder(mergedReader, directory); + DocMap docMap = merge.reorder(mergedReader, directory, intraMergeExecutor); if (docMap != null) { readers = 
Collections.singletonList(SortingCodecReader.wrap(mergedReader, docMap, null)); } @@ -3450,7 +3452,7 @@ public void addIndexesReaderMerge(MergePolicy.OneMerge merge) throws IOException trackingDir, globalFieldNumberMap, context, - mergeScheduler.getIntraMergeExecutor(merge)); + intraMergeExecutor); if (!merger.shouldMerge()) { return; @@ -3928,9 +3930,9 @@ public CodecReader wrapForMerge(CodecReader reader) throws IOException { } @Override - public Sorter.DocMap reorder(CodecReader reader, Directory dir) - throws IOException { - return toWrap.reorder(reader, dir); // must delegate + public Sorter.DocMap reorder( + CodecReader reader, Directory dir, Executor executor) throws IOException { + return toWrap.reorder(reader, dir, executor); // must delegate } @Override @@ -5205,6 +5207,8 @@ public int length() { mergeReaders.add(wrappedReader); } + final Executor intraMergeExecutor = mergeScheduler.getIntraMergeExecutor(merge); + MergeState.DocMap[] reorderDocMaps = null; // Don't reorder if an explicit sort is configured. final boolean hasIndexSort = config.getIndexSort() != null; @@ -5219,7 +5223,7 @@ public int length() { if (hasIndexSort == false && hasBlocksButNoParentField == false) { // Create a merged view of the input segments. This effectively does the merge. CodecReader mergedView = SlowCompositeCodecReaderWrapper.wrap(mergeReaders); - Sorter.DocMap docMap = merge.reorder(mergedView, directory); + Sorter.DocMap docMap = merge.reorder(mergedView, directory, intraMergeExecutor); if (docMap != null) { reorderDocMaps = new MergeState.DocMap[mergeReaders.size()]; int docBase = 0; @@ -5249,7 +5253,7 @@ public int length() { dirWrapper, globalFieldNumberMap, context, - mergeScheduler.getIntraMergeExecutor(merge)); + intraMergeExecutor); merge.info.setSoftDelCount(Math.toIntExact(softDeleteCount.get())); merge.checkAborted(); @@ -5306,7 +5310,7 @@ public int length() { ("merge codec=" + codec) + (" maxDoc=" + merge.info.info.maxDoc()) + ("; merged segment has " - + (mergeState.mergeFieldInfos.hasVectors() ? "vectors" : "no vectors")) + + (mergeState.mergeFieldInfos.hasTermVectors() ? "vectors" : "no vectors")) + ("; " + (mergeState.mergeFieldInfos.hasNorms() ? "norms" : "no norms")) + ("; " + (mergeState.mergeFieldInfos.hasDocValues() ? 
"docValues" : "no docValues")) @@ -6396,16 +6400,16 @@ private void finishApply( deleter.decRef(delFiles); } - if (result.anyDeletes) { + if (result.anyDeletes()) { maybeMerge.set(true); checkpoint(); } - if (result.allDeleted != null) { + if (result.allDeleted() != null) { if (infoStream.isEnabled("IW")) { - infoStream.message("IW", "drop 100% deleted segments: " + segString(result.allDeleted)); + infoStream.message("IW", "drop 100% deleted segments: " + segString(result.allDeleted())); } - for (SegmentCommitInfo info : result.allDeleted) { + for (SegmentCommitInfo info : result.allDeleted()) { dropDeletedSegment(info); } checkpoint(); @@ -6532,12 +6536,7 @@ private DocStats(int maxDoc, int numDocs) { } } - private static class IndexWriterMergeSource implements MergeScheduler.MergeSource { - private final IndexWriter writer; - - private IndexWriterMergeSource(IndexWriter writer) { - this.writer = writer; - } + private record IndexWriterMergeSource(IndexWriter writer) implements MergeScheduler.MergeSource { @Override public MergePolicy.OneMerge getNextMerge() { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java index 006828e98a25..f76956ec5bad 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java @@ -87,7 +87,7 @@ public interface IndexableFieldType { DocValuesType docValuesType(); /** Whether a skip index for doc values should be created on this field. */ - boolean hasDocValuesSkipIndex(); + DocValuesSkipIndexType docValuesSkipIndexType(); /** * If this is positive (representing the number of point dimensions), the field is indexed as a diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 4e61afbc2b40..f69ff533e665 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -680,7 +680,7 @@ private void initializeFieldInfo(PerField pf) throws IOException { false, s.indexOptions, s.docValuesType, - s.hasDocValuesSkipIndex, + s.docValuesSkipIndex, -1, s.attributes, s.pointDimensionCount, @@ -832,12 +832,14 @@ private static void updateDocFieldSchema( verifyUnIndexedFieldType(fieldName, fieldType); } if (fieldType.docValuesType() != DocValuesType.NONE) { - schema.setDocValues(fieldType.docValuesType(), fieldType.hasDocValuesSkipIndex()); - } else if (fieldType.hasDocValuesSkipIndex()) { + schema.setDocValues(fieldType.docValuesType(), fieldType.docValuesSkipIndexType()); + } else if (fieldType.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { throw new IllegalArgumentException( "field '" + schema.name - + "' cannot have docValuesSkipIndex set to true without doc values"); + + "' cannot have docValuesSkipIndexType=" + + fieldType.docValuesSkipIndexType() + + " without doc values"); } if (fieldType.pointDimensionCount() != 0) { schema.setPoints( @@ -1034,10 +1036,12 @@ private void indexVectorValue( int docID, PerField pf, VectorEncoding vectorEncoding, IndexableField field) throws IOException { switch (vectorEncoding) { - case BYTE -> ((KnnFieldVectorsWriter) pf.knnFieldVectorsWriter) - .addValue(docID, ((KnnByteVectorField) field).vectorValue()); - case FLOAT32 -> ((KnnFieldVectorsWriter) pf.knnFieldVectorsWriter) - .addValue(docID, ((KnnFloatVectorField) field).vectorValue()); + case BYTE -> + 
((KnnFieldVectorsWriter) pf.knnFieldVectorsWriter) + .addValue(docID, ((KnnByteVectorField) field).vectorValue()); + case FLOAT32 -> + ((KnnFieldVectorsWriter) pf.knnFieldVectorsWriter) + .addValue(docID, ((KnnFloatVectorField) field).vectorValue()); } } @@ -1142,7 +1146,7 @@ void setInvertState() { // segment norms = new NormValuesWriter(fieldInfo, bytesUsed); } - if (fieldInfo.hasVectors()) { + if (fieldInfo.hasTermVectors()) { termVectorsWriter.setHasVectors(); } } @@ -1438,7 +1442,7 @@ private static final class FieldSchema { private boolean storeTermVector = false; private IndexOptions indexOptions = IndexOptions.NONE; private DocValuesType docValuesType = DocValuesType.NONE; - private boolean hasDocValuesSkipIndex = false; + private DocValuesSkipIndexType docValuesSkipIndex = DocValuesSkipIndexType.NONE; private int pointDimensionCount = 0; private int pointIndexDimensionCount = 0; private int pointNumBytes = 0; @@ -1504,13 +1508,14 @@ void setIndexOptions( } } - void setDocValues(DocValuesType newDocValuesType, boolean newHasDocValuesSkipIndex) { + void setDocValues( + DocValuesType newDocValuesType, DocValuesSkipIndexType newDocValuesSkipIndex) { if (docValuesType == DocValuesType.NONE) { this.docValuesType = newDocValuesType; - this.hasDocValuesSkipIndex = newHasDocValuesSkipIndex; + this.docValuesSkipIndex = newDocValuesSkipIndex; } else { assertSame("doc values type", docValuesType, newDocValuesType); - assertSame("doc values skip index", hasDocValuesSkipIndex, newHasDocValuesSkipIndex); + assertSame("doc values skip index type", docValuesSkipIndex, newDocValuesSkipIndex); } } @@ -1556,9 +1561,9 @@ void reset(int doc) { void assertSameSchema(FieldInfo fi) { assertSame("index options", fi.getIndexOptions(), indexOptions); assertSame("omit norms", fi.omitsNorms(), omitNorms); - assertSame("store term vector", fi.hasVectors(), storeTermVector); + assertSame("store term vector", fi.hasTermVectors(), storeTermVector); assertSame("doc values type", fi.getDocValuesType(), docValuesType); - assertSame("doc values skip index", fi.hasDocValuesSkipIndex(), hasDocValuesSkipIndex); + assertSame("doc values skip index type", fi.docValuesSkipIndexType(), docValuesSkipIndex); assertSame( "vector similarity function", fi.getVectorSimilarityFunction(), vectorSimilarityFunction); assertSame("vector encoding", fi.getVectorEncoding(), vectorEncoding); diff --git a/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java new file mode 100644 index 000000000000..8e58f387a334 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.index; + +import java.io.IOException; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; + +/** + * This class abstracts addressing of document vector values indexed as {@link KnnFloatVectorField} + * or {@link KnnByteVectorField}. + * + * @lucene.experimental + */ +public abstract class KnnVectorValues { + + /** Return the dimension of the vectors */ + public abstract int dimension(); + + /** + * Return the number of vectors for this field. + * + * @return the number of vectors returned by this iterator + */ + public abstract int size(); + + /** + * Return the docid of the document indexed with the given vector ordinal. This default + * implementation returns the argument and is appropriate for dense values implementations where + * every doc has a single value. + */ + public int ordToDoc(int ord) { + return ord; + } + + /** + * Creates a new copy of this {@link KnnVectorValues}. This is helpful when you need to access + * different values at once, to avoid overwriting the underlying vector returned. + */ + public abstract KnnVectorValues copy() throws IOException; + + /** Returns the vector byte length, defaults to dimension multiplied by float byte size */ + public int getVectorByteLength() { + return dimension() * getEncoding().byteSize; + } + + /** The vector encoding of these values. */ + public abstract VectorEncoding getEncoding(); + + /** Returns a Bits accepting docs accepted by the argument and having a vector value */ + public Bits getAcceptOrds(Bits acceptDocs) { + // FIXME: change default to return acceptDocs and provide this impl + // somewhere more specialized (in every non-dense impl). + if (acceptDocs == null) { + return null; + } + return new Bits() { + @Override + public boolean get(int index) { + return acceptDocs.get(ordToDoc(index)); + } + + @Override + public int length() { + return size(); + } + }; + } + + /** Create an iterator for this instance. */ + public DocIndexIterator iterator() { + throw new UnsupportedOperationException(); + } + + /** + * A DocIdSetIterator that also provides an index() method tracking a distinct ordinal for a + * vector associated with each doc. + */ + public abstract static class DocIndexIterator extends DocIdSetIterator { + + /** return the value index (aka "ordinal" or "ord") corresponding to the current doc */ + public abstract int index(); + } + + /** + * Creates an iterator for instances where every doc has a value, and the value ordinals are equal + * to the docids. + */ + protected DocIndexIterator createDenseIterator() { + return new DocIndexIterator() { + + int doc = -1; + + @Override + public int docID() { + return doc; + } + + @Override + public int index() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + if (doc >= size() - 1) { + return doc = NO_MORE_DOCS; + } else { + return ++doc; + } + } + + @Override + public int advance(int target) { + if (target >= size()) { + return doc = NO_MORE_DOCS; + } + return doc = target; + } + + @Override + public long cost() { + return size(); + } + }; + } + + /** + * Creates an iterator from a DocIdSetIterator indicating which docs have values, and for which + * ordinals increase monotonically with docid. 
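Aside (illustrative sketch, not part of the patch): KnnVectorValues#getAcceptOrds above adapts a doc-space filter into ordinal space. A typical use with a reader's live docs might look like the hypothetical helper below.

import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.util.Bits;

class AcceptOrdsSketch {
  // Translate a doc-space filter (e.g. live docs) into the ord-space Bits a graph search expects.
  static Bits toAcceptOrds(LeafReader reader, KnnVectorValues values) {
    Bits liveDocs = reader.getLiveDocs(); // may be null when no docs are deleted
    return values.getAcceptOrds(liveDocs); // returns null when the input filter is null
  }
}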
+ */ + protected static DocIndexIterator fromDISI(DocIdSetIterator docsWithField) { + return new DocIndexIterator() { + + int ord = -1; + + @Override + public int docID() { + return docsWithField.docID(); + } + + @Override + public int index() { + return ord; + } + + @Override + public int nextDoc() throws IOException { + if (docID() == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + ord++; + return docsWithField.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + return docsWithField.advance(target); + } + + @Override + public long cost() { + return docsWithField.cost(); + } + }; + } + + /** + * Creates an iterator from this instance's ordinal-to-docid mapping which must be monotonic + * (docid increases when ordinal does). + */ + protected DocIndexIterator createSparseIterator() { + return new DocIndexIterator() { + private int ord = -1; + + @Override + public int docID() { + if (ord == -1) { + return -1; + } + if (ord == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + return ordToDoc(ord); + } + + @Override + public int index() { + return ord; + } + + @Override + public int nextDoc() throws IOException { + if (ord >= size() - 1) { + ord = NO_MORE_DOCS; + } else { + ++ord; + } + return docID(); + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + + @Override + public long cost() { + return size(); + } + }; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java index 77a9f2a84b64..4595560eff8b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java @@ -24,18 +24,29 @@ /** * Provides read-only metadata about a leaf. * + * @param createdVersionMajor the Lucene version that created this index. This can be used to + * implement backward compatibility on top of the codec API. A return value of {@code 6} + * indicates that the created version is unknown. + * @param minVersion the minimum Lucene version that contributed documents to this index, or {@code + * null} if this information is not available. + * @param sort the order in which documents from this index are sorted, or {@code null} if documents + * are in no particular order. + * @param hasBlocks Returns true iff this index contains blocks created with {@link + * IndexWriter#addDocument(Iterable)} or it's corresponding update methods with at least 2 or + * more documents per call. Note: This property was not recorded before {@link Version + * LUCENE_9_9_0} this method will return false for all leaves written before {@link Version + * LUCENE_9_9_0} + * @see IndexWriter#updateDocuments(Term, Iterable) + * @see IndexWriter#updateDocuments(Query, Iterable) + * @see IndexWriter#softUpdateDocuments(Term, Iterable, Field...) + * @see IndexWriter#addDocuments(Iterable) * @lucene.experimental */ -public final class LeafMetaData { - - private final int createdVersionMajor; - private final Version minVersion; - private final Sort sort; - private final boolean hasBlocks; +public record LeafMetaData( + int createdVersionMajor, Version minVersion, Sort sort, boolean hasBlocks) { /** Expert: Sole constructor. Public for use by custom {@link LeafReader} impls. 
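Aside (illustrative sketch, not part of the patch): with LeafMetaData now a record, the former getters become component accessors; existing callers would migrate roughly as in this hypothetical helper.

import org.apache.lucene.index.LeafMetaData;
import org.apache.lucene.index.LeafReader;

class LeafMetaDataSketch {
  static void describe(LeafReader reader) {
    LeafMetaData meta = reader.getMetaData();
    // record components replace getCreatedVersionMajor()/getMinVersion()/getSort(); hasBlocks() keeps its name
    System.out.println("createdVersionMajor=" + meta.createdVersionMajor()
        + " minVersion=" + meta.minVersion()
        + " sort=" + meta.sort()
        + " hasBlocks=" + meta.hasBlocks());
  }
}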
*/ - public LeafMetaData(int createdVersionMajor, Version minVersion, Sort sort, boolean hasBlocks) { - this.createdVersionMajor = createdVersionMajor; + public LeafMetaData { if (createdVersionMajor > Version.LATEST.major) { throw new IllegalArgumentException( "createdVersionMajor is in the future: " + createdVersionMajor); @@ -47,48 +58,5 @@ public LeafMetaData(int createdVersionMajor, Version minVersion, Sort sort, bool if (createdVersionMajor >= 7 && minVersion == null) { throw new IllegalArgumentException("minVersion must be set when createdVersionMajor is >= 7"); } - this.minVersion = minVersion; - this.sort = sort; - this.hasBlocks = hasBlocks; - } - - /** - * Get the Lucene version that created this index. This can be used to implement backward - * compatibility on top of the codec API. A return value of {@code 6} indicates that the created - * version is unknown. - */ - public int getCreatedVersionMajor() { - return createdVersionMajor; - } - - /** - * Return the minimum Lucene version that contributed documents to this index, or {@code null} if - * this information is not available. - */ - public Version getMinVersion() { - return minVersion; - } - - /** - * Return the order in which documents from this index are sorted, or {@code null} if documents - * are in no particular order. - */ - public Sort getSort() { - return sort; - } - - /** - * Returns true iff this index contains blocks created with {@link - * IndexWriter#addDocument(Iterable)} or it's corresponding update methods with at least 2 or more - * documents per call. Note: This property was not recorded before {@link Version#LUCENE_9_9_0} - * this method will return false for all leaves written before {@link Version#LUCENE_9_9_0} - * - * @see IndexWriter#updateDocuments(Term, Iterable) - * @see IndexWriter#updateDocuments(Query, Iterable) - * @see IndexWriter#softUpdateDocuments(Term, Iterable, Field...) 
- * @see IndexWriter#addDocuments(Iterable) - */ - public boolean hasBlocks() { - return hasBlocks; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java index a86809a1fb15..881ae099d5f8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java @@ -474,14 +474,8 @@ else if (i - firstSegmentWithDeletions == mergeFactor) { return spec; } - private static class SegmentInfoAndLevel implements Comparable { - final SegmentCommitInfo info; - final float level; - - public SegmentInfoAndLevel(SegmentCommitInfo info, float level) { - this.info = info; - this.level = level; - } + private record SegmentInfoAndLevel(SegmentCommitInfo info, float level) + implements Comparable { // Sorts largest to smallest @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java index fdde920c191b..f112354db62d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java @@ -68,7 +68,7 @@ MappingMultiPostingsEnum reset(MultiPostingsEnum postingsEnum) throws IOExceptio int count = postingsEnum.getNumSubs(); subs.clear(); for (int i = 0; i < count; i++) { - MappingPostingsSub sub = allSubs[subsArray[i].slice.readerIndex]; + MappingPostingsSub sub = allSubs[subsArray[i].slice.readerIndex()]; sub.postings = subsArray[i].postingsEnum; subs.add(sub); } diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java index 2eaab9c58f85..d66f5648c03d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java @@ -27,6 +27,7 @@ import java.util.concurrent.CancellationException; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; @@ -292,7 +293,7 @@ final void close( * Wrap a reader prior to merging in order to add/remove fields or documents. * *

    NOTE: It is illegal to reorder doc IDs here, use {@link - * #reorder(CodecReader,Directory)} instead. + * #reorder(CodecReader,Directory,Executor)} instead. */ public CodecReader wrapForMerge(CodecReader reader) throws IOException { return reader; @@ -308,9 +309,12 @@ public CodecReader wrapForMerge(CodecReader reader) throws IOException { * * @param reader The reader to reorder. * @param dir The {@link Directory} of the index, which may be used to create temporary files. + * @param executor An executor that can be used to parallelize the reordering logic. May be + * {@code null} if no concurrency is supported. * @lucene.experimental */ - public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException { + public Sorter.DocMap reorder(CodecReader reader, Directory dir, Executor executor) + throws IOException { return null; } diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 31bdf7ce8d31..838699215f0d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -236,7 +236,7 @@ private static void verifyIndexSort(List readers, SegmentInfo segme return; } for (CodecReader leaf : readers) { - Sort segmentSort = leaf.getMetaData().getSort(); + Sort segmentSort = leaf.getMetaData().sort(); if (segmentSort == null || isCongruentSort(indexSort, segmentSort) == false) { throw new IllegalArgumentException( "index sort mismatch: merged segment has sort=" diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java index 3fe19b849794..f463860c078f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java @@ -115,7 +115,7 @@ public int advance(int target) throws IOException { } else { upto++; current = subs[upto].postingsEnum; - currentBase = subs[upto].slice.start; + currentBase = subs[upto].slice.start(); } } } @@ -129,7 +129,7 @@ public int nextDoc() throws IOException { } else { upto++; current = subs[upto].postingsEnum; - currentBase = subs[upto].slice.start; + currentBase = subs[upto].slice.start(); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 9a8e48e5d3eb..5c3c25be55e5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -70,10 +70,10 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE } if (metaData.hasBlocks() && fieldInfos.getParentField() == null - && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { + && metaData.createdVersionMajor() >= Version.LUCENE_10_0_0.major) { throw new CorruptIndexException( "parent field is not set but the index has blocks and uses index sorting. 
indexCreatedVersionMajor: " - + metaData.getCreatedVersionMajor(), + + metaData.createdVersionMajor(), "IndexingChain"); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java index f4cbb4cc1f5b..144410f4cd9b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.Comparator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -32,14 +31,6 @@ */ public final class MultiTermsEnum extends BaseTermsEnum { - private static final Comparator INDEX_COMPARATOR = - new Comparator() { - @Override - public int compare(TermsEnumWithSlice o1, TermsEnumWithSlice o2) { - return o1.subIndex - o2.subIndex; - } - }; - private final TermMergeQueue queue; // all of our subs (one per sub-reader) private final TermsEnumWithSlice[] subs; @@ -338,7 +329,7 @@ public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { int upto = 0; - ArrayUtil.timSort(top, 0, numTop, INDEX_COMPARATOR); + ArrayUtil.timSort(top, 0, numTop, (o1, o2) -> o1.subIndex - o2.subIndex); for (int i = 0; i < numTop; i++) { @@ -370,7 +361,7 @@ static final class TermsEnumWithSlice extends TermsEnumIndex { public TermsEnumWithSlice(int index, ReaderSlice subSlice) { super(null, index); this.subSlice = subSlice; - assert subSlice.length >= 0 : "length=" + subSlice.length; + assert subSlice.length() >= 0 : "length=" + subSlice.length(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index 1f1e2dba9c12..c3ace74fb5b6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -48,6 +48,7 @@ * behavior. 
*/ public class ParallelLeafReader extends LeafReader { + private final FieldInfos fieldInfos; private final LeafReader[] parallelReaders, storedFieldsReaders; private final Set completeReaderSet = @@ -128,7 +129,7 @@ public ParallelLeafReader( for (final LeafReader reader : this.parallelReaders) { LeafMetaData leafMetaData = reader.getMetaData(); - Sort leafIndexSort = leafMetaData.getSort(); + Sort leafIndexSort = leafMetaData.sort(); if (indexSort == null) { indexSort = leafIndexSort; } else if (leafIndexSort != null && indexSort.equals(leafIndexSort) == false) { @@ -140,13 +141,13 @@ public ParallelLeafReader( } if (createdVersionMajor == -1) { - createdVersionMajor = leafMetaData.getCreatedVersionMajor(); - } else if (createdVersionMajor != leafMetaData.getCreatedVersionMajor()) { + createdVersionMajor = leafMetaData.createdVersionMajor(); + } else if (createdVersionMajor != leafMetaData.createdVersionMajor()) { throw new IllegalArgumentException( "cannot combine LeafReaders that have different creation versions: saw both version=" + createdVersionMajor + " and " - + leafMetaData.getCreatedVersionMajor()); + + leafMetaData.createdVersionMajor()); } final FieldInfos readerFieldInfos = reader.getFieldInfos(); @@ -158,7 +159,7 @@ public ParallelLeafReader( // only add these if the reader responsible for that field name is the current: // TODO consider populating 1st leaf with vectors even if the field name has been seen on // a previous leaf - if (fieldInfo.hasVectors()) { + if (fieldInfo.hasTermVectors()) { tvFieldToReader.put(fieldInfo.name, reader); } // TODO consider populating 1st leaf with terms even if the field name has been seen on a @@ -177,7 +178,7 @@ public ParallelLeafReader( Version minVersion = Version.LATEST; boolean hasBlocks = false; for (final LeafReader reader : this.parallelReaders) { - Version leafVersion = reader.getMetaData().getMinVersion(); + Version leafVersion = reader.getMetaData().minVersion(); hasBlocks |= reader.getMetaData().hasBlocks(); if (leafVersion == null) { minVersion = null; @@ -325,14 +326,32 @@ public CacheHelper getReaderCacheHelper() { @Override public TermVectors termVectors() throws IOException { ensureOpen(); - // TODO: optimize + + Map readerToTermVectors = new IdentityHashMap<>(); + for (LeafReader reader : parallelReaders) { + if (reader.getFieldInfos().hasTermVectors()) { + TermVectors termVectors = reader.termVectors(); + readerToTermVectors.put(reader, termVectors); + } + } + return new TermVectors() { + @Override + public void prefetch(int docID) throws IOException { + // Prefetch all vectors. Note that this may be wasteful if the consumer doesn't need to read + // all the fields but we have no way to know what fields the consumer needs. 
+ for (TermVectors termVectors : readerToTermVectors.values()) { + termVectors.prefetch(docID); + } + } + @Override public Fields get(int docID) throws IOException { ParallelFields fields = null; for (Map.Entry ent : tvFieldToReader.entrySet()) { String fieldName = ent.getKey(); - Terms vector = ent.getValue().termVectors().get(docID, fieldName); + TermVectors termVectors = readerToTermVectors.get(ent.getValue()); + Terms vector = termVectors.get(docID, fieldName); if (vector != null) { if (fields == null) { fields = new ParallelFields(); diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java index 557d31ad4415..63c021660c78 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java @@ -76,15 +76,14 @@ void onNewReader(CodecReader reader, SegmentCommitInfo info) throws IOException hardDeletes.onNewReader(reader, info); // only re-calculate this if we haven't seen this generation if (dvGeneration < info.getDocValuesGen()) { - final DocIdSetIterator iterator = - FieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); - int newDelCount; - if (iterator - != null) { // nothing is deleted we don't have a soft deletes field in this segment - assert info.info.maxDoc() > 0 : "maxDoc is 0"; + final int newDelCount; + var iterator = FieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); + if (iterator != null && iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + iterator = FieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); newDelCount = applySoftDeletes(iterator, getMutableBits()); assert newDelCount >= 0 : " illegal pending delete count: " + newDelCount; } else { + // nothing is deleted we don't have a soft deletes field in this segment newDelCount = 0; } assert info.getSoftDelCount() == newDelCount @@ -227,12 +226,7 @@ private FieldInfos readFieldInfos() throws IOException { // updates always outside of CFS Closeable toClose; if (segInfo.getUseCompoundFile()) { - toClose = - dir = - segInfo - .getCodec() - .compoundFormat() - .getCompoundReader(segInfo.dir, segInfo, IOContext.READONCE); + toClose = dir = segInfo.getCodec().compoundFormat().getCompoundReader(segInfo.dir, segInfo); } else { toClose = null; dir = segInfo.dir; diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderSlice.java b/lucene/core/src/java/org/apache/lucene/index/ReaderSlice.java index eec6fb76221a..ff54c392f14f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReaderSlice.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReaderSlice.java @@ -19,31 +19,13 @@ /** * Subreader slice from a parent composite reader. * + * @param start Document ID this slice starts from. + * @param length Number of documents in this slice. + * @param readerIndex Sub-reader index for this slice. * @lucene.internal */ -public final class ReaderSlice { +public record ReaderSlice(int start, int length, int readerIndex) { /** Zero-length {@code ReaderSlice} array. */ public static final ReaderSlice[] EMPTY_ARRAY = new ReaderSlice[0]; - - /** Document ID this slice starts from. */ - public final int start; - - /** Number of documents in this slice. */ - public final int length; - - /** Sub-reader index for this slice. */ - public final int readerIndex; - - /** Sole constructor. 
*/ - public ReaderSlice(int start, int length, int readerIndex) { - this.start = start; - this.length = length; - this.readerIndex = readerIndex; - } - - @Override - public String toString() { - return "slice start=" + start + " length=" + length + " readerIndex=" + readerIndex; - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 9d4f79a5586a..e39861a0671b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -708,12 +708,12 @@ private FieldInfo cloneFieldInfo(FieldInfo fi, int fieldNumber) { return new FieldInfo( fi.name, fieldNumber, - fi.hasVectors(), + fi.hasTermVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), - fi.hasDocValuesSkipIndex(), + fi.docValuesSkipIndexType(), fi.getDocValuesGen(), new HashMap<>(fi.attributes()), fi.getPointDimensionCount(), diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java index 68970de6d1be..7da6d77136c2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java @@ -80,7 +80,7 @@ final class SegmentCoreReaders { try { if (si.info.getUseCompoundFile()) { - cfsDir = cfsReader = codec.compoundFormat().getCompoundReader(dir, si.info, context); + cfsDir = cfsReader = codec.compoundFormat().getCompoundReader(dir, si.info); } else { cfsReader = null; cfsDir = dir; @@ -117,7 +117,7 @@ final class SegmentCoreReaders { .storedFieldsFormat() .fieldsReader(cfsDir, si.info, coreFieldInfos, context); - if (coreFieldInfos.hasVectors()) { // open term vector files only as needed + if (coreFieldInfos.hasTermVectors()) { // open term vector files only as needed termVectorsReaderOrig = si.info .getCodec() diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java index 1d9878fe0dbc..0f4df818ddcb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java @@ -18,11 +18,10 @@ import java.io.IOException; import java.util.Collections; -import java.util.HashMap; import java.util.IdentityHashMap; -import java.util.Map; import java.util.Set; import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.internal.hppc.LongArrayList; import org.apache.lucene.store.Directory; @@ -32,7 +31,7 @@ // producer? 
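For reference, the holder classes converted to records in this patch (SegmentInfoAndLevel above, ReaderSlice here, and the DocValuesSub/PointValuesSub classes further down) all follow the same pattern: the record declaration replaces the hand-written fields, constructor and toString(), and call sites move from field reads to the generated accessors, which is why usages such as slice.start become slice.start(). A minimal sketch with hypothetical names, not part of this patch:

    class RecordSketch {
      // Hypothetical record mirroring ReaderSlice's shape; not the Lucene class itself.
      record Slice(int start, int length, int readerIndex) {}

      static int endExclusive(Slice slice) {
        // Call sites change from field reads (slice.start) to generated accessors (slice.start()).
        return slice.start() + slice.length();
      }
    }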
class SegmentDocValuesProducer extends DocValuesProducer { - final Map dvProducersByField = new HashMap<>(); + final IntObjectHashMap dvProducersByField = new IntObjectHashMap<>(); final Set dvProducers = Collections.newSetFromMap(new IdentityHashMap()); final LongArrayList dvGens = new LongArrayList(); @@ -67,7 +66,7 @@ class SegmentDocValuesProducer extends DocValuesProducer { dvGens.add(docValuesGen); dvProducers.add(baseProducer); } - dvProducersByField.put(fi.name, baseProducer); + dvProducersByField.put(fi.number, baseProducer); } else { assert !dvGens.contains(docValuesGen); // otherwise, producer sees only the one fieldinfo it wrote @@ -76,7 +75,7 @@ class SegmentDocValuesProducer extends DocValuesProducer { docValuesGen, si, dir, new FieldInfos(new FieldInfo[] {fi})); dvGens.add(docValuesGen); dvProducers.add(dvp); - dvProducersByField.put(fi.name, dvp); + dvProducersByField.put(fi.number, dvp); } } } catch (Throwable t) { @@ -91,42 +90,42 @@ class SegmentDocValuesProducer extends DocValuesProducer { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { - DocValuesProducer dvProducer = dvProducersByField.get(field.name); + DocValuesProducer dvProducer = dvProducersByField.get(field.number); assert dvProducer != null; return dvProducer.getNumeric(field); } @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - DocValuesProducer dvProducer = dvProducersByField.get(field.name); + DocValuesProducer dvProducer = dvProducersByField.get(field.number); assert dvProducer != null; return dvProducer.getBinary(field); } @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { - DocValuesProducer dvProducer = dvProducersByField.get(field.name); + DocValuesProducer dvProducer = dvProducersByField.get(field.number); assert dvProducer != null; return dvProducer.getSorted(field); } @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - DocValuesProducer dvProducer = dvProducersByField.get(field.name); + DocValuesProducer dvProducer = dvProducersByField.get(field.number); assert dvProducer != null; return dvProducer.getSortedNumeric(field); } @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - DocValuesProducer dvProducer = dvProducersByField.get(field.name); + DocValuesProducer dvProducer = dvProducersByField.get(field.number); assert dvProducer != null; return dvProducer.getSortedSet(field); } @Override public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { - DocValuesProducer dvProducer = dvProducersByField.get(field.name); + DocValuesProducer dvProducer = dvProducersByField.get(field.number); assert dvProducer != null; return dvProducer.getSkipper(field); } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index c2299427fa2e..5e336c7fef09 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -17,9 +17,7 @@ package org.apache.lucene.index; import java.io.IOException; -import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Callable; import java.util.concurrent.Executor; import java.util.concurrent.TimeUnit; import org.apache.lucene.codecs.Codec; @@ -31,7 +29,6 @@ import org.apache.lucene.codecs.PointsWriter; import org.apache.lucene.codecs.StoredFieldsWriter; import org.apache.lucene.codecs.TermVectorsWriter; 
-import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.util.InfoStream; @@ -52,6 +49,7 @@ final class SegmentMerger { final MergeState mergeState; private final FieldInfos.Builder fieldInfosBuilder; + final Thread mergeStateCreationThread; // note, just like in codec apis Directory 'dir' is NOT the same as segmentInfo.dir!! SegmentMerger( @@ -68,13 +66,14 @@ final class SegmentMerger { "IOContext.context should be MERGE; got: " + context.context()); } mergeState = new MergeState(readers, segmentInfo, infoStream, intraMergeTaskExecutor); + mergeStateCreationThread = Thread.currentThread(); directory = dir; this.codec = segmentInfo.getCodec(); this.context = context; this.fieldInfosBuilder = new FieldInfos.Builder(fieldNumbers); Version minVersion = Version.LATEST; for (CodecReader reader : readers) { - Version leafMinVersion = reader.getMetaData().getMinVersion(); + Version leafMinVersion = reader.getMetaData().minVersion(); if (leafMinVersion == null) { minVersion = null; break; @@ -99,6 +98,11 @@ boolean shouldMerge() { return mergeState.segmentInfo.maxDoc() > 0; } + private MergeState mergeState() { + assert Thread.currentThread() == mergeStateCreationThread; + return mergeState; + } + /** * Merges the readers into the directory passed to the constructor * @@ -135,36 +139,19 @@ MergeState merge() throws IOException { IOContext.DEFAULT, segmentWriteState.segmentSuffix); - TaskExecutor taskExecutor = new TaskExecutor(mergeState.intraMergeTaskExecutor); - List> mergingTasks = new ArrayList<>(); - mergingTasks.add( - () -> { - if (mergeState.mergeFieldInfos.hasNorms()) { - mergeWithLogging( - this::mergeNorms, segmentWriteState, segmentReadState, "norms", numMerged); - } + if (mergeState.mergeFieldInfos.hasNorms()) { + mergeWithLogging(this::mergeNorms, segmentWriteState, segmentReadState, "norms", numMerged); + } - mergeWithLogging( - this::mergeTerms, segmentWriteState, segmentReadState, "postings", numMerged); - return null; - }); + mergeWithLogging(this::mergeTerms, segmentWriteState, segmentReadState, "postings", numMerged); if (mergeState.mergeFieldInfos.hasDocValues()) { - mergingTasks.add( - () -> { - mergeWithLogging( - this::mergeDocValues, segmentWriteState, segmentReadState, "doc values", numMerged); - return null; - }); + mergeWithLogging( + this::mergeDocValues, segmentWriteState, segmentReadState, "doc values", numMerged); } if (mergeState.mergeFieldInfos.hasPointValues()) { - mergingTasks.add( - () -> { - mergeWithLogging( - this::mergePoints, segmentWriteState, segmentReadState, "points", numMerged); - return null; - }); + mergeWithLogging(this::mergePoints, segmentWriteState, segmentReadState, "points", numMerged); } if (mergeState.mergeFieldInfos.hasVectorValues()) { @@ -176,15 +163,10 @@ MergeState merge() throws IOException { numMerged); } - if (mergeState.mergeFieldInfos.hasVectors()) { - mergingTasks.add( - () -> { - mergeWithLogging(this::mergeTermVectors, "term vectors"); - return null; - }); + if (mergeState.mergeFieldInfos.hasTermVectors()) { + mergeWithLogging(this::mergeTermVectors, "term vectors"); } - taskExecutor.invokeAll(mergingTasks); // write the merged infos mergeWithLogging( this::mergeFieldInfos, segmentWriteState, segmentReadState, "field infos", numMerged); @@ -201,6 +183,7 @@ private void mergeFieldInfos( private void mergeDocValues( SegmentWriteState segmentWriteState, SegmentReadState segmentReadState) throws IOException { + MergeState mergeState = 
mergeState(); try (DocValuesConsumer consumer = codec.docValuesFormat().fieldsConsumer(segmentWriteState)) { consumer.merge(mergeState); } @@ -208,6 +191,7 @@ private void mergeDocValues( private void mergePoints(SegmentWriteState segmentWriteState, SegmentReadState segmentReadState) throws IOException { + MergeState mergeState = mergeState(); try (PointsWriter writer = codec.pointsFormat().fieldsWriter(segmentWriteState)) { writer.merge(mergeState); } @@ -215,6 +199,7 @@ private void mergePoints(SegmentWriteState segmentWriteState, SegmentReadState s private void mergeNorms(SegmentWriteState segmentWriteState, SegmentReadState segmentReadState) throws IOException { + MergeState mergeState = mergeState(); try (NormsConsumer consumer = codec.normsFormat().normsConsumer(segmentWriteState)) { consumer.merge(mergeState); } @@ -222,6 +207,7 @@ private void mergeNorms(SegmentWriteState segmentWriteState, SegmentReadState se private void mergeTerms(SegmentWriteState segmentWriteState, SegmentReadState segmentReadState) throws IOException { + MergeState mergeState = mergeState(); try (NormsProducer norms = mergeState.mergeFieldInfos.hasNorms() ? codec.normsFormat().normsProducer(segmentReadState) @@ -256,6 +242,7 @@ public void mergeFieldInfos() { * @throws IOException if there is a low-level IO error */ private int mergeFields() throws IOException { + MergeState mergeState = mergeState(); try (StoredFieldsWriter fieldsWriter = codec.storedFieldsFormat().fieldsWriter(directory, mergeState.segmentInfo, context)) { return fieldsWriter.merge(mergeState); @@ -268,6 +255,7 @@ private int mergeFields() throws IOException { * @throws IOException if there is a low-level IO error */ private int mergeTermVectors() throws IOException { + MergeState mergeState = mergeState(); try (TermVectorsWriter termVectorsWriter = codec.termVectorsFormat().vectorsWriter(directory, mergeState.segmentInfo, context)) { int numMerged = termVectorsWriter.merge(mergeState); @@ -278,6 +266,7 @@ private int mergeTermVectors() throws IOException { private void mergeVectorValues( SegmentWriteState segmentWriteState, SegmentReadState segmentReadState) throws IOException { + MergeState mergeState = mergeState(); try (KnnVectorsWriter writer = codec.knnVectorsFormat().fieldsWriter(segmentWriteState)) { writer.merge(mergeState); } diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java index 4d05d241e699..57836ae482fd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java @@ -296,6 +296,11 @@ private static TermVectorsReader readerToTermVectorsReader(final LeafReader read throw new UncheckedIOException(e); } return new TermVectorsReader() { + @Override + public void prefetch(int docID) throws IOException { + termVectors.prefetch(docID); + } + @Override public Fields get(int docID) throws IOException { return termVectors.get(docID); diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index fc6c1d9b2941..69d557d270ae 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -34,9 +34,7 @@ import org.apache.lucene.codecs.TermVectorsReader; import 
org.apache.lucene.index.MultiDocValues.MultiSortedDocValues; import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.VectorScorer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; @@ -81,15 +79,15 @@ private SlowCompositeCodecReaderWrapper(List codecReaders) throws I for (CodecReader reader : codecReaders) { LeafMetaData readerMeta = reader.getMetaData(); if (majorVersion == -1) { - majorVersion = readerMeta.getCreatedVersionMajor(); - } else if (majorVersion != readerMeta.getCreatedVersionMajor()) { + majorVersion = readerMeta.createdVersionMajor(); + } else if (majorVersion != readerMeta.createdVersionMajor()) { throw new IllegalArgumentException( "Cannot combine leaf readers created with different major versions"); } if (minVersion == null) { - minVersion = readerMeta.getMinVersion(); - } else if (minVersion.onOrAfter(readerMeta.getMinVersion())) { - minVersion = readerMeta.getMinVersion(); + minVersion = readerMeta.minVersion(); + } else if (minVersion.onOrAfter(readerMeta.minVersion())) { + minVersion = readerMeta.minVersion(); } hasBlocks |= readerMeta.hasBlocks(); } @@ -245,6 +243,15 @@ public void checkIntegrity() throws IOException { } } + @Override + public void prefetch(int doc) throws IOException { + int readerId = docIdToReaderId(doc); + TermVectorsReader reader = readers[readerId]; + if (reader != null) { + reader.prefetch(doc - docStarts[readerId]); + } + } + @Override public Fields get(int doc) throws IOException { int readerId = docIdToReaderId(doc); @@ -294,48 +301,21 @@ public void checkIntegrity() throws IOException { } } - private static class DocValuesSub { - private final T sub; - private final int docStart; - private final int docEnd; + private record DocValuesSub(T sub, int docStart, int ordStart) {} - DocValuesSub(T sub, int docStart, int docEnd) { - this.sub = sub; - this.docStart = docStart; - this.docEnd = docEnd; - } - } - - private static class MergedDocIdSetIterator extends DocIdSetIterator { + private static class MergedDocIterator + extends KnnVectorValues.DocIndexIterator { final Iterator> it; - final long cost; DocValuesSub current; - int currentIndex = 0; + KnnVectorValues.DocIndexIterator currentIterator; + int ord = -1; int doc = -1; - MergedDocIdSetIterator(List> subs) { - long cost = 0; - for (DocValuesSub sub : subs) { - if (sub.sub != null) { - cost += sub.sub.cost(); - } - } - this.cost = cost; + MergedDocIterator(List> subs) { this.it = subs.iterator(); current = it.next(); - } - - private boolean advanceSub(int target) { - while (current.sub == null || current.docEnd <= target) { - if (it.hasNext() == false) { - doc = NO_MORE_DOCS; - return false; - } - current = it.next(); - currentIndex++; - } - return true; + currentIterator = currentIterator(); } @Override @@ -343,41 +323,47 @@ public int docID() { return doc; } + @Override + public int index() { + return ord; + } + @Override public int nextDoc() throws IOException { while (true) { if (current.sub != null) { - int next = current.sub.nextDoc(); + int next = currentIterator.nextDoc(); if (next != NO_MORE_DOCS) { + ++ord; return doc = current.docStart + next; } } if (it.hasNext() == false) { + ord = NO_MORE_DOCS; return doc = NO_MORE_DOCS; } current = it.next(); - currentIndex++; + currentIterator = currentIterator(); + ord = current.ordStart - 1; } } - @Override - public int 
advance(int target) throws IOException { - while (true) { - if (advanceSub(target) == false) { - return DocIdSetIterator.NO_MORE_DOCS; - } - int next = current.sub.advance(target - current.docStart); - if (next == DocIdSetIterator.NO_MORE_DOCS) { - target = current.docEnd; - } else { - return doc = current.docStart + next; - } + private KnnVectorValues.DocIndexIterator currentIterator() { + if (current.sub != null) { + return current.sub.iterator(); + } else { + return null; } } @Override public long cost() { - return cost; + throw new UnsupportedOperationException(); + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); } } @@ -565,11 +551,8 @@ public PointsReader getPointsReader() { return new SlowCompositePointsReaderWrapper(codecReaders, docStarts); } - private static class PointValuesSub { - private final PointValues sub; - private final int docBase; - - PointValuesSub(PointValues sub, int docBase) { + private record PointValuesSub(PointValues sub, int docBase) { + private PointValuesSub(PointValues sub, int docBase) { this.sub = Objects.requireNonNull(sub); this.docBase = docBase; } @@ -852,55 +835,75 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException { int size = 0; for (CodecReader reader : codecReaders) { FloatVectorValues values = reader.getFloatVectorValues(field); + subs.add(new DocValuesSub<>(values, docStarts[i], size)); if (values != null) { if (dimension == -1) { dimension = values.dimension(); } size += values.size(); } - subs.add(new DocValuesSub<>(values, docStarts[i], docStarts[i + 1])); i++; } - final int finalDimension = dimension; - final int finalSize = size; - MergedDocIdSetIterator mergedIterator = new MergedDocIdSetIterator<>(subs); - return new FloatVectorValues() { - - @Override - public int dimension() { - return finalDimension; - } + return new MergedFloatVectorValues(dimension, size, subs); + } - @Override - public int size() { - return finalSize; + class MergedFloatVectorValues extends FloatVectorValues { + final int dimension; + final int size; + final DocValuesSub[] subs; + final MergedDocIterator iter; + final int[] starts; + int lastSubIndex; + + MergedFloatVectorValues(int dimension, int size, List> subs) { + this.dimension = dimension; + this.size = size; + this.subs = subs.toArray(new DocValuesSub[0]); + iter = new MergedDocIterator<>(subs); + // [0, start(1), ..., size] - we want the extra element + // to avoid checking for out-of-array bounds + starts = new int[subs.size() + 1]; + for (int i = 0; i < subs.size(); i++) { + starts[i] = subs.get(i).ordStart; } + starts[starts.length - 1] = size; + } - @Override - public float[] vectorValue() throws IOException { - return mergedIterator.current.sub.vectorValue(); - } + @Override + public MergedDocIterator iterator() { + return iter; + } - @Override - public int docID() { - return mergedIterator.docID(); - } + @Override + public int dimension() { + return dimension; + } - @Override - public int nextDoc() throws IOException { - return mergedIterator.nextDoc(); - } + @Override + public int size() { + return size; + } - @Override - public int advance(int target) throws IOException { - return mergedIterator.advance(target); + @SuppressWarnings("unchecked") + @Override + public FloatVectorValues copy() throws IOException { + List> subsCopy = new ArrayList<>(); + for (Object sub : subs) { + subsCopy.add((DocValuesSub) sub); } + return new MergedFloatVectorValues(dimension, size, subsCopy); + } - @Override - public 
VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); - } - }; + @Override + public float[] vectorValue(int ord) throws IOException { + assert ord >= 0 && ord < size; + // We need to implement fully random-access API here in order to support callers like + // SortingCodecReader that rely on it. + lastSubIndex = findSub(ord, lastSubIndex, starts); + assert subs[lastSubIndex].sub != null; + return ((FloatVectorValues) subs[lastSubIndex].sub) + .vectorValue(ord - subs[lastSubIndex].ordStart); + } } @Override @@ -911,55 +914,101 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { int size = 0; for (CodecReader reader : codecReaders) { ByteVectorValues values = reader.getByteVectorValues(field); + subs.add(new DocValuesSub<>(values, docStarts[i], size)); if (values != null) { if (dimension == -1) { dimension = values.dimension(); } size += values.size(); } - subs.add(new DocValuesSub<>(values, docStarts[i], docStarts[i + 1])); i++; } - final int finalDimension = dimension; - final int finalSize = size; - MergedDocIdSetIterator mergedIterator = new MergedDocIdSetIterator<>(subs); - return new ByteVectorValues() { + return new MergedByteVectorValues(dimension, size, subs); + } - @Override - public int dimension() { - return finalDimension; + class MergedByteVectorValues extends ByteVectorValues { + final int dimension; + final int size; + final DocValuesSub[] subs; + final MergedDocIterator iter; + final int[] starts; + int lastSubIndex; + + MergedByteVectorValues(int dimension, int size, List> subs) { + this.dimension = dimension; + this.size = size; + this.subs = subs.toArray(new DocValuesSub[0]); + iter = new MergedDocIterator<>(subs); + // [0, start(1), ..., size] - we want the extra element + // to avoid checking for out-of-array bounds + starts = new int[subs.size() + 1]; + for (int i = 0; i < subs.size(); i++) { + starts[i] = subs.get(i).ordStart; } + starts[starts.length - 1] = size; + } - @Override - public int size() { - return finalSize; - } + @Override + public MergedDocIterator iterator() { + return iter; + } - @Override - public byte[] vectorValue() throws IOException { - return mergedIterator.current.sub.vectorValue(); - } + @Override + public int dimension() { + return dimension; + } - @Override - public int docID() { - return mergedIterator.docID(); - } + @Override + public int size() { + return size; + } - @Override - public int nextDoc() throws IOException { - return mergedIterator.nextDoc(); + @Override + public byte[] vectorValue(int ord) throws IOException { + assert ord >= 0 && ord < size; + // We need to implement fully random-access API here in order to support callers like + // SortingCodecReader that rely on it. We maintain lastSubIndex since we expect some + // repetition. 
+ lastSubIndex = findSub(ord, lastSubIndex, starts); + return ((ByteVectorValues) subs[lastSubIndex].sub) + .vectorValue(ord - subs[lastSubIndex].ordStart); + } + + @SuppressWarnings("unchecked") + @Override + public ByteVectorValues copy() throws IOException { + List> newSubs = new ArrayList<>(); + for (Object sub : subs) { + newSubs.add((DocValuesSub) sub); } + return new MergedByteVectorValues(dimension, size, newSubs); + } + } - @Override - public int advance(int target) throws IOException { - return mergedIterator.advance(target); + private static int findSub(int ord, int lastSubIndex, int[] starts) { + if (ord >= starts[lastSubIndex]) { + if (ord >= starts[lastSubIndex + 1]) { + return binarySearchStarts(starts, ord, lastSubIndex + 1, starts.length); } + } else { + return binarySearchStarts(starts, ord, 0, lastSubIndex); + } + return lastSubIndex; + } - @Override - public VectorScorer scorer(byte[] target) { - throw new UnsupportedOperationException(); + private static int binarySearchStarts(int[] starts, int ord, int from, int to) { + int pos = Arrays.binarySearch(starts, from, to, ord); + if (pos < 0) { + // subtract one since binarySearch returns an *insertion point* + return -2 - pos; + } else { + while (pos < starts.length - 1 && starts[pos + 1] == ord) { + // Arrays.binarySearch can return any of a sequence of repeated value + // but we always want the last one + ++pos; } - }; + return pos; + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 741dfc6944a6..619496d108d8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -222,10 +222,10 @@ DocMap sort(LeafReader reader) throws IOException { } if (metaData.hasBlocks() && fieldInfos.getParentField() == null - && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { + && metaData.createdVersionMajor() >= Version.LUCENE_10_0_0.major) { throw new CorruptIndexException( "parent field is not set but the index has blocks. 
indexCreatedVersionMajor: " - + metaData.getCreatedVersionMajor(), + + metaData.createdVersionMajor(), "Sorter"); } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index ff88e30de4a5..daec0c197d6a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -25,6 +25,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Objects; +import java.util.function.Supplier; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.KnnVectorsReader; @@ -32,10 +33,11 @@ import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; -import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOSupplier; @@ -52,15 +54,7 @@ */ public final class SortingCodecReader extends FilterCodecReader { - private static class SortingBits implements Bits { - - private final Bits in; - private final Sorter.DocMap docMap; - - SortingBits(final Bits in, Sorter.DocMap docMap) { - this.in = in; - this.docMap = docMap; - } + private record SortingBits(Bits in, Sorter.DocMap docMap) implements Bits { @Override public boolean get(int index) { @@ -214,121 +208,175 @@ public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue } } - /** Sorting FloatVectorValues that iterate over documents in the order of the provided sortMap */ - private static class SortingFloatVectorValues extends FloatVectorValues { - final int size; - final int dimension; - final FixedBitSet docsWithField; - final float[][] vectors; + /** + * Factory for SortingValuesIterator. This enables us to create new iterators as needed without + * recomputing the sorting mappings. + */ + static class SortingIteratorSupplier implements Supplier { + private final FixedBitSet docBits; + private final int[] docToOrd; + private final int size; - private int docId = -1; + SortingIteratorSupplier(FixedBitSet docBits, int[] docToOrd, int size) { + this.docBits = docBits; + this.docToOrd = docToOrd; + this.size = size; + } - SortingFloatVectorValues(FloatVectorValues delegate, Sorter.DocMap sortMap) throws IOException { - this.size = delegate.size(); - this.dimension = delegate.dimension(); - docsWithField = new FixedBitSet(sortMap.size()); - vectors = new float[sortMap.size()][]; - for (int doc = delegate.nextDoc(); doc != NO_MORE_DOCS; doc = delegate.nextDoc()) { - int newDocID = sortMap.oldToNew(doc); - docsWithField.set(newDocID); - vectors[newDocID] = delegate.vectorValue().clone(); - } + @Override + public SortingValuesIterator get() { + return new SortingValuesIterator(docBits, docToOrd, size); + } + + public int size() { + return size; + } + } + + /** + * Creates a factory for SortingValuesIterator. Does the work of computing the (new docId to old + * ordinal) mapping, and caches the result, enabling it to create new iterators cheaply. 
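To make the cached mapping concrete, here is a small standalone illustration using plain arrays and hypothetical values (not the Lucene classes): the supplier records, for every new doc ID that has a vector, the ordinal the delegate assigned to it in the old order, so the sorted iterator can walk docs in new-doc order while index() keeps returning ordinals in the delegate's ord-space.

    // Hypothetical example: three vectors at old docs {2, 5, 9}, with ordinals {0, 1, 2}.
    // Suppose the sort maps old->new doc IDs as 2->7, 5->0, 9->3.
    int[] oldDocsWithVector = {2, 5, 9};
    int[] oldToNew = new int[10];
    oldToNew[2] = 7;
    oldToNew[5] = 0;
    oldToNew[9] = 3;

    boolean[] docBits = new boolean[10]; // stands in for the FixedBitSet
    int[] docToOrd = new int[10];
    for (int ord = 0; ord < oldDocsWithVector.length; ord++) {
      int newDoc = oldToNew[oldDocsWithVector[ord]];
      docBits[newDoc] = true;
      docToOrd[newDoc] = ord;
    }
    // Walking the set bits in new-doc order visits docs 0, 3, 7 and yields ordinals 1, 2, 0:
    // index() is not monotonic, exactly as the SortingValuesIterator javadoc below points out.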
+ * + * @param values the values over which to iterate + * @param docMap the mapping from "old" docIds to "new" (sorted) docIds. + */ + public static SortingIteratorSupplier iteratorSupplier( + KnnVectorValues values, Sorter.DocMap docMap) throws IOException { + + final int[] docToOrd = new int[docMap.size()]; + final FixedBitSet docBits = new FixedBitSet(docMap.size()); + int count = 0; + // Note: docToOrd will contain zero for docids that have no vector. This is OK though + // because the iterator cannot be positioned on such docs + KnnVectorValues.DocIndexIterator iter = values.iterator(); + for (int doc = iter.nextDoc(); doc != NO_MORE_DOCS; doc = iter.nextDoc()) { + int newDocId = docMap.oldToNew(doc); + if (newDocId != -1) { + docToOrd[newDocId] = iter.index(); + docBits.set(newDocId); + ++count; + } + } + return new SortingIteratorSupplier(docBits, docToOrd, count); + } + + /** + * Iterator over KnnVectorValues accepting a mapping to differently-sorted docs. Consequently + * index() may skip around, not increasing monotonically as iteration proceeds. + */ + public static class SortingValuesIterator extends KnnVectorValues.DocIndexIterator { + private final FixedBitSet docBits; + private final DocIdSetIterator docsWithValues; + private final int[] docToOrd; + + int doc = -1; + + SortingValuesIterator(FixedBitSet docBits, int[] docToOrd, int size) { + this.docBits = docBits; + this.docToOrd = docToOrd; + docsWithValues = new BitSetIterator(docBits, size); } @Override public int docID() { - return docId; + return doc; + } + + @Override + public int index() { + assert docBits.get(doc); + return docToOrd[doc]; } @Override public int nextDoc() throws IOException { - return advance(docId + 1); + if (doc != NO_MORE_DOCS) { + doc = docsWithValues.nextDoc(); + } + return doc; } @Override - public float[] vectorValue() throws IOException { - return vectors[docId]; + public long cost() { + return docBits.cardinality(); + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + } + + /** Sorting FloatVectorValues that maps ordinals using the provided sortMap */ + private static class SortingFloatVectorValues extends FloatVectorValues { + final FloatVectorValues delegate; + final SortingIteratorSupplier iteratorSupplier; + + SortingFloatVectorValues(FloatVectorValues delegate, Sorter.DocMap sortMap) throws IOException { + this.delegate = delegate; + // SortingValuesIterator consumes the iterator and records the docs and ord mapping + iteratorSupplier = iteratorSupplier(delegate, sortMap); + } + + @Override + public float[] vectorValue(int ord) throws IOException { + // ords are interpreted in the delegate's ord-space. 
+ return delegate.vectorValue(ord); } @Override public int dimension() { - return dimension; + return delegate.dimension(); } @Override public int size() { - return size; + return iteratorSupplier.size(); } @Override - public int advance(int target) throws IOException { - if (target >= docsWithField.length()) { - return NO_MORE_DOCS; - } - return docId = docsWithField.nextSetBit(target); + public FloatVectorValues copy() { + throw new UnsupportedOperationException(); } @Override - public VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return iteratorSupplier.get(); } } private static class SortingByteVectorValues extends ByteVectorValues { - final int size; - final int dimension; - final FixedBitSet docsWithField; - final byte[][] vectors; - - private int docId = -1; + final ByteVectorValues delegate; + final SortingIteratorSupplier iteratorSupplier; SortingByteVectorValues(ByteVectorValues delegate, Sorter.DocMap sortMap) throws IOException { - this.size = delegate.size(); - this.dimension = delegate.dimension(); - docsWithField = new FixedBitSet(sortMap.size()); - vectors = new byte[sortMap.size()][]; - for (int doc = delegate.nextDoc(); doc != NO_MORE_DOCS; doc = delegate.nextDoc()) { - int newDocID = sortMap.oldToNew(doc); - docsWithField.set(newDocID); - vectors[newDocID] = delegate.vectorValue().clone(); - } - } - - @Override - public int docID() { - return docId; + this.delegate = delegate; + // SortingValuesIterator consumes the iterator and records the docs and ord mapping + iteratorSupplier = iteratorSupplier(delegate, sortMap); } @Override - public int nextDoc() throws IOException { - return advance(docId + 1); + public byte[] vectorValue(int ord) throws IOException { + return delegate.vectorValue(ord); } @Override - public byte[] vectorValue() throws IOException { - return vectors[docId]; + public DocIndexIterator iterator() { + return iteratorSupplier.get(); } @Override public int dimension() { - return dimension; + return delegate.dimension(); } @Override public int size() { - return size; + return iteratorSupplier.size(); } @Override - public int advance(int target) throws IOException { - if (target >= docsWithField.length()) { - return NO_MORE_DOCS; - } - return docId = docsWithField.nextSetBit(target); - } - - @Override - public VectorScorer scorer(byte[] target) { + public ByteVectorValues copy() { throw new UnsupportedOperationException(); } } @@ -349,10 +397,7 @@ public static CodecReader wrap(CodecReader reader, Sorter.DocMap docMap, Sort so LeafMetaData metaData = reader.getMetaData(); LeafMetaData newMetaData = new LeafMetaData( - metaData.getCreatedVersionMajor(), - metaData.getMinVersion(), - sort, - metaData.hasBlocks()); + metaData.createdVersionMajor(), metaData.minVersion(), sort, metaData.hasBlocks()); if (docMap == null) { // the reader is already sorted return new FilterCodecReader(reader) { @@ -666,6 +711,11 @@ public TermVectorsReader getTermVectorsReader() { private TermVectorsReader newTermVectorsReader(TermVectorsReader delegate) { return new TermVectorsReader() { + @Override + public void prefetch(int doc) throws IOException { + delegate.prefetch(docMap.newToOld(doc)); + } + @Override public Fields get(int doc) throws IOException { return delegate.get(docMap.newToOld(doc)); @@ -744,8 +794,8 @@ private boolean assertCreatedOnlyOnce(String field, boolean norms) { boolean isSortField = false; // For things that aren't sort fields, it's possible for sort to be null here // In the 
event that we accidentally cache twice, its better not to throw an NPE - if (metaData.getSort() != null) { - for (SortField sf : metaData.getSort().getSort()) { + if (metaData.sort() != null) { + for (SortField sf : metaData.sort().getSort()) { if (field.equals(sf.getField())) { isSortField = true; break; diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java index 1bde9d4d5973..78f2f726a2f1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java @@ -93,9 +93,8 @@ void finishDocument() throws IOException { void finish(int maxDoc) throws IOException { while (lastDoc < maxDoc - 1) { - startDocument(lastDoc); + startDocument(lastDoc + 1); finishDocument(); - ++lastDoc; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectors.java b/lucene/core/src/java/org/apache/lucene/index/TermVectors.java index fe2e8b56681a..fde57b48c932 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectors.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectors.java @@ -18,6 +18,7 @@ import java.io.IOException; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // javadocs +import org.apache.lucene.store.IndexInput; /** * API for reading term vectors. @@ -30,6 +31,18 @@ public abstract class TermVectors { /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ protected TermVectors() {} + /** + * Optional method: Give a hint to this {@link TermVectors} instance that the given document will + * be read in the near future. This typically delegates to {@link IndexInput#prefetch} and is + * useful to parallelize I/O across multiple documents. + * + *
<p>
    NOTE: This API is expected to be called on a small enough set of doc IDs that they could all + * fit in the page cache. If you plan on retrieving a very large number of documents, it may be a + * good idea to perform calls to {@link #prefetch} and {@link #get} in batches instead of + * prefetching all documents up-front. + */ + public void prefetch(int docID) throws IOException {} + /** * Returns term vectors for this document, or null if term vectors were not indexed. * diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index 702df660c442..2fb0c0783a2e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -438,7 +438,10 @@ public MergeSpecification findMerges( } // allowedSegCount may occasionally be less than segsPerTier // if segment sizes are below the floor size - allowedSegCount = Math.max(allowedSegCount, Math.max(segsPerTier, targetSearchConcurrency)); + allowedSegCount = Math.max(allowedSegCount, segsPerTier); + // No need to merge if the total number of segments (including too big segments) is less than or + // equal to the target search concurrency. + allowedSegCount = Math.max(allowedSegCount, targetSearchConcurrency - tooBigCount); int allowedDocCount = getMaxAllowedDocs(totalMaxDoc, totalDelDocs); if (verbose(mergeContext) && tooBigCount > 0) { diff --git a/lucene/core/src/java/org/apache/lucene/internal/hppc/CharObjectHashMap.java b/lucene/core/src/java/org/apache/lucene/internal/hppc/CharObjectHashMap.java index b1fad7017b5d..40b32141f3f2 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/hppc/CharObjectHashMap.java +++ b/lucene/core/src/java/org/apache/lucene/internal/hppc/CharObjectHashMap.java @@ -574,15 +574,6 @@ public Iterator> iterator() { public int size() { return CharObjectHashMap.this.size(); } - - public VType[] toArray() { - VType[] array = (VType[]) new Object[size()]; - int i = 0; - for (ObjectCursor cursor : this) { - array[i++] = cursor.value; - } - return array; - } } /** An iterator over the set of assigned values. */ diff --git a/lucene/core/src/java/org/apache/lucene/internal/hppc/IntLongHashMap.java b/lucene/core/src/java/org/apache/lucene/internal/hppc/IntLongHashMap.java new file mode 100644 index 000000000000..9eb1ccd465ae --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/internal/hppc/IntLongHashMap.java @@ -0,0 +1,834 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.internal.hppc; + +import static org.apache.lucene.internal.hppc.HashContainers.*; + +import java.util.Arrays; +import java.util.Iterator; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * A hash map of int to long, implemented using open addressing with + * linear probing for collision resolution. + * + *
<p>
    Mostly forked and trimmed from com.carrotsearch.hppc.IntLongHashMap + * + *
<p>
    github: https://github.com/carrotsearch/hppc release 0.10.0 + * + * @lucene.internal + */ +public class IntLongHashMap + implements Iterable, Accountable, Cloneable { + + private static final long BASE_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(IntLongHashMap.class); + + /** The array holding keys. */ + public int[] keys; + + /** The array holding values. */ + public long[] values; + + /** + * The number of stored keys (assigned key slots), excluding the special "empty" key, if any (use + * {@link #size()} instead). + * + * @see #size() + */ + protected int assigned; + + /** Mask for slot scans in {@link #keys}. */ + protected int mask; + + /** Expand (rehash) {@link #keys} when {@link #assigned} hits this value. */ + protected int resizeAt; + + /** Special treatment for the "empty slot" key marker. */ + protected boolean hasEmptyKey; + + /** The load factor for {@link #keys}. */ + protected double loadFactor; + + /** Seed used to ensure the hash iteration order is different from an iteration to another. */ + protected int iterationSeed; + + /** New instance with sane defaults. */ + public IntLongHashMap() { + this(DEFAULT_EXPECTED_ELEMENTS); + } + + /** + * New instance with sane defaults. + * + * @param expectedElements The expected number of elements guaranteed not to cause buffer + * expansion (inclusive). + */ + public IntLongHashMap(int expectedElements) { + this(expectedElements, DEFAULT_LOAD_FACTOR); + } + + /** + * New instance with the provided defaults. + * + * @param expectedElements The expected number of elements guaranteed not to cause a rehash + * (inclusive). + * @param loadFactor The load factor for internal buffers. Insane load factors (zero, full + * capacity) are rejected by {@link #verifyLoadFactor(double)}. + */ + public IntLongHashMap(int expectedElements, double loadFactor) { + this.loadFactor = verifyLoadFactor(loadFactor); + iterationSeed = ITERATION_SEED.incrementAndGet(); + ensureCapacity(expectedElements); + } + + /** Create a hash map from all key-value pairs of another container. */ + public IntLongHashMap(IntLongHashMap container) { + this(container.size()); + putAll(container); + } + + public long put(int key, long value) { + assert assigned < mask + 1; + + final int mask = this.mask; + if (((key) == 0)) { + long previousValue = hasEmptyKey ? values[mask + 1] : 0L; + hasEmptyKey = true; + values[mask + 1] = value; + return previousValue; + } else { + final int[] keys = this.keys; + int slot = hashKey(key) & mask; + + int existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + final long previousValue = values[slot]; + values[slot] = value; + return previousValue; + } + slot = (slot + 1) & mask; + } + + if (assigned == resizeAt) { + allocateThenInsertThenRehash(slot, key, value); + } else { + keys[slot] = key; + values[slot] = value; + } + + assigned++; + return 0L; + } + } + + /** + * If the specified key is not already associated with a value, associates it with the given + * value. + * + * @return {@code true} if {@code key} did not exist and {@code value} was placed in the map, + * {@code false} otherwise. + */ + public boolean putIfAbsent(int key, long value) { + int keyIndex = indexOf(key); + if (indexExists(keyIndex)) { + return false; + } else { + indexInsert(keyIndex, key, value); + return true; + } + } + + /** Puts all key/value pairs from a given iterable into this map. 
*/ + public int putAll(Iterable iterable) { + final int count = size(); + for (IntLongCursor c : iterable) { + put(c.key, c.value); + } + return size() - count; + } + + /** + * If key exists, putValue is inserted into the map, otherwise any + * existing value is incremented by additionValue. + * + * @param key The key of the value to adjust. + * @param putValue The value to put if key does not exist. + * @param incrementValue The value to add to the existing value if key exists. + * @return Returns the current value associated with key (after changes). + */ + public long putOrAdd(int key, long putValue, long incrementValue) { + assert assigned < mask + 1; + + int keyIndex = indexOf(key); + if (indexExists(keyIndex)) { + putValue = values[keyIndex] + incrementValue; + indexReplace(keyIndex, putValue); + } else { + indexInsert(keyIndex, key, putValue); + } + return putValue; + } + + /** + * Adds incrementValue to any existing value for the given key or + * inserts incrementValue if key did not previously exist. + * + * @param key The key of the value to adjust. + * @param incrementValue The value to put or add to the existing value if key exists. + * @return Returns the current value associated with key (after changes). + */ + public long addTo(int key, long incrementValue) { + return putOrAdd(key, incrementValue, incrementValue); + } + + /** + * Remove all values at the given key. The default value for the key type is returned if the value + * does not exist in the map. + */ + public long remove(int key) { + final int mask = this.mask; + if (((key) == 0)) { + if (!hasEmptyKey) { + return 0L; + } + hasEmptyKey = false; + long previousValue = values[mask + 1]; + values[mask + 1] = 0L; + return previousValue; + } else { + final int[] keys = this.keys; + int slot = hashKey(key) & mask; + + int existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + final long previousValue = values[slot]; + shiftConflictingKeys(slot); + return previousValue; + } + slot = (slot + 1) & mask; + } + + return 0L; + } + } + + public long get(int key) { + if (((key) == 0)) { + return hasEmptyKey ? values[mask + 1] : 0L; + } else { + final int[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + + int existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + return values[slot]; + } + slot = (slot + 1) & mask; + } + + return 0L; + } + } + + public long getOrDefault(int key, long defaultValue) { + if (((key) == 0)) { + return hasEmptyKey ? values[mask + 1] : defaultValue; + } else { + final int[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + + int existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + return values[slot]; + } + slot = (slot + 1) & mask; + } + + return defaultValue; + } + } + + public boolean containsKey(int key) { + if (((key) == 0)) { + return hasEmptyKey; + } else { + final int[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + + int existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + return true; + } + slot = (slot + 1) & mask; + } + + return false; + } + } + + public int indexOf(int key) { + final int mask = this.mask; + if (((key) == 0)) { + return hasEmptyKey ? 
mask + 1 : ~(mask + 1); + } else { + final int[] keys = this.keys; + int slot = hashKey(key) & mask; + + int existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + return slot; + } + slot = (slot + 1) & mask; + } + + return ~slot; + } + } + + public boolean indexExists(int index) { + assert index < 0 || (index >= 0 && index <= mask) || (index == mask + 1 && hasEmptyKey); + + return index >= 0; + } + + public long indexGet(int index) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + + return values[index]; + } + + public long indexReplace(int index, long newValue) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + + long previousValue = values[index]; + values[index] = newValue; + return previousValue; + } + + public void indexInsert(int index, int key, long value) { + assert index < 0 : "The index must not point at an existing key."; + + index = ~index; + if (((key) == 0)) { + assert index == mask + 1; + values[index] = value; + hasEmptyKey = true; + } else { + assert ((keys[index]) == 0); + + if (assigned == resizeAt) { + allocateThenInsertThenRehash(index, key, value); + } else { + keys[index] = key; + values[index] = value; + } + + assigned++; + } + } + + public long indexRemove(int index) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + + long previousValue = values[index]; + if (index > mask) { + assert index == mask + 1; + hasEmptyKey = false; + values[index] = 0L; + } else { + shiftConflictingKeys(index); + } + return previousValue; + } + + public void clear() { + assigned = 0; + hasEmptyKey = false; + + Arrays.fill(keys, 0); + } + + public void release() { + assigned = 0; + hasEmptyKey = false; + + keys = null; + values = null; + ensureCapacity(DEFAULT_EXPECTED_ELEMENTS); + } + + public int size() { + return assigned + (hasEmptyKey ? 1 : 0); + } + + public boolean isEmpty() { + return size() == 0; + } + + @Override + public int hashCode() { + int h = hasEmptyKey ? 0xDEADBEEF : 0; + for (IntLongCursor c : this) { + h += BitMixer.mix(c.key) + BitMixer.mix(c.value); + } + return h; + } + + @Override + public boolean equals(Object obj) { + return (this == obj) + || (obj != null && getClass() == obj.getClass() && equalElements(getClass().cast(obj))); + } + + /** Return true if all keys of some other container exist in this container. */ + protected boolean equalElements(IntLongHashMap other) { + if (other.size() != size()) { + return false; + } + + for (IntLongCursor c : other) { + int key = c.key; + if (!containsKey(key) || !((c.value) == (get(key)))) { + return false; + } + } + + return true; + } + + /** + * Ensure this container can hold at least the given number of keys (entries) without resizing its + * buffers. + * + * @param expectedElements The total number of keys, inclusive. 
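Since IntLongHashMap is a new class in this patch, a short usage sketch may help. The demo class name is hypothetical, and it assumes IntLongCursor lives next to the map in the same hppc package, as the EntryIterator code above suggests; all methods used below are the ones introduced in this file.

    import org.apache.lucene.internal.hppc.IntLongCursor;
    import org.apache.lucene.internal.hppc.IntLongHashMap;

    public class IntLongHashMapDemo {
      public static void main(String[] args) {
        IntLongHashMap counts = new IntLongHashMap();
        counts.put(42, 7L);           // returns the previous value (0L here, the "absent" marker)
        counts.addTo(42, 3L);         // key exists, so the value becomes 7 + 3 = 10
        counts.putOrAdd(7, 1L, 5L);   // key absent, so 1 is inserted (5 would be the increment)
        System.out.println(counts.get(42));              // 10
        System.out.println(counts.getOrDefault(99, -1)); // -1, key 99 is absent
        for (IntLongCursor c : counts) {                 // iteration order varies per instance
          System.out.println(c.key + " -> " + c.value);
        }
      }
    }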
+ */ + public void ensureCapacity(int expectedElements) { + if (expectedElements > resizeAt || keys == null) { + final int[] prevKeys = this.keys; + final long[] prevValues = this.values; + allocateBuffers(minBufferSize(expectedElements, loadFactor)); + if (prevKeys != null && !isEmpty()) { + rehash(prevKeys, prevValues); + } + } + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(keys) + RamUsageEstimator.sizeOf(values); + } + + /** + * Provides the next iteration seed used to build the iteration starting slot and offset + * increment. This method does not need to be synchronized, what matters is that each thread gets + * a sequence of varying seeds. + */ + protected int nextIterationSeed() { + return iterationSeed = BitMixer.mixPhi(iterationSeed); + } + + /** An iterator implementation for {@link #iterator}. */ + private final class EntryIterator extends AbstractIterator { + private final IntLongCursor cursor; + private final int increment; + private int index; + private int slot; + + public EntryIterator() { + cursor = new IntLongCursor(); + int seed = nextIterationSeed(); + increment = iterationIncrement(seed); + slot = seed & mask; + } + + @Override + protected IntLongCursor fetch() { + final int mask = IntLongHashMap.this.mask; + while (index <= mask) { + int existing; + index++; + slot = (slot + increment) & mask; + if (!((existing = keys[slot]) == 0)) { + cursor.index = slot; + cursor.key = existing; + cursor.value = values[slot]; + return cursor; + } + } + + if (index == mask + 1 && hasEmptyKey) { + cursor.index = index; + cursor.key = 0; + cursor.value = values[index++]; + return cursor; + } + + return done(); + } + } + + @Override + public Iterator iterator() { + return new EntryIterator(); + } + + /** Returns a specialized view of the keys of this associated container. */ + public KeysContainer keys() { + return new KeysContainer(); + } + + /** A view of the keys inside this hash map. */ + public final class KeysContainer implements Iterable { + @Override + public Iterator iterator() { + return new KeysIterator(); + } + + public int size() { + return IntLongHashMap.this.size(); + } + + public int[] toArray() { + int[] array = new int[size()]; + int i = 0; + for (IntCursor cursor : this) { + array[i++] = cursor.value; + } + return array; + } + } + + /** An iterator over the set of assigned keys. */ + private final class KeysIterator extends AbstractIterator { + private final IntCursor cursor; + private final int increment; + private int index; + private int slot; + + public KeysIterator() { + cursor = new IntCursor(); + int seed = nextIterationSeed(); + increment = iterationIncrement(seed); + slot = seed & mask; + } + + @Override + protected IntCursor fetch() { + final int mask = IntLongHashMap.this.mask; + while (index <= mask) { + int existing; + index++; + slot = (slot + increment) & mask; + if (!((existing = keys[slot]) == 0)) { + cursor.index = slot; + cursor.value = existing; + return cursor; + } + } + + if (index == mask + 1 && hasEmptyKey) { + cursor.index = index++; + cursor.value = 0; + return cursor; + } + + return done(); + } + } + + /** + * @return Returns a container with all values stored in this map. + */ + public ValuesContainer values() { + return new ValuesContainer(); + } + + /** A view over the set of values of this map. 
*/ + public final class ValuesContainer implements Iterable { + @Override + public Iterator iterator() { + return new ValuesIterator(); + } + + public long[] toArray() { + long[] array = new long[size()]; + int i = 0; + for (LongCursor cursor : this) { + array[i++] = cursor.value; + } + return array; + } + } + + /** An iterator over the set of assigned values. */ + private final class ValuesIterator extends AbstractIterator { + private final LongCursor cursor; + private final int increment; + private int index; + private int slot; + + public ValuesIterator() { + cursor = new LongCursor(); + int seed = nextIterationSeed(); + increment = iterationIncrement(seed); + slot = seed & mask; + } + + @Override + protected LongCursor fetch() { + final int mask = IntLongHashMap.this.mask; + while (index <= mask) { + index++; + slot = (slot + increment) & mask; + if (!((keys[slot]) == 0)) { + cursor.index = slot; + cursor.value = values[slot]; + return cursor; + } + } + + if (index == mask + 1 && hasEmptyKey) { + cursor.index = index; + cursor.value = values[index++]; + return cursor; + } + + return done(); + } + } + + @Override + public IntLongHashMap clone() { + try { + + IntLongHashMap cloned = (IntLongHashMap) super.clone(); + cloned.keys = keys.clone(); + cloned.values = values.clone(); + cloned.hasEmptyKey = hasEmptyKey; + cloned.iterationSeed = ITERATION_SEED.incrementAndGet(); + return cloned; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + /** Convert the contents of this map to a human-friendly string. */ + @Override + public String toString() { + final StringBuilder buffer = new StringBuilder(); + buffer.append("["); + + boolean first = true; + for (IntLongCursor cursor : this) { + if (!first) { + buffer.append(", "); + } + buffer.append(cursor.key); + buffer.append("=>"); + buffer.append(cursor.value); + first = false; + } + buffer.append("]"); + return buffer.toString(); + } + + /** Creates a hash map from two index-aligned arrays of key-value pairs. */ + public static IntLongHashMap from(int[] keys, long[] values) { + if (keys.length != values.length) { + throw new IllegalArgumentException( + "Arrays of keys and values must have an identical length."); + } + + IntLongHashMap map = new IntLongHashMap(keys.length); + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + + return map; + } + + /** + * Returns a hash code for the given key. + * + *

    The output from this function should evenly distribute keys across the entire integer range. + */ + protected int hashKey(int key) { + assert !((key) == 0); // Handled as a special case (empty slot marker). + return BitMixer.mixPhi(key); + } + + /** + * Validate load factor range and return it. Override and suppress if you need insane load + * factors. + */ + protected double verifyLoadFactor(double loadFactor) { + checkLoadFactor(loadFactor, MIN_LOAD_FACTOR, MAX_LOAD_FACTOR); + return loadFactor; + } + + /** Rehash from old buffers to new buffers. */ + protected void rehash(int[] fromKeys, long[] fromValues) { + assert fromKeys.length == fromValues.length + && HashContainers.checkPowerOfTwo(fromKeys.length - 1); + + // Rehash all stored key/value pairs into the new buffers. + final int[] keys = this.keys; + final long[] values = this.values; + final int mask = this.mask; + int existing; + + // Copy the zero element's slot, then rehash everything else. + int from = fromKeys.length - 1; + keys[keys.length - 1] = fromKeys[from]; + values[values.length - 1] = fromValues[from]; + while (--from >= 0) { + if (!((existing = fromKeys[from]) == 0)) { + int slot = hashKey(existing) & mask; + while (!((keys[slot]) == 0)) { + slot = (slot + 1) & mask; + } + keys[slot] = existing; + values[slot] = fromValues[from]; + } + } + } + + /** + * Allocate new internal buffers. This method attempts to allocate and assign internal buffers + * atomically (either allocations succeed or not). + */ + protected void allocateBuffers(int arraySize) { + assert Integer.bitCount(arraySize) == 1; + + // Ensure no change is done if we hit an OOM. + int[] prevKeys = this.keys; + long[] prevValues = this.values; + try { + int emptyElementSlot = 1; + this.keys = (new int[arraySize + emptyElementSlot]); + this.values = (new long[arraySize + emptyElementSlot]); + } catch (OutOfMemoryError e) { + this.keys = prevKeys; + this.values = prevValues; + throw new BufferAllocationException( + "Not enough memory to allocate buffers for rehashing: %,d -> %,d", + e, this.mask + 1, arraySize); + } + + this.resizeAt = expandAtCount(arraySize, loadFactor); + this.mask = arraySize - 1; + } + + /** + * This method is invoked when there is a new key/ value pair to be inserted into the buffers but + * there is not enough empty slots to do so. + * + *

    New buffers are allocated. If this succeeds, we know we can proceed with rehashing so we + * assign the pending element to the previous buffer (possibly violating the invariant of having + * at least one empty slot) and rehash all keys, substituting new buffers at the end. + */ + protected void allocateThenInsertThenRehash(int slot, int pendingKey, long pendingValue) { + assert assigned == resizeAt && ((keys[slot]) == 0) && !((pendingKey) == 0); + + // Try to allocate new buffers first. If we OOM, we leave in a consistent state. + final int[] prevKeys = this.keys; + final long[] prevValues = this.values; + allocateBuffers(nextBufferSize(mask + 1, size(), loadFactor)); + assert this.keys.length > prevKeys.length; + + // We have succeeded at allocating new data so insert the pending key/value at + // the free slot in the old arrays before rehashing. + prevKeys[slot] = pendingKey; + prevValues[slot] = pendingValue; + + // Rehash old keys, including the pending key. + rehash(prevKeys, prevValues); + } + + /** + * Shift all the slot-conflicting keys and values allocated to (and including) slot. + */ + protected void shiftConflictingKeys(int gapSlot) { + final int[] keys = this.keys; + final long[] values = this.values; + final int mask = this.mask; + + // Perform shifts of conflicting keys to fill in the gap. + int distance = 0; + while (true) { + final int slot = (gapSlot + (++distance)) & mask; + final int existing = keys[slot]; + if (((existing) == 0)) { + break; + } + + final int idealSlot = hashKey(existing); + final int shift = (slot - idealSlot) & mask; + if (shift >= distance) { + // Entry at this position was originally at or before the gap slot. + // Move the conflict-shifted entry to the gap's position and repeat the procedure + // for any entries to the right of the current position, treating it + // as the new gap. + keys[gapSlot] = existing; + values[gapSlot] = values[slot]; + gapSlot = slot; + distance = 0; + } + } + + // Mark the last found gap slot without a conflict as empty. + keys[gapSlot] = 0; + values[gapSlot] = 0L; + assigned--; + } + + /** Forked from HPPC, holding int index,key and value */ + public static final class IntLongCursor { + /** + * The current key and value's index in the container this cursor belongs to. The meaning of + * this index is defined by the container (usually it will be an index in the underlying storage + * buffer). + */ + public int index; + + /** The current key. */ + public int key; + + /** The current value. */ + public long value; + + @Override + public String toString() { + return "[cursor, index: " + index + ", key: " + key + ", value: " + value + "]"; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/internal/hppc/IntObjectHashMap.java b/lucene/core/src/java/org/apache/lucene/internal/hppc/IntObjectHashMap.java index 180bb3249f35..732b0ecb71c4 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/hppc/IntObjectHashMap.java +++ b/lucene/core/src/java/org/apache/lucene/internal/hppc/IntObjectHashMap.java @@ -562,15 +562,6 @@ public Iterator> iterator() { public int size() { return IntObjectHashMap.this.size(); } - - public VType[] toArray() { - VType[] array = (VType[]) new Object[size()]; - int i = 0; - for (ObjectCursor cursor : this) { - array[i++] = cursor.value; - } - return array; - } } /** An iterator over the set of assigned values. 
*/ diff --git a/lucene/core/src/java/org/apache/lucene/internal/hppc/LongObjectHashMap.java b/lucene/core/src/java/org/apache/lucene/internal/hppc/LongObjectHashMap.java index 4bc890b80b1a..5f34625f6750 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/hppc/LongObjectHashMap.java +++ b/lucene/core/src/java/org/apache/lucene/internal/hppc/LongObjectHashMap.java @@ -562,15 +562,6 @@ public Iterator> iterator() { public int size() { return LongObjectHashMap.this.size(); } - - public VType[] toArray() { - VType[] array = (VType[]) new Object[size()]; - int i = 0; - for (ObjectCursor cursor : this) { - array[i++] = cursor.value; - } - return array; - } } /** An iterator over the set of assigned values. */ diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java index eb5160a0f0dd..184403cf48b7 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java @@ -197,4 +197,14 @@ public int squareDistance(byte[] a, byte[] b) { } return squareSum; } + + @Override + public int findNextGEQ(int[] buffer, int target, int from, int to) { + for (int i = from; i < to; ++i) { + if (buffer[i] >= target) { + return i; + } + } + return to; + } } diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java index c5193aa23de2..c5e9301e9bc4 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java @@ -19,6 +19,7 @@ import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.store.IndexInput; /** Default provider returning scalar implementations. */ final class DefaultVectorizationProvider extends VectorizationProvider { @@ -38,4 +39,9 @@ public VectorUtilSupport getVectorUtilSupport() { public FlatVectorsScorer getLucene99FlatVectorsScorer() { return DefaultFlatVectorScorer.INSTANCE; } + + @Override + public PostingDecodingUtil newPostingDecodingUtil(IndexInput input) { + return new PostingDecodingUtil(input); + } } diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/PostingDecodingUtil.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/PostingDecodingUtil.java new file mode 100644 index 000000000000..e45ce55bbc59 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/PostingDecodingUtil.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import java.io.IOException; +import org.apache.lucene.store.IndexInput; + +/** Utility class to decode postings. */ +public class PostingDecodingUtil { + + /** The wrapper {@link IndexInput}. */ + public final IndexInput in; + + /** Sole constructor, called by sub-classes. */ + protected PostingDecodingUtil(IndexInput in) { + this.in = in; + } + + /** + * Core methods for decoding blocks of docs / freqs / positions / offsets. + * + *

      + *
+ * <ul>
+ *   <li>Read {@code count} ints.
+ *   <li>For all {@code i} >= 0 so that {@code bShift - i * dec} > 0, apply shift {@code
+ *       bShift - i * dec} and store the result in {@code b} at offset {@code count * i}.
+ *   <li>Apply mask {@code cMask} and store the result in {@code c} starting at offset {@code
+ *       cIndex}.
+ * </ul>
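+ * <p>For example (hypothetical parameter values, for illustration only): with {@code bShift = 16},
+ *     {@code dec = 8}, {@code bMask = 0xFF} and {@code cMask = 0xFF}, a read int {@code 0x00ABCDEF}
+ *     stores {@code 0xAB} at {@code b[i]} and {@code 0xCD} at {@code b[count + i]}, and leaves
+ *     {@code 0xEF} in {@code c[cIndex + i]}.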
    + */ + public void splitInts( + int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask) + throws IOException { + // Default implementation, which takes advantage of the C2 compiler's loop unrolling and + // auto-vectorization. + in.readInts(c, cIndex, count); + int maxIter = (bShift - 1) / dec; + for (int i = 0; i < count; ++i) { + for (int j = 0; j <= maxIter; ++j) { + b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask; + } + c[cIndex + i] &= cMask; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java index 22e5e96aa256..fb94b0e31736 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java @@ -44,4 +44,12 @@ public interface VectorUtilSupport { /** Returns the sum of squared differences of the two byte vectors. */ int squareDistance(byte[] a, byte[] b); + + /** + * Given an array {@code buffer} that is sorted between indexes {@code 0} inclusive and {@code to} + * exclusive, find the first array index whose value is greater than or equal to {@code target}. + * This index is guaranteed to be at least {@code from}. If there is no such array index, {@code + * to} is returned. + */ + int findNextGEQ(int[] buffer, int target, int from, int to); } diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index a236c303eb4a..c0ed905353b2 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -17,6 +17,7 @@ package org.apache.lucene.internal.vectorization; +import java.io.IOException; import java.lang.StackWalker.StackFrame; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; @@ -28,6 +29,7 @@ import java.util.logging.Logger; import java.util.stream.Stream; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Constants; import org.apache.lucene.util.VectorUtil; @@ -36,12 +38,16 @@ * vectorization modules in the Java runtime this class provides optimized implementations (using * SIMD) of several algorithms used throughout Apache Lucene. * + *

    Expert: set the {@value #UPPER_JAVA_FEATURE_VERSION_SYSPROP} system property to increase the + * set of Java versions this class will provide optimized implementations for. + * * @lucene.internal */ public abstract class VectorizationProvider { static final OptionalInt TESTS_VECTOR_SIZE; static final boolean TESTS_FORCE_INTEGER_VECTORS; + static final int UPPER_JAVA_FEATURE_VERSION = getUpperJavaFeatureVersion(); static { var vs = OptionalInt.empty(); @@ -69,6 +75,27 @@ public abstract class VectorizationProvider { TESTS_FORCE_INTEGER_VECTORS = enforce; } + private static final String UPPER_JAVA_FEATURE_VERSION_SYSPROP = + "org.apache.lucene.vectorization.upperJavaFeatureVersion"; + private static final int DEFAULT_UPPER_JAVA_FEATURE_VERSION = 23; + + private static int getUpperJavaFeatureVersion() { + int runtimeVersion = DEFAULT_UPPER_JAVA_FEATURE_VERSION; + try { + String str = System.getProperty(UPPER_JAVA_FEATURE_VERSION_SYSPROP); + if (str != null) { + runtimeVersion = Math.max(Integer.parseInt(str), runtimeVersion); + } + } catch (@SuppressWarnings("unused") NumberFormatException | SecurityException ignored) { + Logger.getLogger(VectorizationProvider.class.getName()) + .warning( + "Cannot read sysprop " + + UPPER_JAVA_FEATURE_VERSION_SYSPROP + + ", so the default value will be used."); + } + return runtimeVersion; + } + /** * Returns the default instance of the provider matching vectorization possibilities of actual * runtime. @@ -95,6 +122,9 @@ public static VectorizationProvider getInstance() { /** Returns a FlatVectorsScorer that supports the Lucene99 format. */ public abstract FlatVectorsScorer getLucene99FlatVectorsScorer(); + /** Create a new {@link PostingDecodingUtil} for the given {@link IndexInput}. */ + public abstract PostingDecodingUtil newPostingDecodingUtil(IndexInput input) throws IOException; + // *** Lookup mechanism: *** private static final Logger LOG = Logger.getLogger(VectorizationProvider.class.getName()); @@ -103,7 +133,7 @@ public static VectorizationProvider getInstance() { static VectorizationProvider lookup(boolean testMode) { final int runtimeVersion = Runtime.version().feature(); assert runtimeVersion >= 21; - if (runtimeVersion <= 22) { + if (runtimeVersion <= UPPER_JAVA_FEATURE_VERSION) { // only use vector module with Hotspot VM if (!Constants.IS_HOTSPOT_VM) { LOG.warning( @@ -184,7 +214,9 @@ private static Optional lookupVectorModule() { private static final Set VALID_CALLERS = Set.of( "org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil", - "org.apache.lucene.util.VectorUtil"); + "org.apache.lucene.util.VectorUtil", + "org.apache.lucene.codecs.lucene101.Lucene101PostingsReader", + "org.apache.lucene.codecs.lucene101.PostingIndexInput"); private static void ensureCaller() { final boolean validCaller = diff --git a/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java index 227963475680..c823796c8b2a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java @@ -60,9 +60,8 @@ abstract class AbstractKnnVectorQuery extends Query { /** the number of documents to find */ protected final int k; - /** the filter to be executed. 
when the filter is applied is up to the underlying knn index */ - private final Query filter; + protected final Query filter; public AbstractKnnVectorQuery(String field, int k, Query filter) { this.field = Objects.requireNonNull(field, "field"); @@ -146,7 +145,7 @@ private TopDocs getLeafResults( // Perform the approximate kNN search // We pass cost + 1 here to account for the edge case when we explore exactly cost vectors TopDocs results = approximateSearch(ctx, acceptDocs, cost + 1, timeLimitingKnnCollectorManager); - if (results.totalHits.relation == TotalHits.Relation.EQUAL_TO + if (results.totalHits.relation() == TotalHits.Relation.EQUAL_TO // Return partial results only when timeout is met || (queryTimeout != null && queryTimeout.shouldExit())) { return results; diff --git a/lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java index 7a705c0ef4dd..fc18b0476a73 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java @@ -154,38 +154,17 @@ protected abstract WeightOrDocIdSetIterator rewriteInner( List collectedTerms) throws IOException; - private IOSupplier rewrite(LeafReaderContext context, Terms terms) - throws IOException { - assert terms != null; - - final int fieldDocCount = terms.getDocCount(); - final TermsEnum termsEnum = q.getTermsEnum(terms); - assert termsEnum != null; - - final List collectedTerms = new ArrayList<>(); - boolean collectResult = collectTerms(fieldDocCount, termsEnum, collectedTerms); - if (collectResult && collectedTerms.isEmpty()) { - return null; + private WeightOrDocIdSetIterator rewriteAsBooleanQuery( + LeafReaderContext context, List collectedTerms) throws IOException { + BooleanQuery.Builder bq = new BooleanQuery.Builder(); + for (TermAndState t : collectedTerms) { + final TermStates termStates = new TermStates(searcher.getTopReaderContext()); + termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq); + bq.add(new TermQuery(new Term(q.field, t.term), termStates), BooleanClause.Occur.SHOULD); } - return () -> { - if (collectResult) { - // build a boolean query - BooleanQuery.Builder bq = new BooleanQuery.Builder(); - for (TermAndState t : collectedTerms) { - final TermStates termStates = new TermStates(searcher.getTopReaderContext()); - termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq); - bq.add( - new TermQuery(new Term(q.field, t.term), termStates), BooleanClause.Occur.SHOULD); - } - Query q = new ConstantScoreQuery(bq.build()); - final Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score()); - return new WeightOrDocIdSetIterator(weight); - } else { - // Too many terms to rewrite as a simple bq. 
- // Invoke rewriteInner logic to handle rewriting: - return rewriteInner(context, fieldDocCount, terms, termsEnum, collectedTerms); - } - }; + Query q = new ConstantScoreQuery(bq.build()); + final Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score()); + return new WeightOrDocIdSetIterator(weight); } private boolean collectTerms(int fieldDocCount, TermsEnum termsEnum, List terms) @@ -240,9 +219,44 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti return null; } - final long cost = estimateCost(terms, q.getTermsCount()); - IOSupplier weightOrIteratorSupplier = rewrite(context, terms); - if (weightOrIteratorSupplier == null) return null; + assert terms != null; + + final int fieldDocCount = terms.getDocCount(); + final TermsEnum termsEnum = q.getTermsEnum(terms); + assert termsEnum != null; + + List collectedTerms = new ArrayList<>(); + boolean collectResult = collectTerms(fieldDocCount, termsEnum, collectedTerms); + + final long cost; + if (collectResult) { + // Return a null supplier if no query terms were in the segment: + if (collectedTerms.isEmpty()) { + return null; + } + + // TODO: Instead of replicating the cost logic of a BooleanQuery we could consider rewriting + // to a BQ eagerly at this point and delegating to its cost method (instead of lazily + // rewriting on #get). Not sure what the performance hit would be of doing this though. + long sumTermCost = 0; + for (TermAndState collectedTerm : collectedTerms) { + sumTermCost += collectedTerm.docFreq; + } + cost = sumTermCost; + } else { + cost = estimateCost(terms, q.getTermsCount()); + } + + IOSupplier weightOrIteratorSupplier = + () -> { + if (collectResult) { + return rewriteAsBooleanQuery(context, collectedTerms); + } else { + // Too many terms to rewrite as a simple bq. + // Invoke rewriteInner logic to handle rewriting: + return rewriteInner(context, fieldDocCount, terms, termsEnum, collectedTerms); + } + }; return new ScorerSupplier() { @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/AbstractVectorSimilarityQuery.java b/lucene/core/src/java/org/apache/lucene/search/AbstractVectorSimilarityQuery.java index f9afc7829a2c..71379268a47a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AbstractVectorSimilarityQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/AbstractVectorSimilarityQuery.java @@ -145,7 +145,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti TopDocs results = approximateSearch(context, acceptDocs, cardinality, timeLimitingKnnCollectorManager); - if (results.totalHits.relation == TotalHits.Relation.EQUAL_TO + if (results.totalHits.relation() == TotalHits.Relation.EQUAL_TO // Return partial results only when timeout is met || (queryTimeout != null && queryTimeout.shouldExit())) { // Return an iterator over the collected results diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java index 4798745ef67f..b50b0530a2d1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java @@ -87,6 +87,22 @@ public Builder add(BooleanClause clause) { return this; } + /** + * Add a collection of BooleanClauses to this {@link Builder}. Note that the order in which + * clauses are added does not have any impact on matching documents or query performance. 
+ * + * @throws IndexSearcher.TooManyClauses if the new number of clauses exceeds the maximum clause + * number + */ + public Builder add(Collection collection) { + // see #addClause(BooleanClause) + if ((clauses.size() + collection.size()) > IndexSearcher.maxClauseCount) { + throw new IndexSearcher.TooManyClauses(); + } + clauses.addAll(collection); + return this; + } + /** * Add a new clause to this {@link Builder}. Note that the order in which clauses are added does * not have any impact on matching documents or query performance. @@ -136,7 +152,7 @@ public List clauses() { } /** Return the collection of queries for the given {@link Occur}. */ - Collection getClauses(Occur occur) { + public Collection getClauses(Occur occur) { return clauseSets.get(occur); } @@ -252,6 +268,11 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { return new MatchNoDocsQuery("empty BooleanQuery"); } + // Queries with no positive clauses have no matches + if (clauses.size() == clauseSets.get(Occur.MUST_NOT).size()) { + return new MatchNoDocsQuery("pure negative BooleanQuery"); + } + // optimize 1-clause queries if (clauses.size() == 1) { BooleanClause c = clauses.get(0); @@ -267,8 +288,6 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { // no scoring clauses, so return a score of 0 return new BoostQuery(new ConstantScoreQuery(query), 0); case MUST_NOT: - // no positive clauses - return new MatchNoDocsQuery("pure negative BooleanQuery"); default: throw new AssertionError(); } @@ -523,8 +542,7 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch); boolean actuallyRewritten = false; for (BooleanClause clause : clauses) { - if (clause.occur() == Occur.SHOULD && clause.query() instanceof BooleanQuery) { - BooleanQuery innerQuery = (BooleanQuery) clause.query(); + if (clause.occur() == Occur.SHOULD && clause.query() instanceof BooleanQuery innerQuery) { if (innerQuery.isPureDisjunction()) { actuallyRewritten = true; for (BooleanClause innerClause : innerQuery.clauses()) { @@ -542,6 +560,46 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { } } + // Inline required / prohibited clauses. This helps run filtered conjunctive queries more + // efficiently by providing all clauses to the block-max AND scorer. + { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch); + boolean actuallyRewritten = false; + for (BooleanClause outerClause : clauses) { + if (outerClause.isRequired() && outerClause.query() instanceof BooleanQuery innerQuery) { + // Inlining prohibited clauses is not legal if the query is a pure negation, since pure + // negations have no matches. It works because the inner BooleanQuery would have first + // rewritten to a MatchNoDocsQuery if it only had prohibited clauses. + assert innerQuery.getClauses(Occur.MUST_NOT).size() != innerQuery.clauses().size(); + if (innerQuery.getMinimumNumberShouldMatch() == 0 + && innerQuery.getClauses(Occur.SHOULD).isEmpty()) { + + actuallyRewritten = true; + for (BooleanClause innerClause : innerQuery) { + Occur innerOccur = innerClause.occur(); + if (innerOccur == Occur.FILTER + || innerOccur == Occur.MUST_NOT + || outerClause.occur() == Occur.MUST) { + builder.add(innerClause); + } else { + assert outerClause.occur() == Occur.FILTER && innerOccur == Occur.MUST; + // In this case we need to change the occur of the inner query from MUST to FILTER. 
+ builder.add(innerClause.query(), Occur.FILTER); + } + } + } else { + builder.add(outerClause); + } + } else { + builder.add(outerClause); + } + } + if (actuallyRewritten) { + return builder.build(); + } + } + // SHOULD clause count less than or equal to minimumNumberShouldMatch // Important(this can only be processed after nested clauses have been flattened) { diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java index e6e4f456bf3b..7be1558dd71c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java @@ -20,13 +20,14 @@ import java.util.Arrays; import java.util.Collection; import java.util.Objects; +import org.apache.lucene.internal.hppc.LongArrayList; import org.apache.lucene.util.Bits; import org.apache.lucene.util.PriorityQueue; /** * {@link BulkScorer} that is used for pure disjunctions and disjunctions that have low values of * {@link BooleanQuery.Builder#setMinimumNumberShouldMatch(int)} and dense clauses. This scorer - * scores documents by batches of 2048 docs. + * scores documents by batches of 4,096 docs. */ final class BooleanScorer extends BulkScorer { @@ -41,71 +42,32 @@ static class Bucket { int freq; } - private class BulkScorerAndDoc { - final BulkScorer scorer; - final long cost; - int next; - - BulkScorerAndDoc(BulkScorer scorer) { - this.scorer = scorer; - this.cost = scorer.cost(); - this.next = -1; - } - - void advance(int min) throws IOException { - score(orCollector, null, min, min); - } - - void score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException { - next = scorer.score(collector, acceptDocs, min, max); - } - } - - // See WANDScorer for an explanation - private static long cost(Collection scorers, int minShouldMatch) { - final PriorityQueue pq = - new PriorityQueue(scorers.size() - minShouldMatch + 1) { - @Override - protected boolean lessThan(BulkScorer a, BulkScorer b) { - return a.cost() > b.cost(); - } - }; - for (BulkScorer scorer : scorers) { - pq.insertWithOverflow(scorer); - } - long cost = 0; - for (BulkScorer scorer = pq.pop(); scorer != null; scorer = pq.pop()) { - cost += scorer.cost(); - } - return cost; - } - - static final class HeadPriorityQueue extends PriorityQueue { + static final class HeadPriorityQueue extends PriorityQueue { public HeadPriorityQueue(int maxSize) { super(maxSize); } @Override - protected boolean lessThan(BulkScorerAndDoc a, BulkScorerAndDoc b) { - return a.next < b.next; + protected boolean lessThan(DisiWrapper a, DisiWrapper b) { + return a.doc < b.doc; } } - static final class TailPriorityQueue extends PriorityQueue { + static final class TailPriorityQueue extends PriorityQueue { public TailPriorityQueue(int maxSize) { super(maxSize); } @Override - protected boolean lessThan(BulkScorerAndDoc a, BulkScorerAndDoc b) { + protected boolean lessThan(DisiWrapper a, DisiWrapper b) { return a.cost < b.cost; } - public BulkScorerAndDoc get(int i) { + public DisiWrapper get(int i) { Objects.checkIndex(i, size()); - return (BulkScorerAndDoc) getHeapArray()[1 + i]; + return (DisiWrapper) getHeapArray()[1 + i]; } } @@ -115,7 +77,7 @@ public BulkScorerAndDoc get(int i) { // This is basically an inlined FixedBitSet... 
seems to help with bound checks final long[] matching = new long[SET_SIZE]; - final BulkScorerAndDoc[] leads; + final DisiWrapper[] leads; final HeadPriorityQueue head; final TailPriorityQueue tail; final Score score = new Score(); @@ -123,31 +85,6 @@ public BulkScorerAndDoc get(int i) { final long cost; final boolean needsScores; - final class OrCollector implements LeafCollector { - Scorable scorer; - - @Override - public void setScorer(Scorable scorer) { - this.scorer = scorer; - } - - @Override - public void collect(int doc) throws IOException { - final int i = doc & MASK; - final int idx = i >>> 6; - matching[idx] |= 1L << i; - if (buckets != null) { - final Bucket bucket = buckets[i]; - bucket.freq++; - if (needsScores) { - bucket.score += scorer.score(); - } - } - } - } - - final OrCollector orCollector = new OrCollector(); - final class DocIdStreamView extends DocIdStream { int base; @@ -194,7 +131,7 @@ public int count() throws IOException { private final DocIdStreamView docIdStreamView = new DocIdStreamView(); - BooleanScorer(Collection scorers, int minShouldMatch, boolean needsScores) { + BooleanScorer(Collection scorers, int minShouldMatch, boolean needsScores) { if (minShouldMatch < 1 || minShouldMatch > scorers.size()) { throw new IllegalArgumentException( "minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch); @@ -211,18 +148,21 @@ public int count() throws IOException { } else { buckets = null; } - this.leads = new BulkScorerAndDoc[scorers.size()]; + this.leads = new DisiWrapper[scorers.size()]; this.head = new HeadPriorityQueue(scorers.size() - minShouldMatch + 1); this.tail = new TailPriorityQueue(minShouldMatch - 1); this.minShouldMatch = minShouldMatch; this.needsScores = needsScores; - for (BulkScorer scorer : scorers) { - final BulkScorerAndDoc evicted = tail.insertWithOverflow(new BulkScorerAndDoc(scorer)); + LongArrayList costs = new LongArrayList(scorers.size()); + for (Scorer scorer : scorers) { + DisiWrapper w = new DisiWrapper(scorer); + costs.add(w.cost); + final DisiWrapper evicted = tail.insertWithOverflow(w); if (evicted != null) { head.add(evicted); } } - this.cost = cost(scorers, minShouldMatch); + this.cost = ScorerUtil.costWithMinShouldMatch(costs.stream(), costs.size(), minShouldMatch); } @Override @@ -230,19 +170,49 @@ public long cost() { return cost; } + private void scoreDisiWrapperIntoBitSet(DisiWrapper w, Bits acceptDocs, int min, int max) + throws IOException { + boolean needsScores = BooleanScorer.this.needsScores; + long[] matching = BooleanScorer.this.matching; + Bucket[] buckets = BooleanScorer.this.buckets; + + DocIdSetIterator it = w.iterator; + Scorer scorer = w.scorer; + int doc = w.doc; + if (doc < min) { + doc = it.advance(min); + } + for (; doc < max; doc = it.nextDoc()) { + if (acceptDocs == null || acceptDocs.get(doc)) { + final int i = doc & MASK; + final int idx = i >> 6; + matching[idx] |= 1L << i; + if (buckets != null) { + final Bucket bucket = buckets[i]; + bucket.freq++; + if (needsScores) { + bucket.score += scorer.score(); + } + } + } + } + + w.doc = doc; + } + private void scoreWindowIntoBitSetAndReplay( LeafCollector collector, Bits acceptDocs, int base, int min, int max, - BulkScorerAndDoc[] scorers, + DisiWrapper[] scorers, int numScorers) throws IOException { for (int i = 0; i < numScorers; ++i) { - final BulkScorerAndDoc scorer = scorers[i]; - assert scorer.next < max; - scorer.score(orCollector, acceptDocs, min, max); + final DisiWrapper w = scorers[i]; + assert w.doc < max; + 
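+      // Accumulate this clause's matches in [min, max) into the shared 'matching' bit set
+      // (and into the freq/score buckets when scores are needed) before they are replayed.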
scoreDisiWrapperIntoBitSet(w, acceptDocs, min, max); } docIdStreamView.base = base; @@ -251,20 +221,20 @@ private void scoreWindowIntoBitSetAndReplay( Arrays.fill(matching, 0L); } - private BulkScorerAndDoc advance(int min) throws IOException { + private DisiWrapper advance(int min) throws IOException { assert tail.size() == minShouldMatch - 1; final HeadPriorityQueue head = this.head; final TailPriorityQueue tail = this.tail; - BulkScorerAndDoc headTop = head.top(); - BulkScorerAndDoc tailTop = tail.top(); - while (headTop.next < min) { + DisiWrapper headTop = head.top(); + DisiWrapper tailTop = tail.top(); + while (headTop.doc < min) { if (tailTop == null || headTop.cost <= tailTop.cost) { - headTop.advance(min); + headTop.doc = headTop.iterator.advance(min); headTop = head.updateTop(); } else { // swap the top of head and tail - final BulkScorerAndDoc previousHeadTop = headTop; - tailTop.advance(min); + final DisiWrapper previousHeadTop = headTop; + tailTop.doc = tailTop.iterator.advance(min); headTop = head.updateTop(tailTop); tailTop = tail.updateTop(previousHeadTop); } @@ -282,9 +252,11 @@ private void scoreWindowMultipleScorers( throws IOException { while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) { // a match is still possible - final BulkScorerAndDoc candidate = tail.pop(); - candidate.advance(windowMin); - if (candidate.next < windowMax) { + final DisiWrapper candidate = tail.pop(); + if (candidate.doc < windowMin) { + candidate.doc = candidate.iterator.advance(windowMin); + } + if (candidate.doc < windowMax) { leads[maxFreq++] = candidate; } else { head.add(candidate); @@ -304,7 +276,7 @@ private void scoreWindowMultipleScorers( // Push back scorers into head and tail for (int i = 0; i < maxFreq; ++i) { - final BulkScorerAndDoc evicted = head.insertWithOverflow(leads[i]); + final DisiWrapper evicted = head.insertWithOverflow(leads[i]); if (evicted != null) { tail.add(evicted); } @@ -312,7 +284,7 @@ private void scoreWindowMultipleScorers( } private void scoreWindowSingleScorer( - BulkScorerAndDoc bulkScorer, + DisiWrapper w, LeafCollector collector, Bits acceptDocs, int windowMin, @@ -320,33 +292,44 @@ private void scoreWindowSingleScorer( int max) throws IOException { assert tail.size() == 0; - final int nextWindowBase = head.top().next & ~MASK; + final int nextWindowBase = head.top().doc & ~MASK; final int end = Math.max(windowMax, Math.min(max, nextWindowBase)); - bulkScorer.score(collector, acceptDocs, windowMin, end); + DocIdSetIterator it = w.iterator; + int doc = w.doc; + if (doc < windowMin) { + doc = it.advance(windowMin); + } + collector.setScorer(w.scorer); + for (; doc < end; doc = it.nextDoc()) { + if (acceptDocs == null || acceptDocs.get(doc)) { + collector.collect(doc); + } + } + w.doc = doc; // reset the scorer that should be used for the general case collector.setScorer(score); } - private BulkScorerAndDoc scoreWindow( - BulkScorerAndDoc top, LeafCollector collector, Bits acceptDocs, int min, int max) + private DisiWrapper scoreWindow( + DisiWrapper top, LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException { - final int windowBase = top.next & ~MASK; // find the window that the next match belongs to + final int windowBase = top.doc & ~MASK; // find the window that the next match belongs to final int windowMin = Math.max(min, windowBase); final int windowMax = Math.min(max, windowBase + SIZE); // Fill 'leads' with all scorers from 'head' that are in the right window leads[0] = head.pop(); int maxFreq = 1; - 
while (head.size() > 0 && head.top().next < windowMax) { + while (head.size() > 0 && head.top().doc < windowMax) { leads[maxFreq++] = head.pop(); } if (minShouldMatch == 1 && maxFreq == 1) { // special case: only one scorer can match in the current window, // we can collect directly - final BulkScorerAndDoc bulkScorer = leads[0]; + final DisiWrapper bulkScorer = leads[0]; scoreWindowSingleScorer(bulkScorer, collector, acceptDocs, windowMin, windowMax, max); return head.add(bulkScorer); } else { @@ -360,11 +343,11 @@ private BulkScorerAndDoc scoreWindow( public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException { collector.setScorer(score); - BulkScorerAndDoc top = advance(min); - while (top.next < max) { + DisiWrapper top = advance(min); + while (top.doc < max) { top = scoreWindow(top, collector, acceptDocs, min, max); } - return top.next; + return top.doc; } } diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java index b07d8ebb88e1..7a53bc9a4852 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java @@ -289,9 +289,9 @@ BulkScorer optionalBulkScorer() throws IOException { return new MaxScoreBulkScorer(maxDoc, optionalScorers); } - List optional = new ArrayList(); + List optional = new ArrayList(); for (ScorerSupplier ss : subs.get(Occur.SHOULD)) { - optional.add(ss.bulkScorer()); + optional.add(ss.get(Long.MAX_VALUE)); } return new BooleanScorer(optional, Math.max(1, minShouldMatch), scoreMode.needsScores()); @@ -333,10 +333,15 @@ private BulkScorer requiredBulkScorer() throws IOException { requiredScoring.add(ss.get(leadCost)); } if (scoreMode == ScoreMode.TOP_SCORES - && requiredNoScoring.isEmpty() && requiredScoring.size() > 1 // Only specialize top-level conjunctions for clauses that don't have a two-phase iterator. + && requiredNoScoring.stream().map(Scorer::twoPhaseIterator).allMatch(Objects::isNull) && requiredScoring.stream().map(Scorer::twoPhaseIterator).allMatch(Objects::isNull)) { + // Turn all filters into scoring clauses with a score of zero, so that + // BlockMaxConjunctionBulkScorer is applicable. 
+ for (Scorer filter : requiredNoScoring) { + requiredScoring.add(new ConstantScoreScorer(0f, ScoreMode.COMPLETE, filter.iterator())); + } return new BlockMaxConjunctionBulkScorer(maxDoc, requiredScoring); } if (scoreMode != ScoreMode.TOP_SCORES diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java index f73f504f3cfa..08fc1296d101 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java @@ -153,70 +153,6 @@ public Matches matches(LeafReaderContext context, int doc) throws IOException { return MatchesUtils.fromSubMatches(matches); } - // Return a BulkScorer for the optional clauses only, - // or null if it is not applicable - // pkg-private for forcing use of BooleanScorer in tests - BulkScorer optionalBulkScorer(LeafReaderContext context) throws IOException { - if (scoreMode == ScoreMode.TOP_SCORES) { - if (!query.isPureDisjunction()) { - return null; - } - - List optional = new ArrayList<>(); - for (WeightedBooleanClause wc : weightedClauses) { - Weight w = wc.weight; - BooleanClause c = wc.clause; - if (c.occur() != Occur.SHOULD) { - continue; - } - ScorerSupplier scorer = w.scorerSupplier(context); - if (scorer != null) { - optional.add(scorer); - } - } - - if (optional.size() <= 1) { - return null; - } - - List optionalScorers = new ArrayList<>(); - for (ScorerSupplier ss : optional) { - optionalScorers.add(ss.get(Long.MAX_VALUE)); - } - - return new MaxScoreBulkScorer(context.reader().maxDoc(), optionalScorers); - } - - List optional = new ArrayList(); - for (WeightedBooleanClause wc : weightedClauses) { - Weight w = wc.weight; - BooleanClause c = wc.clause; - if (c.occur() != Occur.SHOULD) { - continue; - } - BulkScorer subScorer = w.bulkScorer(context); - - if (subScorer != null) { - optional.add(subScorer); - } - } - - if (optional.size() == 0) { - return null; - } - - if (query.getMinimumNumberShouldMatch() > optional.size()) { - return null; - } - - if (optional.size() == 1) { - return optional.get(0); - } - - return new BooleanScorer( - optional, Math.max(1, query.getMinimumNumberShouldMatch()), scoreMode.needsScores()); - } - @Override public int count(LeafReaderContext context) throws IOException { final int numDocs = context.reader().numDocs(); diff --git a/lucene/core/src/java/org/apache/lucene/search/BulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/BulkScorer.java index fb7805ac6ced..eecf0164a0f1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/BulkScorer.java @@ -27,18 +27,6 @@ */ public abstract class BulkScorer { - /** - * Scores and collects all matching documents. - * - * @param collector The collector to which all matching documents are passed. - * @param acceptDocs {@link Bits} that represents the allowed documents to match, or {@code null} - * if they are all allowed to match. - */ - public void score(LeafCollector collector, Bits acceptDocs) throws IOException { - final int next = score(collector, acceptDocs, 0, DocIdSetIterator.NO_MORE_DOCS); - assert next == DocIdSetIterator.NO_MORE_DOCS; - } - /** * Collects matching documents in a range and return an estimation of the next matching document * which is on or after {@code max}. 
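With the two-argument BulkScorer#score convenience overload removed above, callers that want to collect a whole segment pass the full doc id range themselves. A minimal sketch of the equivalent call (the wrapper class and method name below are illustrative, not part of this patch):

    import java.io.IOException;
    import org.apache.lucene.search.BulkScorer;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.LeafCollector;
    import org.apache.lucene.util.Bits;

    final class ScoreAllDocsExample {
      // Collects every matching doc in [0, NO_MORE_DOCS), mirroring the removed convenience overload.
      static void scoreAll(BulkScorer scorer, LeafCollector collector, Bits acceptDocs)
          throws IOException {
        int next = scorer.score(collector, acceptDocs, 0, DocIdSetIterator.NO_MORE_DOCS);
        assert next == DocIdSetIterator.NO_MORE_DOCS;
      }
    }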
diff --git a/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java b/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java index d424abca0789..ba118fa5452e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java +++ b/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java @@ -17,8 +17,6 @@ package org.apache.lucene.search; import java.util.Objects; -import org.apache.lucene.index.IndexReader; // javadocs -import org.apache.lucene.index.Terms; // javadocs /** * Contains statistics for a collection (field). @@ -45,31 +43,41 @@ *

    Be careful when performing calculations on these values because they are represented as 64-bit * integer values, you may need to cast to {@code double} for your use. * + * @param field Field's name. + *

    This value is never {@code null}. + * @param maxDoc The total number of documents in the range [1 .. {@link Long#MAX_VALUE}], + * regardless of whether they all contain values for this field. + *

    This value is always a positive number. @see IndexReader#maxDoc() + * @param docCount The total number of documents that have at least one term for this field, in the + * range [1 .. {@link #maxDoc()}]. + *

    This value is always a positive number, and never exceeds {@link #maxDoc()}. @see + * Terms#getDocCount() + * @param sumTotalTermFreq The total number of tokens for this field, in the range [{@link + * #sumDocFreq()} .. {@link Long#MAX_VALUE}]. This is the "word count" for this field across all + * documents. It is the sum of {@link TermStatistics#totalTermFreq()} across all terms. It is + * also the sum of each document's field length across all documents. + *

    This value is always a positive number, and always at least {@link #sumDocFreq()}. @see + * Terms#getSumTotalTermFreq() + * @param sumDocFreq The total number of posting list entries for this field, in the range [{@link + * #docCount()} .. {@link #sumTotalTermFreq()}]. This is the sum of term-document pairs: the sum + * of {@link TermStatistics#docFreq()} across all terms. It is also the sum of each document's + * unique term count for this field across all documents. + *

    This value is always a positive number, always at least {@link #docCount()}, and never + * exceeds {@link #sumTotalTermFreq()}. @see Terms#getSumDocFreq() * @lucene.experimental */ -public class CollectionStatistics { - private final String field; - private final long maxDoc; - private final long docCount; - private final long sumTotalTermFreq; - private final long sumDocFreq; - +public record CollectionStatistics( + String field, long maxDoc, long docCount, long sumTotalTermFreq, long sumDocFreq) { /** * Creates statistics instance for a collection (field). * - * @param field Field's name - * @param maxDoc total number of documents. - * @param docCount number of documents containing the field. - * @param sumTotalTermFreq number of tokens in the field. - * @param sumDocFreq number of postings list entries for the field. * @throws IllegalArgumentException if {@code maxDoc} is negative or zero. * @throws IllegalArgumentException if {@code docCount} is negative or zero. * @throws IllegalArgumentException if {@code docCount} is more than {@code maxDoc}. * @throws IllegalArgumentException if {@code sumDocFreq} is less than {@code docCount}. * @throws IllegalArgumentException if {@code sumTotalTermFreq} is less than {@code sumDocFreq}. */ - public CollectionStatistics( - String field, long maxDoc, long docCount, long sumTotalTermFreq, long sumDocFreq) { + public CollectionStatistics { Objects.requireNonNull(field); if (maxDoc <= 0) { throw new IllegalArgumentException("maxDoc must be positive, maxDoc: " + maxDoc); @@ -102,94 +110,5 @@ public CollectionStatistics( + ", sumDocFreq: " + sumDocFreq); } - this.field = field; - this.maxDoc = maxDoc; - this.docCount = docCount; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - } - - /** - * The field's name. - * - *

    This value is never {@code null}. - * - * @return field's name, not {@code null} - */ - public final String field() { - return field; - } - - /** - * The total number of documents, regardless of whether they all contain values for this field. - * - *

    This value is always a positive number. - * - * @return total number of documents, in the range [1 .. {@link Long#MAX_VALUE}] - * @see IndexReader#maxDoc() - */ - public final long maxDoc() { - return maxDoc; - } - - /** - * The total number of documents that have at least one term for this field. - * - *

    This value is always a positive number, and never exceeds {@link #maxDoc()}. - * - * @return total number of documents containing this field, in the range [1 .. {@link #maxDoc()}] - * @see Terms#getDocCount() - */ - public final long docCount() { - return docCount; - } - - /** - * The total number of tokens for this field. This is the "word count" for this field across all - * documents. It is the sum of {@link TermStatistics#totalTermFreq()} across all terms. It is also - * the sum of each document's field length across all documents. - * - *

    This value is always a positive number, and always at least {@link #sumDocFreq()}. - * - * @return total number of tokens in the field, in the range [{@link #sumDocFreq()} .. {@link - * Long#MAX_VALUE}] - * @see Terms#getSumTotalTermFreq() - */ - public final long sumTotalTermFreq() { - return sumTotalTermFreq; - } - - /** - * The total number of posting list entries for this field. This is the sum of term-document - * pairs: the sum of {@link TermStatistics#docFreq()} across all terms. It is also the sum of each - * document's unique term count for this field across all documents. - * - *

    This value is always a positive number, always at least {@link #docCount()}, and never - * exceeds {@link #sumTotalTermFreq()}. - * - * @return number of posting list entries, in the range [{@link #docCount()} .. {@link - * #sumTotalTermFreq()}] - * @see Terms#getSumDocFreq() - */ - public final long sumDocFreq() { - return sumDocFreq; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("field="); - sb.append('"'); - sb.append(field()); - sb.append('"'); - sb.append(",maxDoc="); - sb.append(maxDoc()); - sb.append(",docCount="); - sb.append(docCount()); - sb.append(",sumTotalTermFreq="); - sb.append(sumTotalTermFreq()); - sb.append(",sumDocFreq="); - sb.append(sumDocFreq); - return sb.toString(); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/CollectionTerminatedException.java b/lucene/core/src/java/org/apache/lucene/search/CollectionTerminatedException.java index 89f14fff20bc..9a21c882052a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/CollectionTerminatedException.java +++ b/lucene/core/src/java/org/apache/lucene/search/CollectionTerminatedException.java @@ -21,8 +21,8 @@ * the current leaf. * *

    Note: IndexSearcher swallows this exception and never re-throws it. As a consequence, you - * should not catch it when calling {@link IndexSearcher#search} as it is unnecessary and might hide - * misuse of this exception. + * should not catch it when calling the different search methods that {@link IndexSearcher} exposes + * as it is unnecessary and might hide misuse of this exception. */ @SuppressWarnings("serial") public final class CollectionTerminatedException extends RuntimeException { diff --git a/lucene/core/src/java/org/apache/lucene/search/CollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/CollectorManager.java index d9969e03ed42..3c87a94e36d9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/CollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/CollectorManager.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.util.Collection; +import org.apache.lucene.index.LeafReaderContext; /** * A manager of collectors. This class is useful to parallelize execution of search requests and has @@ -31,6 +32,12 @@ * fully collected. * * + *

    Note: Multiple {@link LeafCollector}s may be requested for the same {@link + * LeafReaderContext} via {@link Collector#getLeafCollector(LeafReaderContext)} across the different + * {@link Collector}s returned by {@link #newCollector()}. Any computation or logic that needs to + * happen once per segment requires specific handling in the collector manager implementation, + * because the collection of an entire segment may be split across threads. + * * @see IndexSearcher#search(Query, CollectorManager) * @lucene.experimental */ diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java index 5a53749b6675..8d6bae3e7ea0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java @@ -23,6 +23,7 @@ import java.util.Iterator; import java.util.List; import java.util.Objects; +import java.util.stream.Collectors; import org.apache.lucene.index.LeafReaderContext; /** @@ -44,6 +45,7 @@ public final class DisjunctionMaxQuery extends Query implements Iterable /* The subqueries */ private final Multiset disjuncts = new Multiset<>(); + private final List orderedQueries; // used for toString() /* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */ private final float tieBreakerMultiplier; @@ -65,6 +67,7 @@ public DisjunctionMaxQuery(Collection disjuncts, float tieBreakerMultipli } this.tieBreakerMultiplier = tieBreakerMultiplier; this.disjuncts.addAll(disjuncts); + this.orderedQueries = new ArrayList<>(disjuncts); // order from the caller } /** @@ -295,24 +298,19 @@ public void visit(QueryVisitor visitor) { */ @Override public String toString(String field) { - StringBuilder buffer = new StringBuilder(); - buffer.append("("); - Iterator it = disjuncts.iterator(); - for (int i = 0; it.hasNext(); i++) { - Query subquery = it.next(); - if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens - buffer.append("("); - buffer.append(subquery.toString(field)); - buffer.append(")"); - } else buffer.append(subquery.toString(field)); - if (i != disjuncts.size() - 1) buffer.append(" | "); - } - buffer.append(")"); - if (tieBreakerMultiplier != 0.0f) { - buffer.append("~"); - buffer.append(tieBreakerMultiplier); - } - return buffer.toString(); + return this.orderedQueries.stream() + .map( + subquery -> { + if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens + return "(" + subquery.toString(field) + ")"; + } + return subquery.toString(field); + }) + .collect( + Collectors.joining( + " | ", + "(", + ")" + ((tieBreakerMultiplier != 0.0f) ? 
"~" + tieBreakerMultiplier : ""))); } /** diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java index 20d4dcfefa82..03a5428598d7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java @@ -40,7 +40,7 @@ final class DisjunctionScoreBlockBoundaryPropagator { throw new RuntimeException(e); } }) - .thenComparing(Comparator.comparing(s -> s.iterator().cost())); + .thenComparing(s -> s.iterator().cost()); private final Scorer[] scorers; private final float[] maxScores; diff --git a/lucene/core/src/java/org/apache/lucene/document/DocValuesRangeIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocValuesRangeIterator.java similarity index 73% rename from lucene/core/src/java/org/apache/lucene/document/DocValuesRangeIterator.java rename to lucene/core/src/java/org/apache/lucene/search/DocValuesRangeIterator.java index fbefe128cca8..f7cbfc490520 100644 --- a/lucene/core/src/java/org/apache/lucene/document/DocValuesRangeIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocValuesRangeIterator.java @@ -14,18 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.document; +package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.DocValuesSkipper; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.TwoPhaseIterator; /** * Wrapper around a {@link TwoPhaseIterator} for a doc-values range query that speeds things up by * taking advantage of a {@link DocValuesSkipper}. + * + * @lucene.experimental */ -final class DocValuesRangeIterator extends TwoPhaseIterator { +public final class DocValuesRangeIterator extends TwoPhaseIterator { enum Match { /** None of the documents in the range match */ @@ -41,19 +41,29 @@ enum Match { private final Approximation approximation; private final TwoPhaseIterator innerTwoPhase; - DocValuesRangeIterator( - TwoPhaseIterator twoPhase, DocValuesSkipper skipper, long lowerValue, long upperValue) { - super(new Approximation(twoPhase.approximation(), skipper, lowerValue, upperValue)); + public DocValuesRangeIterator( + TwoPhaseIterator twoPhase, + DocValuesSkipper skipper, + long lowerValue, + long upperValue, + boolean queryRangeHasGaps) { + super( + queryRangeHasGaps + ? 
new RangeWithGapsApproximation( + twoPhase.approximation(), skipper, lowerValue, upperValue) + : new RangeNoGapsApproximation( + twoPhase.approximation(), skipper, lowerValue, upperValue)); this.approximation = (Approximation) approximation(); this.innerTwoPhase = twoPhase; } - static class Approximation extends DocIdSetIterator { + abstract static class Approximation extends DocIdSetIterator { private final DocIdSetIterator innerApproximation; - private final DocValuesSkipper skipper; - private final long lowerValue; - private final long upperValue; + + protected final DocValuesSkipper skipper; + protected final long lowerValue; + protected final long upperValue; private int doc = -1; @@ -137,7 +147,21 @@ public long cost() { return innerApproximation.cost(); } - private Match match(int level) { + protected abstract Match match(int level); + } + + private static final class RangeNoGapsApproximation extends Approximation { + + RangeNoGapsApproximation( + DocIdSetIterator innerApproximation, + DocValuesSkipper skipper, + long lowerValue, + long upperValue) { + super(innerApproximation, skipper, lowerValue, upperValue); + } + + @Override + protected Match match(int level) { long minValue = skipper.minValue(level); long maxValue = skipper.maxValue(level); if (minValue > upperValue || maxValue < lowerValue) { @@ -154,6 +178,28 @@ private Match match(int level) { } } + private static final class RangeWithGapsApproximation extends Approximation { + + RangeWithGapsApproximation( + DocIdSetIterator innerApproximation, + DocValuesSkipper skipper, + long lowerValue, + long upperValue) { + super(innerApproximation, skipper, lowerValue, upperValue); + } + + @Override + protected Match match(int level) { + long minValue = skipper.minValue(level); + long maxValue = skipper.maxValue(level); + if (minValue > upperValue || maxValue < lowerValue) { + return Match.NO; + } else { + return Match.MAYBE; + } + } + } + @Override public final boolean matches() throws IOException { return switch (approximation.match) { diff --git a/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java b/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java index ce179cf85d2e..eb4d06259401 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java @@ -18,6 +18,7 @@ import java.io.IOException; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; @@ -166,27 +167,29 @@ public Scorer get(long leadCost) throws IOException { return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty()); } + // Leverage a DV skipper if one was indexed for the field: + DocValuesSkipper skipper = context.reader().getDocValuesSkipper(query.field); + // Create a bit set for the "term set" ordinals (these are the terms provided by the // query that are actually present in the doc values field). 
Cannot use FixedBitSet // because we require long index (ord): final LongBitSet termSet = new LongBitSet(values.getValueCount()); + long minOrd = termsEnum.ord(); + assert minOrd >= 0; long maxOrd = -1; do { long ord = termsEnum.ord(); - if (ord >= 0) { - assert ord > maxOrd; - maxOrd = ord; - termSet.set(ord); - } + assert ord >= 0 && ord > maxOrd; + maxOrd = ord; + termSet.set(ord); } while (termsEnum.next() != null); - // no terms matched in this segment - if (maxOrd < 0) { + if (skipper != null && (minOrd > skipper.maxValue() || maxOrd < skipper.minValue())) { return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty()); } final SortedDocValues singleton = DocValues.unwrapSingleton(values); - final TwoPhaseIterator iterator; + TwoPhaseIterator iterator; final long max = maxOrd; if (singleton != null) { iterator = @@ -224,6 +227,9 @@ public float matchCost() { }; } + if (skipper != null) { + iterator = new DocValuesRangeIterator(iterator, skipper, minOrd, maxOrd, true); + } return new ConstantScoreScorer(score(), scoreMode, iterator); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java index 000c4b2c3151..2650fb164cba 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java +++ b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.search.comparators.DoubleComparator; +import org.apache.lucene.util.NumericUtils; /** * Base class for producing {@link DoubleValues} @@ -115,6 +116,70 @@ public final LongValuesSource toLongValuesSource() { return new LongDoubleValuesSource(this); } + /** Convert to {@link LongValuesSource} by calling {@link NumericUtils#doubleToSortableLong} */ + public final LongValuesSource toSortableLongDoubleValuesSource() { + return new SortableLongDoubleValuesSource(this); + } + + private static class SortableLongDoubleValuesSource extends LongValuesSource { + + private final DoubleValuesSource inner; + + private SortableLongDoubleValuesSource(DoubleValuesSource inner) { + this.inner = Objects.requireNonNull(inner); + } + + @Override + public LongValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException { + DoubleValues in = inner.getValues(ctx, scores); + + return new LongValues() { + @Override + public long longValue() throws IOException { + return NumericUtils.doubleToSortableLong(in.doubleValue()); + } + + @Override + public boolean advanceExact(int doc) throws IOException { + return in.advanceExact(doc); + } + }; + } + + @Override + public boolean needsScores() { + return inner.needsScores(); + } + + @Override + public int hashCode() { + return inner.hashCode(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + SortableLongDoubleValuesSource that = (SortableLongDoubleValuesSource) o; + return Objects.equals(inner, that.inner); + } + + @Override + public String toString() { + return "sortableLong(" + inner.toString() + ")"; + } + + @Override + public LongValuesSource rewrite(IndexSearcher searcher) throws IOException { + return inner.rewrite(searcher).toLongValuesSource(); + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return false; + } + } + private static class LongDoubleValuesSource extends LongValuesSource { private 
final DoubleValuesSource inner; diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java b/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java index 2a810e674e9d..aad2ac43edbf 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java @@ -134,7 +134,8 @@ public void disableSkipping() {} /** * Sorts by descending relevance. NOTE: if you are sorting only by descending relevance and then * secondarily by ascending docID, performance is faster using {@link TopScoreDocCollector} - * directly (which {@link IndexSearcher#search} uses when no {@link Sort} is specified). + * directly (which {@link IndexSearcher#search(Query, int)} uses when no {@link Sort} is + * specified). */ public static final class RelevanceComparator extends FieldComparator implements LeafFieldComparator { diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java b/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java index 409bcbc0b643..adaace27727e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java @@ -181,8 +181,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti } else if (fieldInfo.getVectorDimension() != 0) { // the field indexes vectors iterator = switch (fieldInfo.getVectorEncoding()) { - case FLOAT32 -> context.reader().getFloatVectorValues(field); - case BYTE -> context.reader().getByteVectorValues(field); + case FLOAT32 -> context.reader().getFloatVectorValues(field).iterator(); + case BYTE -> context.reader().getByteVectorValues(field).iterator(); }; } else if (fieldInfo.getDocValuesType() != DocValuesType.NONE) { // the field indexes doc values diff --git a/lucene/core/src/java/org/apache/lucene/search/HitsThresholdChecker.java b/lucene/core/src/java/org/apache/lucene/search/HitsThresholdChecker.java deleted file mode 100644 index 43ff4fecdbb1..000000000000 --- a/lucene/core/src/java/org/apache/lucene/search/HitsThresholdChecker.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.search; - -import java.util.concurrent.atomic.LongAdder; - -/** Used for defining custom algorithms to allow searches to early terminate */ -abstract class HitsThresholdChecker { - /** Implementation of HitsThresholdChecker which allows global hit counting */ - private static class GlobalHitsThresholdChecker extends HitsThresholdChecker { - private final LongAdder globalHitCount = new LongAdder(); - // Cache whether the threshold has been reached already. 
It is not volatile or synchronized on - // purpose to contain the overhead of reading the value similarly to what String#hashCode() - // does. This does not affect correctness. - private boolean thresholdReached = false; - - GlobalHitsThresholdChecker(int totalHitsThreshold) { - super(totalHitsThreshold); - assert totalHitsThreshold != Integer.MAX_VALUE; - } - - @Override - void incrementHitCount() { - if (thresholdReached == false) { - globalHitCount.increment(); - } - } - - @Override - boolean isThresholdReached() { - if (thresholdReached) { - return true; - } - return thresholdReached = globalHitCount.longValue() > getHitsThreshold(); - } - - @Override - ScoreMode scoreMode() { - return ScoreMode.TOP_SCORES; - } - } - - /** Default implementation of HitsThresholdChecker to be used for single threaded execution */ - private static class LocalHitsThresholdChecker extends HitsThresholdChecker { - private int hitCount; - - LocalHitsThresholdChecker(int totalHitsThreshold) { - super(totalHitsThreshold); - assert totalHitsThreshold != Integer.MAX_VALUE; - } - - @Override - void incrementHitCount() { - ++hitCount; - } - - @Override - boolean isThresholdReached() { - return hitCount > getHitsThreshold(); - } - - @Override - ScoreMode scoreMode() { - return ScoreMode.TOP_SCORES; - } - } - - /** - * No-op implementation of {@link HitsThresholdChecker} that does no counting, as the threshold - * can never be reached. This is useful for cases where early termination is never desired, so - * that the overhead of counting hits can be avoided. - */ - private static final HitsThresholdChecker EXACT_HITS_COUNT_THRESHOLD_CHECKER = - new HitsThresholdChecker(Integer.MAX_VALUE) { - @Override - void incrementHitCount() { - // noop - } - - @Override - boolean isThresholdReached() { - return false; - } - - @Override - ScoreMode scoreMode() { - return ScoreMode.COMPLETE; - } - }; - - /* - * Returns a threshold checker that is useful for single threaded searches - */ - static HitsThresholdChecker create(final int totalHitsThreshold) { - return totalHitsThreshold == Integer.MAX_VALUE - ? HitsThresholdChecker.EXACT_HITS_COUNT_THRESHOLD_CHECKER - : new LocalHitsThresholdChecker(totalHitsThreshold); - } - - /* - * Returns a threshold checker that is based on a shared counter - */ - static HitsThresholdChecker createShared(final int totalHitsThreshold) { - return totalHitsThreshold == Integer.MAX_VALUE - ? 
HitsThresholdChecker.EXACT_HITS_COUNT_THRESHOLD_CHECKER - : new GlobalHitsThresholdChecker(totalHitsThreshold); - } - - private final int totalHitsThreshold; - - HitsThresholdChecker(int totalHitsThreshold) { - if (totalHitsThreshold < 0) { - throw new IllegalArgumentException( - "totalHitsThreshold must be >= 0, got " + totalHitsThreshold); - } - this.totalHitsThreshold = totalHitsThreshold; - } - - final int getHitsThreshold() { - return totalHitsThreshold; - } - - abstract boolean isThresholdReached(); - - abstract ScoreMode scoreMode(); - - abstract void incrementHitCount(); -} diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsDISI.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsDISI.java index 5ab19747246a..355aa774813e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ImpactsDISI.java +++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsDISI.java @@ -106,6 +106,10 @@ public int advance(int target) throws IOException { @Override public int nextDoc() throws IOException { + DocIdSetIterator in = this.in; + if (in.docID() < upTo) { + return in.nextDoc(); + } return advance(in.docID() + 1); } diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java index 77d6edf34a05..6e8bbf81966e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java @@ -21,11 +21,12 @@ import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.Executor; -import java.util.function.Function; import java.util.function.Supplier; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -113,13 +114,7 @@ public class IndexSearcher { protected final IndexReaderContext readerContext; protected final List leafContexts; - /** - * Used with executor - LeafSlice supplier where each slice holds a set of leafs executed within - * one thread. We are caching it instead of creating it eagerly to avoid calling a protected - * method from constructor, which is a bad practice. Always non-null, regardless of whether an - * executor is provided or not. - */ - private final Supplier leafSlicesSupplier; + private volatile LeafSlice[] leafSlices; // Used internally for load balancing threads executing for the query private final TaskExecutor taskExecutor; @@ -228,14 +223,18 @@ public IndexSearcher(IndexReaderContext context, Executor executor) { executor == null ? new TaskExecutor(Runnable::run) : new TaskExecutor(executor); this.readerContext = context; leafContexts = context.leaves(); - Function, LeafSlice[]> slicesProvider = - executor == null - ? leaves -> - leaves.isEmpty() - ? new LeafSlice[0] - : new LeafSlice[] {new LeafSlice(new ArrayList<>(leaves))} - : this::slices; - leafSlicesSupplier = new CachingLeafSlicesSupplier(slicesProvider, leafContexts); + if (executor == null) { + leafSlices = + leafContexts.isEmpty() + ? new LeafSlice[0] + : new LeafSlice[] { + new LeafSlice( + new ArrayList<>( + leafContexts.stream() + .map(LeafReaderContextPartition::createForEntireSegment) + .toList())) + }; + } } /** @@ -319,21 +318,97 @@ public QueryCachingPolicy getQueryCachingPolicy() { /** * Expert: Creates an array of leaf slices each holding a subset of the given leaves. 
Each {@link * LeafSlice} is executed in a single thread. By default, segments with more than - * MAX_DOCS_PER_SLICE will get their own thread + * MAX_DOCS_PER_SLICE will get their own thread. + * + *

    It is possible to leverage intra-segment concurrency by splitting segments into multiple + * partitions. Such behaviour is not enabled by default as there is still a performance penalty + * for queries that require segment-level computation ahead of time, such as points/range queries. + * This is an implementation limitation that we expect to improve in future releases, see the corresponding github issue. */ protected LeafSlice[] slices(List leaves) { - return slices(leaves, MAX_DOCS_PER_SLICE, MAX_SEGMENTS_PER_SLICE); + return slices(leaves, MAX_DOCS_PER_SLICE, MAX_SEGMENTS_PER_SLICE, false); } - /** Static method to segregate LeafReaderContexts amongst multiple slices */ + /** + * Static method to segregate LeafReaderContexts amongst multiple slices. Creates slices according + * to the provided max number of documents per slice and max number of segments per slice. Splits + * segments into partitions when the last argument is true. + * + * @param leaves the leaves to slice + * @param maxDocsPerSlice the maximum number of documents in a single slice + * @param maxSegmentsPerSlice the maximum number of segments in a single slice + * @param allowSegmentPartitions whether segments may be split into partitions according to the + * provided maxDocsPerSlice argument. When true, if a segment holds more + * documents than the provided max docs per slice, it is split into equal size partitions that + * each gets its own slice assigned. + * @return the array of slices + */ public static LeafSlice[] slices( - List leaves, int maxDocsPerSlice, int maxSegmentsPerSlice) { + List leaves, + int maxDocsPerSlice, + int maxSegmentsPerSlice, + boolean allowSegmentPartitions) { + // Make a copy so we can sort: List sortedLeaves = new ArrayList<>(leaves); // Sort by maxDoc, descending: - Collections.sort( - sortedLeaves, Collections.reverseOrder(Comparator.comparingInt(l -> l.reader().maxDoc()))); + sortedLeaves.sort(Collections.reverseOrder(Comparator.comparingInt(l -> l.reader().maxDoc()))); + + if (allowSegmentPartitions) { + final List> groupedLeafPartitions = new ArrayList<>(); + int currentSliceNumDocs = 0; + List group = null; + for (LeafReaderContext ctx : sortedLeaves) { + if (ctx.reader().maxDoc() > maxDocsPerSlice) { + assert group == null; + // if the segment does not fit in a single slice, we split it into maximum 5 partitions of + // equal size + int numSlices = Math.min(5, Math.ceilDiv(ctx.reader().maxDoc(), maxDocsPerSlice)); + int numDocs = ctx.reader().maxDoc() / numSlices; + int maxDocId = numDocs; + int minDocId = 0; + for (int i = 0; i < numSlices - 1; i++) { + groupedLeafPartitions.add( + Collections.singletonList( + LeafReaderContextPartition.createFromAndTo(ctx, minDocId, maxDocId))); + minDocId = maxDocId; + maxDocId += numDocs; + } + // the last slice gets all the remaining docs + groupedLeafPartitions.add( + Collections.singletonList( + LeafReaderContextPartition.createFromAndTo( + ctx, minDocId, ctx.reader().maxDoc()))); + } else { + if (group == null) { + group = new ArrayList<>(); + groupedLeafPartitions.add(group); + } + group.add(LeafReaderContextPartition.createForEntireSegment(ctx)); + + currentSliceNumDocs += ctx.reader().maxDoc(); + // We only split a segment when it does not fit entirely in a slice. We don't partition + // the + // segment that makes the current slice (which holds multiple segments) go over + // maxDocsPerSlice. This means that a slice either contains multiple entire segments, or a + // single partition of a segment. 
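As a worked illustration of the split arithmetic in this branch (the numbers below are hypothetical, not taken from the change itself), a segment that does not fit into a single slice is divided into at most five partitions of roughly equal size, with the last partition absorbing the remainder:

    int maxDoc = 250_000;           // hypothetical segment size
    int maxDocsPerSlice = 100_000;  // hypothetical per-slice limit
    int numSlices = Math.min(5, Math.ceilDiv(maxDoc, maxDocsPerSlice)); // = 3
    int numDocs = maxDoc / numSlices;                                   // = 83_333 docs per partition
    // resulting partitions: [0, 83_333), [83_333, 166_666), [166_666, 250_000)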
+ if (group.size() >= maxSegmentsPerSlice || currentSliceNumDocs > maxDocsPerSlice) { + group = null; + currentSliceNumDocs = 0; + } + } + } + + LeafSlice[] slices = new LeafSlice[groupedLeafPartitions.size()]; + int upto = 0; + for (List currentGroup : groupedLeafPartitions) { + slices[upto] = new LeafSlice(currentGroup); + ++upto; + } + return slices; + } final List> groupedLeaves = new ArrayList<>(); long docSum = 0; @@ -363,7 +438,12 @@ public static LeafSlice[] slices( LeafSlice[] slices = new LeafSlice[groupedLeaves.size()]; int upto = 0; for (List currentLeaf : groupedLeaves) { - slices[upto] = new LeafSlice(currentLeaf); + slices[upto] = + new LeafSlice( + new ArrayList<>( + currentLeaf.stream() + .map(LeafReaderContextPartition::createForEntireSegment) + .toList())); ++upto; } @@ -441,7 +521,7 @@ public int count(Query query) throws IOException { return countTerm1 + countTerm2 - count(queries[2]); } } - return search(new ConstantScoreQuery(query), new TotalHitCountCollectorManager()); + return search(new ConstantScoreQuery(query), new TotalHitCountCollectorManager(getSlices())); } /** @@ -451,7 +531,43 @@ public int count(Query query) throws IOException { * @lucene.experimental */ public final LeafSlice[] getSlices() { - return leafSlicesSupplier.get(); + LeafSlice[] res = leafSlices; + if (res == null) { + res = computeAndCacheSlices(); + } + return res; + } + + private synchronized LeafSlice[] computeAndCacheSlices() { + LeafSlice[] res = leafSlices; + if (res == null) { + res = slices(leafContexts); + /* + * Enforce that there aren't multiple leaf partitions within the same leaf slice pointing to the + * same leaf context. It is a requirement that {@link Collector#getLeafCollector(LeafReaderContext)} + * gets called once per leaf context. Also, it does not make sense to partition a segment to then search + * those partitions as part of the same slice, because the goal of partitioning is parallel searching + * which happens at the slice level. + */ + for (LeafSlice leafSlice : res) { + if (leafSlice.partitions.length <= 1) { + continue; + } + enforceDistinctLeaves(leafSlice); + } + leafSlices = res; + } + return res; + } + + private static void enforceDistinctLeaves(LeafSlice leafSlice) { + Set distinctLeaves = new HashSet<>(); + for (LeafReaderContextPartition leafPartition : leafSlice.partitions) { + if (distinctLeaves.add(leafPartition.ctx) == false) { + throw new IllegalStateException( + "The same slice targets multiple leaf partitions of the same leaf reader context. 
A physical segment should rather get partitioned to be searched concurrently from as many slices as the number of leaf partitions it is split into."); + } + } } /** @@ -475,10 +591,8 @@ public TopDocs searchAfter(ScoreDoc after, Query query, int numHits) throws IOEx } final int cappedNumHits = Math.min(numHits, limit); - final boolean supportsConcurrency = getSlices().length > 1; CollectorManager manager = - new TopScoreDocCollectorManager( - cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency); + new TopScoreDocCollectorManager(cappedNumHits, after, TOTAL_HITS_THRESHOLD); return search(query, manager); } @@ -517,9 +631,13 @@ public TopDocs search(Query query, int n) throws IOException { * CollectorManager)} due to its support for concurrency in IndexSearcher */ @Deprecated - public void search(Query query, Collector results) throws IOException { - query = rewrite(query, results.scoreMode().needsScores()); - search(leafContexts, createWeight(query, results.scoreMode(), 1), results); + public void search(Query query, Collector collector) throws IOException { + query = rewrite(query, collector.scoreMode().needsScores()); + Weight weight = createWeight(query, collector.scoreMode(), 1); + collector.setWeight(weight); + for (LeafReaderContext ctx : leafContexts) { // search each subreader + searchLeaf(ctx, 0, DocIdSetIterator.NO_MORE_DOCS, weight, collector); + } } /** Returns true if any search hit the {@link #setTimeout(QueryTimeout) timeout}. */ @@ -606,12 +724,9 @@ private TopFieldDocs searchAfter( } final int cappedNumHits = Math.min(numHits, limit); final Sort rewrittenSort = sort.rewrite(this); - final LeafSlice[] leafSlices = getSlices(); - final boolean supportsConcurrency = leafSlices.length > 1; final CollectorManager manager = - new TopFieldCollectorManager( - rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency); + new TopFieldCollectorManager(rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD); TopFieldDocs topDocs = search(query, manager); if (doDocScores) { @@ -658,11 +773,11 @@ private T search( } final List> listTasks = new ArrayList<>(leafSlices.length); for (int i = 0; i < leafSlices.length; ++i) { - final LeafReaderContext[] leaves = leafSlices[i].leaves; + final LeafReaderContextPartition[] leaves = leafSlices[i].partitions; final C collector = collectors.get(i); listTasks.add( () -> { - search(Arrays.asList(leaves), weight, collector); + search(leaves, weight, collector); return collector; }); } @@ -674,28 +789,25 @@ private T search( /** * Lower-level search API. * - *

    {@link #searchLeaf(LeafReaderContext, Weight, Collector)} is called for every leaf + *

    {@link #searchLeaf(LeafReaderContext, int, int, Weight, Collector)} is called for every leaf * partition.
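Since searchLeaf is now invoked once per partition, a single segment may end up being collected by several collectors created from the same CollectorManager, each on its own thread. A minimal sketch of a manager that stays correct under this behaviour, reusing the existing TotalHitCountCollector purely for illustration ('searcher' and 'query' are assumed to be in scope):

    CollectorManager<TotalHitCountCollector, Integer> manager =
        new CollectorManager<>() {
          @Override
          public TotalHitCountCollector newCollector() {
            // one collector per slice; no state is shared across threads
            return new TotalHitCountCollector();
          }

          @Override
          public Integer reduce(Collection<TotalHitCountCollector> collectors) {
            // summing per-collector counts is safe no matter how segments were partitioned
            int total = 0;
            for (TotalHitCountCollector collector : collectors) {
              total += collector.getTotalHits();
            }
            return total;
          }
        };
    int totalHits = searcher.search(query, manager);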
    * - *

    NOTE: this method executes the searches on all given leaves exclusively. To search across - * all the searchers leaves use {@link #leafContexts}. + *

    NOTE: this method executes the searches on all given leaf partitions exclusively. To search + * across all the searchers leaves use {@link #leafContexts}. * - * @param leaves the searchers leaves to execute the searches on + * @param partitions the leaf partitions to execute the searches on * @param weight to match documents * @param collector to receive hits * @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()} * clauses. */ - protected void search(List leaves, Weight weight, Collector collector) + protected void search(LeafReaderContextPartition[] partitions, Weight weight, Collector collector) throws IOException { collector.setWeight(weight); - // TODO: should we make this - // threaded...? the Collector could be sync'd? - // always use single thread: - for (LeafReaderContext ctx : leaves) { // search each subreader - searchLeaf(ctx, weight, collector); + for (LeafReaderContextPartition partition : partitions) { // search each subreader partition + searchLeaf(partition.ctx, partition.minDocId, partition.maxDocId, weight, collector); } } @@ -705,12 +817,15 @@ protected void search(List leaves, Weight weight, Collector c *

    {@link LeafCollector#collect(int)} is called for every document.
    * * @param ctx the leaf to execute the search against + * @param minDocId the lower bound of the doc id range to search + * @param maxDocId the upper bound of the doc id range to search * @param weight to match document * @param collector to receive hits * @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()} * clauses. */ - protected void searchLeaf(LeafReaderContext ctx, Weight weight, Collector collector) + protected void searchLeaf( + LeafReaderContext ctx, int minDocId, int maxDocId, Weight weight, Collector collector) throws IOException { final LeafCollector leafCollector; try { @@ -730,7 +845,7 @@ protected void searchLeaf(LeafReaderContext ctx, Weight weight, Collector collec scorer = new TimeLimitingBulkScorer(scorer, queryTimeout); } try { - scorer.score(leafCollector, ctx.reader().getLiveDocs()); + scorer.score(leafCollector, ctx.reader().getLiveDocs(), minDocId, maxDocId); } catch ( @SuppressWarnings("unused") CollectionTerminatedException e) { @@ -879,7 +994,8 @@ public IndexReaderContext getTopReaderContext() { /** * A class holding a subset of the {@link IndexSearcher}s leaf contexts to be executed within a - * single thread. + * single thread. A leaf slice holds references to one or more {@link LeafReaderContextPartition} + * instances. Each partition targets a specific doc id range of a {@link LeafReaderContext}. * * @lucene.experimental */ @@ -890,11 +1006,95 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] partitions; + + private final int maxDocs; + + public LeafSlice(List leafReaderContextPartitions) { + Comparator docBaseComparator = + Comparator.comparingInt(l -> l.ctx.docBase); + Comparator minDocIdComparator = + Comparator.comparingInt(l -> l.minDocId); + leafReaderContextPartitions.sort(docBaseComparator.thenComparing(minDocIdComparator)); + this.partitions = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + this.maxDocs = + Arrays.stream(partitions) + .map(leafPartition -> leafPartition.maxDocs) + .reduce(Integer::sum) + .get(); + } - public LeafSlice(List leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. + */ + public int getMaxDocs() { + return maxDocs; + } + } + + /** + * Holds information about a specific leaf context and the corresponding range of doc ids to + * search within. Used to optionally search across partitions of the same segment concurrently. + * + *
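A possible usage sketch (the names and limits here are illustrative assumptions, not part of this change): because intra-segment concurrency is opt-in, a custom searcher can enable it by overriding slices(...) and delegating to the static four-argument overload with allowSegmentPartitions set to true; the resulting slices then hold the leaf partitions described here.

    IndexSearcher searcher =
        new IndexSearcher(reader, executor) { // 'reader' and 'executor' are assumed to exist
          @Override
          protected LeafSlice[] slices(List<LeafReaderContext> leaves) {
            // hypothetical limits: 250k docs per slice, 5 segments per slice, partitioning enabled
            return slices(leaves, 250_000, 5, true);
          }
        };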

    A partition instance can be created via {@link #createForEntireSegment(LeafReaderContext)}, + * in which case it will target the entire provided {@link LeafReaderContext}. A true partition of + * a segment can be created via {@link #createFromAndTo(LeafReaderContext, int, int)} providing + * the minimum doc id (including) to search as well as the max doc id (excluding). + * + * @lucene.experimental + */ + public static final class LeafReaderContextPartition { + public final int minDocId; + public final int maxDocId; + public final LeafReaderContext ctx; + // we keep track of maxDocs separately because we use NO_MORE_DOCS as upper bound when targeting + // the entire segment. We use this only in tests. + private final int maxDocs; + + private LeafReaderContextPartition( + LeafReaderContext leafReaderContext, int minDocId, int maxDocId, int maxDocs) { + if (minDocId >= maxDocId) { + throw new IllegalArgumentException( + "minDocId is greater than or equal to maxDocId: [" + + minDocId + + "] > [" + + maxDocId + + "]"); + } + if (minDocId < 0) { + throw new IllegalArgumentException("minDocId is lower than 0: [" + minDocId + "]"); + } + if (minDocId >= leafReaderContext.reader().maxDoc()) { + throw new IllegalArgumentException( + "minDocId is greater than or equal to maxDoc: [" + + minDocId + + "] > [" + + leafReaderContext.reader().maxDoc() + + "]"); + } + + this.ctx = leafReaderContext; + this.minDocId = minDocId; + this.maxDocId = maxDocId; + this.maxDocs = maxDocs; + } + + /** Creates a partition of the provided leaf context that targets the entire segment */ + public static LeafReaderContextPartition createForEntireSegment(LeafReaderContext ctx) { + return new LeafReaderContextPartition( + ctx, 0, DocIdSetIterator.NO_MORE_DOCS, ctx.reader().maxDoc()); + } + + /** + * Creates a partition of the provided leaf context that targets a subset of the entire segment, + * starting from and including the min doc id provided, until and not including the provided max + * doc id + */ + public static LeafReaderContextPartition createFromAndTo( + LeafReaderContext ctx, int minDocId, int maxDocId) { + assert maxDocId != DocIdSetIterator.NO_MORE_DOCS; + return new LeafReaderContextPartition(ctx, minDocId, maxDocId, maxDocId - minDocId); + } } @@ -991,43 +1191,4 @@ public TooManyNestedClauses() { + IndexSearcher.getMaxClauseCount()); } } - - /** - * Supplier for {@link LeafSlice} slices which computes and caches the value on first invocation - * and returns cached value on subsequent invocation. If the passed in provider for slice - * computation throws exception then same will be passed to the caller of this supplier on each - * invocation. If the provider returns null then {@link NullPointerException} will be thrown to - * the caller. - * - *

    NOTE: To provide thread safe caching mechanism this class is implementing the (subtle) double-checked locking - * idiom - */ - private static class CachingLeafSlicesSupplier implements Supplier { - private volatile LeafSlice[] leafSlices; - - private final Function, LeafSlice[]> sliceProvider; - - private final List leaves; - - private CachingLeafSlicesSupplier( - Function, LeafSlice[]> provider, List leaves) { - this.sliceProvider = Objects.requireNonNull(provider, "leaf slice provider cannot be null"); - this.leaves = Objects.requireNonNull(leaves, "list of LeafReaderContext cannot be null"); - } - - @Override - public LeafSlice[] get() { - if (leafSlices == null) { - synchronized (this) { - if (leafSlices == null) { - leafSlices = - Objects.requireNonNull( - sliceProvider.apply(leaves), "slices computed by the provider is null"); - } - } - } - return leafSlices; - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSortSortedNumericDocValuesRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/IndexSortSortedNumericDocValuesRangeQuery.java index 0c27e4de83ff..3b6ef588dfbf 100644 --- a/lucene/core/src/java/org/apache/lucene/search/IndexSortSortedNumericDocValuesRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/IndexSortSortedNumericDocValuesRangeQuery.java @@ -186,10 +186,44 @@ public boolean isCacheable(LeafReaderContext ctx) { @Override public int count(LeafReaderContext context) throws IOException { if (context.reader().hasDeletions() == false) { - IteratorAndCount itAndCount = getDocIdSetIteratorOrNull(context); + if (lowerValue > upperValue) { + return 0; + } + IteratorAndCount itAndCount = null; + LeafReader reader = context.reader(); + + // first use bkd optimization if possible + SortedNumericDocValues sortedNumericValues = DocValues.getSortedNumeric(reader, field); + NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues); + PointValues pointValues = reader.getPointValues(field); + if (pointValues != null && pointValues.getDocCount() == reader.maxDoc()) { + itAndCount = getDocIdSetIteratorOrNullFromBkd(context, numericValues); + } if (itAndCount != null && itAndCount.count != -1) { return itAndCount.count; } + + // use index sort optimization if possible + Sort indexSort = reader.getMetaData().sort(); + if (indexSort != null + && indexSort.getSort().length > 0 + && indexSort.getSort()[0].getField().equals(field)) { + final SortField sortField = indexSort.getSort()[0]; + final SortField.Type sortFieldType = getSortFieldType(sortField); + // The index sort optimization is only supported for Type.INT and Type.LONG + if (sortFieldType == Type.INT || sortFieldType == Type.LONG) { + Object missingValue = sortField.getMissingValue(); + final long missingLongValue = missingValue == null ? 
0L : (long) missingValue; + // all documents have docValues or missing value falls outside the range + if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc()) + || (missingLongValue < lowerValue || missingLongValue > upperValue)) { + itAndCount = getDocIdSetIterator(sortField, sortFieldType, context, numericValues); + } + if (itAndCount != null && itAndCount.count != -1) { + return itAndCount.count; + } + } + } } return fallbackWeight.count(context); } @@ -398,7 +432,7 @@ private boolean matchAll(PointValues points, byte[] queryLowerPoint, byte[] quer private IteratorAndCount getDocIdSetIteratorOrNullFromBkd( LeafReaderContext context, DocIdSetIterator delegate) throws IOException { - Sort indexSort = context.reader().getMetaData().getSort(); + Sort indexSort = context.reader().getMetaData().sort(); if (indexSort == null || indexSort.getSort().length == 0 || indexSort.getSort()[0].getField().equals(field) == false) { @@ -498,7 +532,7 @@ private IteratorAndCount getDocIdSetIteratorOrNull(LeafReaderContext context) th if (itAndCount != null) { return itAndCount; } - Sort indexSort = context.reader().getMetaData().getSort(); + Sort indexSort = context.reader().getMetaData().sort(); if (indexSort != null && indexSort.getSort().length > 0 && indexSort.getSort()[0].getField().equals(field)) { diff --git a/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java index db5ae4a0d9d2..35144055830c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java @@ -20,7 +20,7 @@ import java.util.Arrays; import java.util.Objects; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.KnnByteVectorField; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.LeafReader; @@ -52,7 +52,7 @@ public class KnnByteVectorQuery extends AbstractKnnVectorQuery { * Find the k nearest documents to the target vector according to the vectors in the * given field. target vector. * - * @param field a field that has been indexed as a {@link KnnFloatVectorField}. + * @param field a field that has been indexed as a {@link KnnByteVectorField}. * @param target the target of the search * @param k the number of documents to find * @throws IllegalArgumentException if k is less than 1 @@ -65,7 +65,7 @@ public KnnByteVectorQuery(String field, byte[] target, int k) { * Find the k nearest documents to the target vector according to the vectors in the * given field. target vector. * - * @param field a field that has been indexed as a {@link KnnFloatVectorField}. + * @param field a field that has been indexed as a {@link KnnByteVectorField}. 
* @param target the target of the search * @param k the number of documents to find * @param filter a filter applied before the vector search @@ -111,7 +111,14 @@ VectorScorer createVectorScorer(LeafReaderContext context, FieldInfo fi) throws @Override public String toString(String field) { - return getClass().getSimpleName() + ":" + this.field + "[" + target[0] + ",...][" + k + "]"; + StringBuilder buffer = new StringBuilder(); + buffer.append(getClass().getSimpleName() + ":"); + buffer.append(this.field + "[" + target[0] + ",...]"); + buffer.append("[" + k + "]"); + if (this.filter != null) { + buffer.append("[" + this.filter + "]"); + } + return buffer.toString(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java index 585893fa3c2a..d2aaf4296eda 100644 --- a/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java @@ -112,7 +112,14 @@ VectorScorer createVectorScorer(LeafReaderContext context, FieldInfo fi) throws @Override public String toString(String field) { - return getClass().getSimpleName() + ":" + this.field + "[" + target[0] + ",...][" + k + "]"; + StringBuilder buffer = new StringBuilder(); + buffer.append(getClass().getSimpleName() + ":"); + buffer.append(this.field + "[" + target[0] + ",...]"); + buffer.append("[" + k + "]"); + if (this.filter != null) { + buffer.append("[" + this.filter + "]"); + } + return buffer.toString(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java b/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java index 7f82e78e056f..9c73b15963da 100644 --- a/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java +++ b/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java @@ -526,7 +526,9 @@ public void collect(int doc) throws IOException { bitSet.set(doc); } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); return new CacheAndCount(new BitDocIdSet(bitSet, count[0]), count[0]); } @@ -544,7 +546,9 @@ public void collect(int doc) throws IOException { builder.add(doc); } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); RoaringDocIdSet cache = builder.build(); return new CacheAndCount(cache, cache.cardinality()); } @@ -816,15 +820,9 @@ public long cost() { @Override public int count(LeafReaderContext context) throws IOException { - // If the wrapped weight can count quickly then use that - int innerCount = in.count(context); - if (innerCount != -1) { - return innerCount; - } - // Our cache won't have an accurate count if there are deletions if (context.reader().hasDeletions()) { - return -1; + return in.count(context); } // Otherwise check if the count is in the cache @@ -834,24 +832,24 @@ public int count(LeafReaderContext context) throws IOException { if (in.isCacheable(context) == false) { // this segment is not suitable for caching - return -1; + return in.count(context); } // Short-circuit: Check whether this segment is eligible for caching // before we take a lock because of #get if (shouldCache(context) == false) { - return -1; + return in.count(context); } final IndexReader.CacheHelper cacheHelper = context.reader().getCoreCacheHelper(); if (cacheHelper == null) { // this reader has no cacheHelper - return -1; + return in.count(context); } // If the lock is already busy, prefer using the uncached version than waiting if (readLock.tryLock() == false) { - return 
-1; + return in.count(context); } CacheAndCount cached; @@ -860,11 +858,12 @@ public int count(LeafReaderContext context) throws IOException { } finally { readLock.unlock(); } - if (cached == null) { - // Not cached - return -1; + if (cached != null) { + // cached + return cached.count(); } - return cached.count(); + // Not cached, check if the wrapped weight can count quickly then use that + return in.count(context); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java b/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java deleted file mode 100644 index 5d8d2e922861..000000000000 --- a/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.search; - -import java.io.IOException; -import java.util.Objects; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.search.similarities.Similarity.SimScorer; - -/** {@link SimScorer} on a specific {@link LeafReader}. */ -public final class LeafSimScorer { - - private final SimScorer scorer; - private final NumericDocValues norms; - - /** Sole constructor: Score documents of {@code reader} with {@code scorer}. */ - public LeafSimScorer(SimScorer scorer, LeafReader reader, String field, boolean needsScores) - throws IOException { - this.scorer = Objects.requireNonNull(scorer); - norms = needsScores ? reader.getNormValues(field) : null; - } - - /** Return the wrapped {@link SimScorer}. */ - public SimScorer getSimScorer() { - return scorer; - } - - private long getNormValue(int doc) throws IOException { - if (norms != null) { - boolean found = norms.advanceExact(doc); - assert found; - return norms.longValue(); - } else { - return 1L; // default norm - } - } - - /** - * Score the provided document assuming the given term document frequency. This method must be - * called on non-decreasing sequences of doc ids. - * - * @see SimScorer#score(float, long) - */ - public float score(int doc, float freq) throws IOException { - return scorer.score(freq, getNormValue(doc)); - } - - /** - * Explain the score for the provided document assuming the given term document frequency. This - * method must be called on non-decreasing sequences of doc ids. 
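For callers that previously relied on the removed LeafSimScorer, the replacement pattern used elsewhere in this patch (for example in the PhraseScorer changes further down) is to read the norms directly and pass them to SimScorer. A hedged sketch, where 'context', 'field', 'doc', 'freq' and 'simScorer' are assumed to be in scope:

    NumericDocValues norms = context.reader().getNormValues(field);
    long norm = 1L; // default norm when the field has no norms or the doc has none
    if (norms != null && norms.advanceExact(doc)) {
      norm = norms.longValue();
    }
    float score = simScorer.score(freq, norm);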
- * - * @see SimScorer#explain(Explanation, long) - */ - public Explanation explain(int doc, Explanation freqExpl) throws IOException { - return scorer.explain(freqExpl, getNormValue(doc)); - } -} diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java index b0f60f8113f0..eac33dbf039d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java @@ -17,7 +17,6 @@ package org.apache.lucene.search; -import java.util.Objects; import java.util.concurrent.atomic.LongAccumulator; /** Maintains the maximum score and its corresponding document id concurrently */ @@ -36,8 +35,8 @@ final class MaxScoreAccumulator { } /** - * Return the max encoded DocAndScore in a way that is consistent with {@link - * DocAndScore#compareTo}. + * Return the max encoded docId and score found in the two longs, following the encoding in {@link + * #accumulate}. */ private static long maxEncode(long v1, long v2) { float score1 = Float.intBitsToFloat((int) (v1 >> 32)); @@ -52,63 +51,21 @@ private static long maxEncode(long v1, long v2) { return v2; } - void accumulate(int docBase, float score) { - assert docBase >= 0 && score >= 0; - long encode = (((long) Float.floatToIntBits(score)) << 32) | docBase; + void accumulate(int docId, float score) { + assert docId >= 0 && score >= 0; + long encode = (((long) Float.floatToIntBits(score)) << 32) | docId; acc.accumulate(encode); } - DocAndScore get() { - long value = acc.get(); - if (value == Long.MIN_VALUE) { - return null; - } - float score = Float.intBitsToFloat((int) (value >> 32)); - int docBase = (int) value; - return new DocAndScore(docBase, score); + public static float toScore(long value) { + return Float.intBitsToFloat((int) (value >> 32)); } - static class DocAndScore implements Comparable { - final int docBase; - final float score; - - DocAndScore(int docBase, float score) { - this.docBase = docBase; - this.score = score; - } - - @Override - public int compareTo(DocAndScore o) { - int cmp = Float.compare(score, o.score); - if (cmp == 0) { - // tie-break on the minimum doc base - // For a given minimum competitive score, we want to know the first segment - // where this score occurred, hence the reverse order here. - // On segments with a lower docBase, any document whose score is greater - // than or equal to this score would be competitive, while on segments with a - // higher docBase, documents need to have a strictly greater score to be - // competitive since we tie break on doc ID. 
- return Integer.compare(o.docBase, docBase); - } - return cmp; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - DocAndScore result = (DocAndScore) o; - return docBase == result.docBase && Float.compare(result.score, score) == 0; - } - - @Override - public int hashCode() { - return Objects.hash(docBase, score); - } + public static int docId(long value) { + return (int) value; + } - @Override - public String toString() { - return "DocAndScore{" + "docBase=" + docBase + ", score=" + score + '}'; - } + long getRaw() { + return acc.get(); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java index 8786343cceca..56857bc67cc1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java @@ -40,6 +40,8 @@ final class MaxScoreBulkScorer extends BulkScorer { // Index of the first scorer that is required, this scorer and all following scorers are required // for a document to match. int firstRequiredScorer; + // The minimum value of minCompetitiveScore that would produce a more favorable partitioning. + float nextMinCompetitiveScore; private final long cost; float minCompetitiveScore; private final Score scorable = new Score(); @@ -64,6 +66,15 @@ final class MaxScoreBulkScorer extends BulkScorer { maxScoreSums = new double[allScorers.length]; } + // Number of outer windows that have been evaluated + private int numOuterWindows; + // Number of candidate matches so far + private int numCandidates; + // Minimum window size. See #computeOuterWindowMax where we have heuristics that adjust the + // minimum window size based on the average number of candidate matches per outer window, to keep + // the per-window overhead under control. + private int minWindowSize = 1; + @Override public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException { collector.setScorer(scorable); @@ -114,9 +125,15 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) thr while (top.doc < outerWindowMax) { scoreInnerWindow(collector, acceptDocs, outerWindowMax); top = essentialQueue.top(); + if (minCompetitiveScore >= nextMinCompetitiveScore) { + // The minimum competitive score increased substantially, so we can now partition scorers + // in a more favorable way. + break; + } } - outerWindowMin = outerWindowMax; + outerWindowMin = Math.min(top.doc, outerWindowMax); + ++numOuterWindows; } return nextCandidate(max); @@ -271,6 +288,23 @@ private int computeOuterWindowMax(int windowMin) throws IOException { windowMax = (int) Math.min(windowMax, upTo + 1L); // upTo is inclusive } + if (allScorers.length - firstWindowLead > 1) { + // The more clauses we consider to compute outer windows, the higher chances that one of these + // clauses has a block boundary in the next few doc IDs. This situation can result in more + // time spent computing maximum scores per outer window than evaluating hits. To avoid such + // situations, we target at least 32 candidate matches per clause per outer window on average, + // to make sure we amortize the cost of computing maximum scores. 
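To put rough numbers on the heuristic described in the comment above (purely illustrative values, not taken from the change): with three clauses considered for the outer window and ten outer windows evaluated so far, the target is 960 candidates; below that the minimum window size doubles, otherwise it resets to one.

    long threshold = 10 * 32L * 3; // = 960 candidate matches targeted so far (illustrative)
    // numCandidates < 960  -> minWindowSize = Math.min(minWindowSize << 1, INNER_WINDOW_SIZE)
    // numCandidates >= 960 -> minWindowSize = 1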
+ long threshold = numOuterWindows * 32L * allScorers.length; + if (numCandidates < threshold) { + minWindowSize = Math.min(minWindowSize << 1, INNER_WINDOW_SIZE); + } else { + minWindowSize = 1; + } + + int minWindowMax = (int) Math.min(Integer.MAX_VALUE, (long) windowMin + minWindowSize); + windowMax = Math.max(windowMax, minWindowMax); + } + return windowMax; } @@ -293,6 +327,9 @@ void updateMaxWindowScores(int windowMin, int windowMax) throws IOException { private void scoreNonEssentialClauses( LeafCollector collector, int doc, double essentialScore, int numNonEssentialClauses) throws IOException { + + ++numCandidates; + double score = essentialScore; for (int i = numNonEssentialClauses - 1; i >= 0; --i) { float maxPossibleScore = @@ -337,6 +374,7 @@ boolean partitionScorers() { }); double maxScoreSum = 0; firstEssentialScorer = 0; + nextMinCompetitiveScore = Float.POSITIVE_INFINITY; for (int i = 0; i < allScorers.length; ++i) { final DisiWrapper w = scratch[i]; double newMaxScoreSum = maxScoreSum + w.maxWindowScore; @@ -349,6 +387,7 @@ boolean partitionScorers() { firstEssentialScorer++; } else { allScorers[allScorers.length - 1 - (i - firstEssentialScorer)] = w; + nextMinCompetitiveScore = Math.min(maxScoreSumFloat, nextMinCompetitiveScore); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java index 2d38370e86af..a449f675daaf 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java @@ -71,7 +71,9 @@ private void ensureCacheSize(int size) { private float computeMaxScore(List impacts) { float maxScore = 0; - for (Impact impact : impacts) { + var scorer = this.scorer; + for (int i = 0, length = impacts.size(); i < length; i++) { + Impact impact = impacts.get(i); maxScore = Math.max(scorer.score(impact.freq, impact.norm), maxScore); } return maxScore; diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java index 5a56286ebed3..8192f6121656 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -313,7 +313,7 @@ public final TermsEnum getTermsEnum(Terms terms) throws IOException { * Return the number of unique terms contained in this query, if known up-front. If not known, -1 * will be returned. 
*/ - public long getTermsCount() throws IOException { + public long getTermsCount() { return -1; } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 48ac987bee30..1c5436dcf0a5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -21,8 +21,8 @@ import java.util.Arrays; import java.util.List; import java.util.Objects; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; @@ -399,10 +399,10 @@ public boolean equals(Object obj) { /** * A guess of the average number of simple operations for the initial seek and buffer refill per * document for the positions of a term. See also {@link - * Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()}. + * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()}. * *

    Aside: Instead of being constant this could depend among others on {@link - * Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link + * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs), * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block * size of the device storing the index. @@ -411,7 +411,7 @@ public boolean equals(Object obj) { /** * Number of simple operations in {@link - * Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill + * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill * is done. */ private static final int TERM_OPS_PER_POS = 7; diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java index 26a1387acda5..69f07e624f23 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java @@ -18,6 +18,8 @@ package org.apache.lucene.search; import java.io.IOException; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.search.similarities.Similarity.SimScorer; class PhraseScorer extends Scorer { @@ -26,16 +28,19 @@ class PhraseScorer extends Scorer { final MaxScoreCache maxScoreCache; final PhraseMatcher matcher; final ScoreMode scoreMode; - private final LeafSimScorer simScorer; + private final SimScorer simScorer; + private final NumericDocValues norms; final float matchCost; private float minCompetitiveScore = 0; private float freq = 0; - PhraseScorer(PhraseMatcher matcher, ScoreMode scoreMode, LeafSimScorer simScorer) { + PhraseScorer( + PhraseMatcher matcher, ScoreMode scoreMode, SimScorer simScorer, NumericDocValues norms) { this.matcher = matcher; this.scoreMode = scoreMode; this.simScorer = simScorer; + this.norms = norms; this.matchCost = matcher.getMatchCost(); this.approximation = matcher.approximation(); this.impactsApproximation = matcher.impactsApproximation(); @@ -50,7 +55,11 @@ public boolean matches() throws IOException { matcher.reset(); if (scoreMode == ScoreMode.TOP_SCORES && minCompetitiveScore > 0) { float maxFreq = matcher.maxFreq(); - if (simScorer.score(docID(), maxFreq) < minCompetitiveScore) { + long norm = 1L; + if (norms != null && norms.advanceExact(docID())) { + norm = norms.longValue(); + } + if (simScorer.score(maxFreq, norm) < minCompetitiveScore) { // The maximum score we could get is less than the min competitive score return false; } @@ -79,7 +88,11 @@ public float score() throws IOException { freq += matcher.sloppyWeight(); } } - return simScorer.score(docID(), freq); + long norm = 1L; + if (norms != null && norms.advanceExact(docID())) { + norm = norms.longValue(); + } + return simScorer.score(freq, norm); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java b/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java index 6aa1d6849682..75439ca7eb0a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java @@ -19,6 +19,7 @@ import java.io.IOException; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.search.similarities.Similarity; import 
org.apache.lucene.search.similarities.Similarity.SimScorer; @@ -63,9 +64,8 @@ protected abstract PhraseMatcher getPhraseMatcher( public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { PhraseMatcher matcher = getPhraseMatcher(context, stats, false); if (matcher == null) return null; - LeafSimScorer simScorer = - new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores()); - final var scorer = new PhraseScorer(matcher, scoreMode, simScorer); + NumericDocValues norms = scoreMode.needsScores() ? context.reader().getNormValues(field) : null; + final var scorer = new PhraseScorer(matcher, scoreMode, stats, norms); return new DefaultScorerSupplier(scorer); } @@ -83,10 +83,13 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio while (matcher.nextMatch()) { freq += matcher.sloppyWeight(); } - LeafSimScorer docScorer = - new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores()); Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq); - Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); + NumericDocValues norms = scoreMode.needsScores() ? context.reader().getNormValues(field) : null; + long norm = 1L; + if (norms != null && norms.advanceExact(doc)) { + norm = norms.longValue(); + } + Explanation scoreExplanation = stats.explain(freqExplanation, norm); return Explanation.match( scoreExplanation.getValue(), "weight(" diff --git a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java index c2efa68a45ba..f0e0cfd6bdb8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java @@ -181,7 +181,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti @Override public Scorer get(long leadCost) throws IOException { DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); - values.intersect(new MergePointVisitor(sortedPackedPoints, result)); + values.intersect(new MergePointVisitor(sortedPackedPoints.iterator(), result)); DocIdSetIterator iterator = result.build().iterator(); return new ConstantScoreScorer(score(), scoreMode, iterator); } @@ -192,7 +192,9 @@ public long cost() { if (cost == -1) { // Computing the cost may be expensive, so only do it if necessary DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); - cost = values.estimateDocCount(new MergePointVisitor(sortedPackedPoints, result)); + cost = + values.estimateDocCount( + new MergePointVisitor(sortedPackedPoints.iterator(), result)); assert cost >= 0; } return cost; @@ -260,18 +262,15 @@ public boolean isCacheable(LeafReaderContext ctx) { private class MergePointVisitor implements IntersectVisitor { private final DocIdSetBuilder result; - private TermIterator iterator; + private final TermIterator iterator; private BytesRef nextQueryPoint; private final ByteArrayComparator comparator; - private final PrefixCodedTerms sortedPackedPoints; private DocIdSetBuilder.BulkAdder adder; - public MergePointVisitor(PrefixCodedTerms sortedPackedPoints, DocIdSetBuilder result) - throws IOException { + public MergePointVisitor(TermIterator iterator, DocIdSetBuilder result) throws IOException { this.result = result; - this.sortedPackedPoints = sortedPackedPoints; this.comparator = ArrayUtil.getUnsignedComparator(bytesPerDim); - this.iterator = this.sortedPackedPoints.iterator(); + 
this.iterator = iterator; nextQueryPoint = iterator.next(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index bfcef92f6f60..1b6d6869c19e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -128,8 +128,8 @@ public final Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, fl private final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim); private boolean matches(byte[] packedValue) { - for (int dim = 0; dim < numDims; dim++) { - int offset = dim * bytesPerDim; + int offset = 0; + for (int dim = 0; dim < numDims; dim++, offset += bytesPerDim) { if (comparator.compare(packedValue, offset, lowerPoint, offset) < 0) { // Doc's value is too low, in this dimension return false; @@ -145,9 +145,9 @@ private boolean matches(byte[] packedValue) { private Relation relate(byte[] minPackedValue, byte[] maxPackedValue) { boolean crosses = false; + int offset = 0; - for (int dim = 0; dim < numDims; dim++) { - int offset = dim * bytesPerDim; + for (int dim = 0; dim < numDims; dim++, offset += bytesPerDim) { if (comparator.compare(minPackedValue, offset, upperPoint, offset) > 0 || comparator.compare(maxPackedValue, offset, lowerPoint, offset) < 0) { diff --git a/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java b/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java index 23a8991dddfa..27d88504b30b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java @@ -50,14 +50,7 @@ public TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int top throws IOException { ScoreDoc[] hits = firstPassTopDocs.scoreDocs.clone(); - Arrays.sort( - hits, - new Comparator() { - @Override - public int compare(ScoreDoc a, ScoreDoc b) { - return a.doc - b.doc; - } - }); + Arrays.sort(hits, (a, b) -> a.doc - b.doc); List leaves = searcher.getIndexReader().leaves(); @@ -111,19 +104,16 @@ public int compare(ScoreDoc a, ScoreDoc b) { } Comparator sortDocComparator = - new Comparator() { - @Override - public int compare(ScoreDoc a, ScoreDoc b) { - // Sort by score descending, then docID ascending: - if (a.score > b.score) { - return -1; - } else if (a.score < b.score) { - return 1; - } else { - // This subtraction can't overflow int - // because docIDs are >= 0: - return a.doc - b.doc; - } + (a, b) -> { + // Sort by score descending, then docID ascending: + if (a.score > b.score) { + return -1; + } else if (a.score < b.score) { + return 1; + } else { + // This subtraction can't overflow int + // because docIDs are >= 0: + return a.doc - b.doc; } }; diff --git a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java index f64f17d94b5a..6f19de234345 100644 --- a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java @@ -85,20 +85,20 @@ public RegexpQuery(Term term, int flags, int determinizeWorkLimit) { * Constructs a query for terms matching term. * * @param term regular expression. - * @param syntax_flags optional RegExp syntax features from {@link RegExp} automaton for the - * regexp can result in. 
Set higher to allow more complex queries and lower to prevent memory + * @param syntaxFlags optional RegExp syntax features from {@link RegExp} automaton for the regexp + * can result in. Set higher to allow more complex queries and lower to prevent memory * exhaustion. - * @param match_flags boolean 'or' of match behavior options such as case insensitivity + * @param matchFlags boolean 'or' of match behavior options such as case insensitivity * @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this * regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion. * Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't * otherwise know what to specify. */ - public RegexpQuery(Term term, int syntax_flags, int match_flags, int determinizeWorkLimit) { + public RegexpQuery(Term term, int syntaxFlags, int matchFlags, int determinizeWorkLimit) { this( term, - syntax_flags, - match_flags, + syntaxFlags, + matchFlags, DEFAULT_PROVIDER, determinizeWorkLimit, CONSTANT_SCORE_BLENDED_REWRITE); @@ -108,7 +108,7 @@ public RegexpQuery(Term term, int syntax_flags, int match_flags, int determinize * Constructs a query for terms matching term. * * @param term regular expression. - * @param syntax_flags optional RegExp features from {@link RegExp} + * @param syntaxFlags optional RegExp features from {@link RegExp} * @param provider custom AutomatonProvider for named automata * @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this * regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion. @@ -116,16 +116,16 @@ public RegexpQuery(Term term, int syntax_flags, int match_flags, int determinize * otherwise know what to specify. */ public RegexpQuery( - Term term, int syntax_flags, AutomatonProvider provider, int determinizeWorkLimit) { - this(term, syntax_flags, 0, provider, determinizeWorkLimit, CONSTANT_SCORE_BLENDED_REWRITE); + Term term, int syntaxFlags, AutomatonProvider provider, int determinizeWorkLimit) { + this(term, syntaxFlags, 0, provider, determinizeWorkLimit, CONSTANT_SCORE_BLENDED_REWRITE); } /** * Constructs a query for terms matching term. * * @param term regular expression. - * @param syntax_flags optional RegExp features from {@link RegExp} - * @param match_flags boolean 'or' of match behavior options such as case insensitivity + * @param syntaxFlags optional RegExp features from {@link RegExp} + * @param matchFlags boolean 'or' of match behavior options such as case insensitivity * @param provider custom AutomatonProvider for named automata * @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this * regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion. @@ -135,20 +135,20 @@ public RegexpQuery( */ public RegexpQuery( Term term, - int syntax_flags, - int match_flags, + int syntaxFlags, + int matchFlags, AutomatonProvider provider, int determinizeWorkLimit, RewriteMethod rewriteMethod) { - this(term, syntax_flags, match_flags, provider, determinizeWorkLimit, rewriteMethod, true); + this(term, syntaxFlags, matchFlags, provider, determinizeWorkLimit, rewriteMethod, true); } /** * Constructs a query for terms matching term. * * @param term regular expression. 
- * @param syntax_flags optional RegExp features from {@link RegExp} - * @param match_flags boolean 'or' of match behavior options such as case insensitivity + * @param syntaxFlags optional RegExp features from {@link RegExp} + * @param matchFlags boolean 'or' of match behavior options such as case insensitivity * @param provider custom AutomatonProvider for named automata * @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this * regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion. @@ -165,8 +165,8 @@ public RegexpQuery( */ public RegexpQuery( Term term, - int syntax_flags, - int match_flags, + int syntaxFlags, + int matchFlags, AutomatonProvider provider, int determinizeWorkLimit, RewriteMethod rewriteMethod, @@ -174,7 +174,7 @@ public RegexpQuery( super( term, toAutomaton( - new RegExp(term.text(), syntax_flags, match_flags), + new RegExp(term.text(), syntaxFlags, matchFlags), determinizeWorkLimit, provider, doDeterminization), diff --git a/lucene/core/src/java/org/apache/lucene/search/Scorable.java b/lucene/core/src/java/org/apache/lucene/search/Scorable.java index 6fd531e7d84d..cfa0f49b6ad5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Scorable.java +++ b/lucene/core/src/java/org/apache/lucene/search/Scorable.java @@ -66,23 +66,12 @@ public Collection getChildren() throws IOException { * A child Scorer and its relationship to its parent. The meaning of the relationship depends upon * the parent query. * + *
    The relationship can be any string that makes sense to the parent Scorer. + * + * @param child Child Scorer. (note this is typically a direct child, and may itself also have + * children). + * @param relationship An arbitrary string relating this scorer to the parent. * @lucene.experimental */ - public static class ChildScorable { - /** Child Scorer. (note this is typically a direct child, and may itself also have children). */ - public final Scorable child; - - /** An arbitrary string relating this scorer to the parent. */ - public final String relationship; - - /** - * Creates a new ChildScorer node with the specified relationship. - * - *
    The relationship can be any string that makes sense to the parent Scorer. - */ - public ChildScorable(Scorable child, String relationship) { - this.child = child; - this.relationship = relationship; - } - } + public record ChildScorable(Scorable child, String relationship) {} } diff --git a/lucene/core/src/java/org/apache/lucene/search/SortRescorer.java b/lucene/core/src/java/org/apache/lucene/search/SortRescorer.java index 68c45bb2c668..e4a4891c570d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SortRescorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/SortRescorer.java @@ -46,9 +46,7 @@ public TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int top List leaves = searcher.getIndexReader().leaves(); TopFieldCollector collector = - new TopFieldCollectorManager( - sort, topN, null, Integer.MAX_VALUE, searcher.getSlices().length > 1) - .newCollector(); + new TopFieldCollectorManager(sort, topN, null, Integer.MAX_VALUE).newCollector(); // Now merge sort docIDs from hits, with reader's leaves: int hitUpto = 0; @@ -102,7 +100,7 @@ public Explanation explain(IndexSearcher searcher, Explanation firstPassExplanat new TotalHits(1, Relation.EQUAL_TO), new ScoreDoc[] {new ScoreDoc(docID, firstPassExplanation.getValue().floatValue())}); TopDocs hits = rescore(searcher, oneHit, 1); - assert hits.totalHits.value == 1; + assert hits.totalHits.value() == 1; List subs = new ArrayList<>(); diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 82a3b6e01489..357f97019f86 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.ImpactsSource; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SlowImpactsEnum; import org.apache.lucene.index.Term; @@ -38,6 +39,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.PriorityQueue; @@ -259,9 +261,13 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio assert scorer instanceof TermScorer; freq = ((TermScorer) scorer).freq(); } - LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), field, true); Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq); - Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); + NumericDocValues norms = context.reader().getNormValues(field); + long norm = 1L; + if (norms != null && norms.advanceExact(doc)) { + norm = norms.longValue(); + } + Explanation scoreExplanation = simWeight.explain(freqExplanation, norm); return Explanation.match( scoreExplanation.getValue(), "weight(" @@ -334,27 +340,27 @@ public Scorer get(long leadCost) throws IOException { return new ConstantScoreScorer(0f, scoreMode, DocIdSetIterator.empty()); } - LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), field, true); + NumericDocValues norms = context.reader().getNormValues(field); // we must optimize this case (term not in segment), disjunctions require 
>= 2 subs if (iterators.size() == 1) { final TermScorer scorer; if (scoreMode == ScoreMode.TOP_SCORES) { - scorer = new TermScorer(impacts.get(0), simScorer); + scorer = new TermScorer(impacts.get(0), simWeight, norms); } else { - scorer = new TermScorer(iterators.get(0), simScorer); + scorer = new TermScorer(iterators.get(0), simWeight, norms); } float boost = termBoosts.get(0); return scoreMode == ScoreMode.COMPLETE_NO_SCORES || boost == 1f ? scorer - : new FreqBoostTermScorer(boost, scorer, simScorer); + : new FreqBoostTermScorer(boost, scorer, simWeight, norms); } else { // we use termscorers + disjunction as an impl detail DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); for (int i = 0; i < iterators.size(); i++) { PostingsEnum postings = iterators.get(i); - final TermScorer termScorer = new TermScorer(postings, simScorer); + final TermScorer termScorer = new TermScorer(postings, simWeight, norms); float boost = termBoosts.get(i); final DisiWrapperFreq wrapper = new DisiWrapperFreq(termScorer, boost); queue.add(wrapper); @@ -368,8 +374,7 @@ public Scorer get(long leadCost) throws IOException { boosts[i] = termBoosts.get(i); } ImpactsSource impactsSource = mergeImpacts(impacts.toArray(new ImpactsEnum[0]), boosts); - MaxScoreCache maxScoreCache = - new MaxScoreCache(impactsSource, simScorer.getSimScorer()); + MaxScoreCache maxScoreCache = new MaxScoreCache(impactsSource, simWeight); ImpactsDISI impactsDisi = new ImpactsDISI(iterator, maxScoreCache); if (scoreMode == ScoreMode.TOP_SCORES) { @@ -379,7 +384,7 @@ public Scorer get(long leadCost) throws IOException { iterator = impactsDisi; } - return new SynonymScorer(queue, iterator, impactsDisi, simScorer); + return new SynonymScorer(queue, iterator, impactsDisi, simWeight, norms); } } @@ -575,18 +580,21 @@ private static class SynonymScorer extends Scorer { private final DocIdSetIterator iterator; private final MaxScoreCache maxScoreCache; private final ImpactsDISI impactsDisi; - private final LeafSimScorer simScorer; + private final SimScorer scorer; + private final NumericDocValues norms; SynonymScorer( DisiPriorityQueue queue, DocIdSetIterator iterator, ImpactsDISI impactsDisi, - LeafSimScorer simScorer) { + SimScorer scorer, + NumericDocValues norms) { this.queue = queue; this.iterator = iterator; this.maxScoreCache = impactsDisi.getMaxScoreCache(); this.impactsDisi = impactsDisi; - this.simScorer = simScorer; + this.scorer = scorer; + this.norms = norms; } @Override @@ -605,7 +613,11 @@ float freq() throws IOException { @Override public float score() throws IOException { - return simScorer.score(iterator.docID(), freq()); + long norm = 1L; + if (norms != null && norms.advanceExact(iterator.docID())) { + norm = norms.longValue(); + } + return scorer.score(freq(), norm); } @Override @@ -647,9 +659,11 @@ float freq() throws IOException { private static class FreqBoostTermScorer extends FilterScorer { final float boost; final TermScorer in; - final LeafSimScorer docScorer; + final SimScorer scorer; + final NumericDocValues norms; - public FreqBoostTermScorer(float boost, TermScorer in, LeafSimScorer docScorer) { + public FreqBoostTermScorer( + float boost, TermScorer in, SimScorer scorer, NumericDocValues norms) { super(in); if (Float.isNaN(boost) || Float.compare(boost, 0f) < 0 || Float.compare(boost, 1f) > 0) { throw new IllegalArgumentException( @@ -657,7 +671,8 @@ public FreqBoostTermScorer(float boost, TermScorer in, LeafSimScorer docScorer) } this.boost = boost; this.in = in; - this.docScorer = docScorer; + 
this.scorer = scorer; + this.norms = norms; } float freq() throws IOException { @@ -666,8 +681,11 @@ float freq() throws IOException { @Override public float score() throws IOException { - assert docID() != DocIdSetIterator.NO_MORE_DOCS; - return docScorer.score(in.docID(), freq()); + long norm = 1L; + if (norms != null && norms.advanceExact(in.docID())) { + norm = norms.longValue(); + } + return scorer.score(freq(), norm); } @Override @@ -686,26 +704,5 @@ public void setMinCompetitiveScore(float minScore) throws IOException { } } - private static class TermAndBoost { - final BytesRef term; - final float boost; - - TermAndBoost(BytesRef term, float boost) { - this.term = term; - this.boost = boost; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - TermAndBoost that = (TermAndBoost) o; - return Float.compare(that.boost, boost) == 0 && Objects.equals(term, that.term); - } - - @Override - public int hashCode() { - return Objects.hash(term, boost); - } - } + private record TermAndBoost(BytesRef term, float boost) {} } diff --git a/lucene/core/src/java/org/apache/lucene/search/TaskExecutor.java b/lucene/core/src/java/org/apache/lucene/search/TaskExecutor.java index 9d2176fd3af0..6c89c267a52f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TaskExecutor.java +++ b/lucene/core/src/java/org/apache/lucene/search/TaskExecutor.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.concurrent.Callable; @@ -73,15 +72,68 @@ public TaskExecutor(Executor executor) { /** * Execute all the callables provided as an argument, wait for them to complete and return the * obtained results. If an exception is thrown by more than one callable, the subsequent ones will - * be added as suppressed exceptions to the first one that was caught. + * be added as suppressed exceptions to the first one that was caught. Additionally, if one task + * throws an exception, all other tasks from the same group are cancelled, to avoid needless + * computation as their results would not be exposed anyways. 
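As a concrete illustration of the invokeAll contract described just above, here is a hedged usage sketch; the example class and the trivial tasks are invented, while the TaskExecutor API is the one shown in this patch.

```java
import java.io.IOException;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.Executor;
import org.apache.lucene.search.TaskExecutor;

class TaskExecutorUsageSketch {
  /** Fans three small tasks out over the executor; at least one runs on the calling thread. */
  static List<Integer> sumParts(Executor executor) throws IOException {
    TaskExecutor taskExecutor = new TaskExecutor(executor);
    List<Callable<Integer>> tasks = List.of(() -> 1, () -> 2, () -> 3);
    // Results come back in task order. If any task threw, the remaining not-yet-started
    // tasks would be cancelled and the first failure rethrown, with later failures
    // attached to it as suppressed exceptions.
    return taskExecutor.invokeAll(tasks);
  }
}
```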
* * @param callables the callables to execute * @return a list containing the results from the tasks execution * @param the return type of the task execution */ public List invokeAll(Collection> callables) throws IOException { - TaskGroup taskGroup = new TaskGroup<>(callables); - return taskGroup.invokeAll(executor); + List> futures = new ArrayList<>(callables.size()); + for (Callable callable : callables) { + futures.add(new Task<>(callable, futures)); + } + final int count = futures.size(); + // taskId provides the first index of an un-executed task in #futures + final AtomicInteger taskId = new AtomicInteger(0); + // we fork execution count - 1 tasks to execute at least one task on the current thread to + // minimize needless forking and blocking of the current thread + if (count > 1) { + final Runnable work = + () -> { + int id = taskId.getAndIncrement(); + if (id < count) { + futures.get(id).run(); + } + }; + for (int j = 0; j < count - 1; j++) { + executor.execute(work); + } + } + // try to execute as many tasks as possible on the current thread to minimize context + // switching in case of long running concurrent + // tasks as well as dead-locking if the current thread is part of #executor for executors that + // have limited or no parallelism + int id; + while ((id = taskId.getAndIncrement()) < count) { + futures.get(id).run(); + if (id >= count - 1) { + // save redundant CAS in case this was the last task + break; + } + } + return collectResults(futures); + } + + private static List collectResults(List> futures) throws IOException { + Throwable exc = null; + List results = new ArrayList<>(futures.size()); + for (Future future : futures) { + try { + results.add(future.get()); + } catch (InterruptedException e) { + exc = IOUtils.useOrSuppress(exc, new ThreadInterruptedException(e)); + } catch (ExecutionException e) { + exc = IOUtils.useOrSuppress(exc, e.getCause()); + } + } + assert assertAllFuturesCompleted(futures) : "Some tasks are still running?"; + if (exc != null) { + throw IOUtils.rethrowAlways(exc); + } + return results; } @Override @@ -89,120 +141,62 @@ public String toString() { return "TaskExecutor(" + "executor=" + executor + ')'; } - /** - * Holds all the sub-tasks that a certain operation gets split into as it gets parallelized and - * exposes the ability to invoke such tasks and wait for them all to complete their execution and - * provide their results. Additionally, if one task throws an exception, all other tasks from the - * same group are cancelled, to avoid needless computation as their results would not be exposed - * anyways. Creates one {@link FutureTask} for each {@link Callable} provided - * - * @param the return type of all the callables - */ - private static final class TaskGroup { - private final List> futures; - - TaskGroup(Collection> callables) { - List> tasks = new ArrayList<>(callables.size()); - for (Callable callable : callables) { - tasks.add(createTask(callable)); + private static boolean assertAllFuturesCompleted(Collection> futures) { + for (Future future : futures) { + if (future.isDone() == false) { + return false; } - this.futures = Collections.unmodifiableList(tasks); } + return true; + } - RunnableFuture createTask(Callable callable) { - AtomicBoolean startedOrCancelled = new AtomicBoolean(false); - return new FutureTask<>( - () -> { - if (startedOrCancelled.compareAndSet(false, true)) { - try { - return callable.call(); - } catch (Throwable t) { - cancelAll(); - throw t; - } - } - // task is cancelled hence it has no results to return. 
That's fine: they would be - // ignored anyway. - return null; - }) { - @Override - public boolean cancel(boolean mayInterruptIfRunning) { - assert mayInterruptIfRunning == false - : "cancelling tasks that are running is not supported"; - /* - Future#get (called in invokeAll) throws CancellationException when invoked against a running task that has been cancelled but - leaves the task running. We rather want to make sure that invokeAll does not leave any running tasks behind when it returns. - Overriding cancel ensures that tasks that are already started will complete normally once cancelled, and Future#get will - wait for them to finish instead of throwing CancellationException. A cleaner way would have been to override FutureTask#get and - make it wait for cancelled tasks, but FutureTask#awaitDone is private. Tasks that are cancelled before they are started will be no-op. - */ - return startedOrCancelled.compareAndSet(false, true); - } - }; + private static void cancelAll(Collection> futures) { + for (Future future : futures) { + future.cancel(false); } + } - List invokeAll(Executor executor) throws IOException { - final int count = futures.size(); - // taskId provides the first index of an un-executed task in #futures - final AtomicInteger taskId = new AtomicInteger(0); - // we fork execution count - 1 tasks to execute at least one task on the current thread to - // minimize needless forking and blocking of the current thread - if (count > 1) { - final Runnable work = - () -> { - int id = taskId.getAndIncrement(); - if (id < count) { - futures.get(id).run(); - } - }; - for (int j = 0; j < count - 1; j++) { - executor.execute(work); - } - } - // try to execute as many tasks as possible on the current thread to minimize context - // switching in case of long running concurrent - // tasks as well as dead-locking if the current thread is part of #executor for executors that - // have limited or no parallelism - int id; - while ((id = taskId.getAndIncrement()) < count) { - futures.get(id).run(); - if (id >= count - 1) { - // save redundant CAS in case this was the last task - break; - } - } - Throwable exc = null; - List results = new ArrayList<>(count); - for (int i = 0; i < count; i++) { - Future future = futures.get(i); - try { - results.add(future.get()); - } catch (InterruptedException e) { - exc = IOUtils.useOrSuppress(exc, new ThreadInterruptedException(e)); - } catch (ExecutionException e) { - exc = IOUtils.useOrSuppress(exc, e.getCause()); - } - } - assert assertAllFuturesCompleted() : "Some tasks are still running?"; - if (exc != null) { - throw IOUtils.rethrowAlways(exc); - } - return results; + private static class Task extends FutureTask { + + private final AtomicBoolean startedOrCancelled = new AtomicBoolean(false); + + private final Collection> futures; + + public Task(Callable callable, Collection> futures) { + super(callable); + this.futures = futures; } - private boolean assertAllFuturesCompleted() { - for (RunnableFuture future : futures) { - if (future.isDone() == false) { - return false; - } + @Override + public void run() { + if (startedOrCancelled.compareAndSet(false, true)) { + super.run(); } - return true; } - private void cancelAll() { - for (Future future : futures) { - future.cancel(false); + @Override + protected void setException(Throwable t) { + super.setException(t); + cancelAll(futures); + } + + @Override + public boolean cancel(boolean mayInterruptIfRunning) { + assert mayInterruptIfRunning == false : "cancelling tasks that are running is not supported"; + 
/* + Future#get (called in #collectResults) throws CancellationException when invoked against a running task that has been cancelled but + leaves the task running. We rather want to make sure that invokeAll does not leave any running tasks behind when it returns. + Overriding cancel ensures that tasks that are already started will complete normally once cancelled, and Future#get will + wait for them to finish instead of throwing CancellationException. A cleaner way would have been to override FutureTask#get and + make it wait for cancelled tasks, but FutureTask#awaitDone is private. Tasks that are cancelled before they are started will be no-op. + */ + if (startedOrCancelled.compareAndSet(false, true)) { + // task is cancelled hence it has no results to return. That's fine: they would be + // ignored anyway. + set(null); + return true; } + return false; } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index 4e1abab09cc1..c82df0ac1ebe 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -27,13 +27,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.BytesRefComparator; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.StringSorter; +import org.apache.lucene.util.*; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; @@ -137,10 +131,20 @@ protected void swap(int i, int j) { } @Override - public long getTermsCount() throws IOException { + public long getTermsCount() { return termData.size(); } + /** + * Get an iterator over the encoded terms for query inspection. 
+ * + * @lucene.experimental + */ + public BytesRefIterator getBytesRefIterator() { + final TermIterator iterator = this.termData.iterator(); + return () -> iterator.next(); + } + @Override public void visit(QueryVisitor visitor) { if (visitor.acceptField(field) == false) { diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index 3a843addcc30..814f74d8e780 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; @@ -150,19 +151,17 @@ public Scorer get(long leadCost) throws IOException { return new ConstantScoreScorer(0f, scoreMode, DocIdSetIterator.empty()); } - LeafSimScorer scorer = - new LeafSimScorer(simScorer, context.reader(), term.field(), scoreMode.needsScores()); + NumericDocValues norms = null; + if (scoreMode.needsScores()) { + norms = context.reader().getNormValues(term.field()); + } + if (scoreMode == ScoreMode.TOP_SCORES) { return new TermScorer( - TermWeight.this, - termsEnum.impacts(PostingsEnum.FREQS), - scorer, - topLevelScoringClause); + termsEnum.impacts(PostingsEnum.FREQS), simScorer, norms, topLevelScoringClause); } else { - return new TermScorer( - termsEnum.postings( - null, scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE), - scorer); + int flags = scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE; + return new TermScorer(termsEnum.postings(null, flags), simScorer, norms); } } @@ -223,11 +222,14 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio int newDoc = scorer.iterator().advance(doc); if (newDoc == doc) { float freq = ((TermScorer) scorer).freq(); - LeafSimScorer docScorer = - new LeafSimScorer(simScorer, context.reader(), term.field(), true); + NumericDocValues norms = context.reader().getNormValues(term.field()); + long norm = 1L; + if (norms != null && norms.advanceExact(doc)) { + norm = norms.longValue(); + } Explanation freqExplanation = Explanation.match(freq, "freq, occurrences of term within document"); - Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); + Explanation scoreExplanation = simScorer.explain(freqExplanation, norm); return Explanation.match( scoreExplanation.getValue(), "weight(" diff --git a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java index 22c6102b9584..4b53788f233e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java @@ -18,8 +18,10 @@ import java.io.IOException; import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SlowImpactsEnum; +import org.apache.lucene.search.similarities.Similarity.SimScorer; /** * Expert: A Scorer for documents matching a Term. 
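The TermQuery and TermScorer hunks in this patch repeat one idiom: with LeafSimScorer removed, each scorer resolves the per-document norm itself and passes (freq, norm) to SimScorer. Below is a minimal sketch of that lookup, defaulting to 1L when the field or document has no norms; the helper class is illustrative, not part of the patch.

```java
import java.io.IOException;
import org.apache.lucene.index.NumericDocValues;

final class NormLookupSketch {
  private NormLookupSketch() {}

  /** Returns the norm for {@code docId}, or 1L when the field or document has no norms. */
  static long normOrDefault(NumericDocValues norms, int docId) throws IOException {
    long norm = 1L;
    if (norms != null && norms.advanceExact(docId)) {
      norm = norms.longValue();
    }
    return norm;
  }
}
```

Scores are then computed as simScorer.score(freq, norm) rather than the old docScorer.score(docID, freq).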
@@ -29,17 +31,19 @@ public final class TermScorer extends Scorer { private final PostingsEnum postingsEnum; private final DocIdSetIterator iterator; - private final LeafSimScorer docScorer; + private final SimScorer scorer; + private final NumericDocValues norms; private final ImpactsDISI impactsDisi; private final MaxScoreCache maxScoreCache; /** Construct a {@link TermScorer} that will iterate all documents. */ - public TermScorer(PostingsEnum postingsEnum, LeafSimScorer docScorer) { + public TermScorer(PostingsEnum postingsEnum, SimScorer scorer, NumericDocValues norms) { iterator = this.postingsEnum = postingsEnum; ImpactsEnum impactsEnum = new SlowImpactsEnum(postingsEnum); - maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer()); + maxScoreCache = new MaxScoreCache(impactsEnum, scorer); impactsDisi = null; - this.docScorer = docScorer; + this.scorer = scorer; + this.norms = norms; } /** @@ -47,12 +51,12 @@ public TermScorer(PostingsEnum postingsEnum, LeafSimScorer docScorer) { * documents. */ public TermScorer( - Weight weight, ImpactsEnum impactsEnum, - LeafSimScorer docScorer, + SimScorer scorer, + NumericDocValues norms, boolean topLevelScoringClause) { postingsEnum = impactsEnum; - maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer()); + maxScoreCache = new MaxScoreCache(impactsEnum, scorer); if (topLevelScoringClause) { impactsDisi = new ImpactsDISI(impactsEnum, maxScoreCache); iterator = impactsDisi; @@ -60,7 +64,8 @@ public TermScorer( impactsDisi = null; iterator = impactsEnum; } - this.docScorer = docScorer; + this.scorer = scorer; + this.norms = norms; } @Override @@ -80,13 +85,23 @@ public DocIdSetIterator iterator() { @Override public float score() throws IOException { - assert docID() != DocIdSetIterator.NO_MORE_DOCS; - return docScorer.score(postingsEnum.docID(), postingsEnum.freq()); + var postingsEnum = this.postingsEnum; + var norms = this.norms; + + long norm = 1L; + if (norms != null && norms.advanceExact(postingsEnum.docID())) { + norm = norms.longValue(); + } + return scorer.score(postingsEnum.freq(), norm); } @Override public float smoothingScore(int docId) throws IOException { - return docScorer.score(docId, 0); + long norm = 1L; + if (norms != null && norms.advanceExact(docId)) { + norm = norms.longValue(); + } + return scorer.score(0, norm); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java b/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java index 9f24ed23b5ec..aeca533e9ba5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java @@ -17,8 +17,6 @@ package org.apache.lucene.search; import java.util.Objects; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermsEnum; // javadocs import org.apache.lucene.util.BytesRef; /** @@ -45,27 +43,34 @@ *
    Be careful when performing calculations on these values because they are represented as 64-bit * integer values, you may need to cast to {@code double} for your use. * + * @param term Term bytes. + *
    This value is never {@code null}. + * @param docFreq number of documents containing the term in the collection, in the range [1 .. + * {@link #totalTermFreq()}]. + *
    This is the document-frequency for the term: the count of documents where the term appears + * at least one time. + *
    This value is always a positive number, and never exceeds {@link #totalTermFreq}. It also + * cannot exceed {@link CollectionStatistics#sumDocFreq()}. @see TermsEnum#docFreq() + * @param totalTermFreq number of occurrences of the term in the collection, in the range [{@link + * #docFreq()} .. {@link CollectionStatistics#sumTotalTermFreq()}]. + *
    This is the token count for the term: the number of times it appears in the field across + * all documents. + *
    This value is always a positive number, always at least {@link #docFreq()}, and never + * exceeds {@link CollectionStatistics#sumTotalTermFreq()}. @see TermsEnum#totalTermFreq() * @lucene.experimental */ // TODO: actually add missing cross-checks to guarantee TermStatistics is in bounds of // CollectionStatistics, // otherwise many similarity functions will implode. -public class TermStatistics { - private final BytesRef term; - private final long docFreq; - private final long totalTermFreq; - +public record TermStatistics(BytesRef term, long docFreq, long totalTermFreq) { /** * Creates statistics instance for a term. * - * @param term Term bytes - * @param docFreq number of documents containing the term in the collection. - * @param totalTermFreq number of occurrences of the term in the collection. * @throws NullPointerException if {@code term} is {@code null}. * @throws IllegalArgumentException if {@code docFreq} is negative or zero. * @throws IllegalArgumentException if {@code totalTermFreq} is less than {@code docFreq}. */ - public TermStatistics(BytesRef term, long docFreq, long totalTermFreq) { + public TermStatistics { Objects.requireNonNull(term); if (docFreq <= 0) { throw new IllegalArgumentException("docFreq must be positive, docFreq: " + docFreq); @@ -81,66 +86,5 @@ public TermStatistics(BytesRef term, long docFreq, long totalTermFreq) { + ", docFreq: " + docFreq); } - this.term = term; - this.docFreq = docFreq; - this.totalTermFreq = totalTermFreq; - } - - /** - * The term text. - * - *
    This value is never {@code null}. - * - * @return term's text, not {@code null} - */ - public final BytesRef term() { - return term; - } - - /** - * The number of documents this term occurs in. - * - *
    This is the document-frequency for the term: the count of documents where the term appears - * at least one time. - * - *
    This value is always a positive number, and never exceeds {@link #totalTermFreq}. It also - * cannot exceed {@link CollectionStatistics#sumDocFreq()}. - * - * @return document frequency, in the range [1 .. {@link #totalTermFreq()}] - * @see TermsEnum#docFreq() - */ - public final long docFreq() { - return docFreq; - } - - /** - * The total number of occurrences of this term. - * - *
    This is the token count for the term: the number of times it appears in the field across all - * documents. - * - *
    This value is always a positive number, always at least {@link #docFreq()}, and never - * exceeds {@link CollectionStatistics#sumTotalTermFreq()}. - * - * @return number of occurrences, in the range [{@link #docFreq()} .. {@link - * CollectionStatistics#sumTotalTermFreq()}] - * @see TermsEnum#totalTermFreq() - */ - public final long totalTermFreq() { - return totalTermFreq; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("term="); - sb.append('"'); - sb.append(Term.toString(term())); - sb.append('"'); - sb.append(",docFreq="); - sb.append(docFreq()); - sb.append(",totalTermFreq="); - sb.append(totalTermFreq()); - return sb.toString(); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java index aa26a72808c5..2a1f312fbc58 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java @@ -95,9 +95,9 @@ public TopDocs topDocs() { TotalHits.Relation relation = queryTimeout.shouldExit() ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO - : docs.totalHits.relation; + : docs.totalHits.relation(); - return new TopDocs(new TotalHits(docs.totalHits.value, relation), docs.scoreDocs); + return new TopDocs(new TotalHits(docs.totalHits.value(), relation), docs.scoreDocs); } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TopDocs.java b/lucene/core/src/java/org/apache/lucene/search/TopDocs.java index 08b25a505549..5a4c3bfafb41 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopDocs.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopDocs.java @@ -291,10 +291,10 @@ private static TopDocs mergeAux( final TopDocs shard = shardHits[shardIDX]; // totalHits can be non-zero even if no hits were // collected, when searchAfter was used: - totalHitCount += shard.totalHits.value; + totalHitCount += shard.totalHits.value(); // If any hit count is a lower bound then the merged // total hit count is a lower bound as well - if (shard.totalHits.relation == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO) { + if (shard.totalHits.relation() == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO) { totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; } if (shard.scoreDocs != null && shard.scoreDocs.length > 0) { diff --git a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java index 7caf34c95f26..384f5fa1168e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java @@ -24,7 +24,6 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.search.FieldValueHitQueue.Entry; -import org.apache.lucene.search.MaxScoreAccumulator.DocAndScore; import org.apache.lucene.search.TotalHits.Relation; /** @@ -54,7 +53,7 @@ private abstract class TopFieldLeafCollector implements LeafCollector { // as all segments are sorted in the same way, enough to check only the 1st segment for // indexSort if (searchSortPartOfIndexSort == null) { - final Sort indexSort = context.reader().getMetaData().getSort(); + final Sort indexSort = context.reader().getMetaData().sort(); searchSortPartOfIndexSort = canEarlyTerminate(sort, indexSort); if (searchSortPartOfIndexSort) { 
firstComparator.disableSkipping(); @@ -72,15 +71,14 @@ private abstract class TopFieldLeafCollector implements LeafCollector { } void countHit(int doc) throws IOException { - ++totalHits; - hitsThresholdChecker.incrementHitCount(); + int hitCountSoFar = ++totalHits; - if (minScoreAcc != null && (totalHits & minScoreAcc.modInterval) == 0) { + if (minScoreAcc != null && (hitCountSoFar & minScoreAcc.modInterval) == 0) { updateGlobalMinCompetitiveScore(scorer); } if (scoreMode.isExhaustive() == false && totalHitsRelation == TotalHits.Relation.EQUAL_TO - && hitsThresholdChecker.isThresholdReached()) { + && totalHits > totalHitsThreshold) { // for the first time hitsThreshold is reached, notify comparator about this comparator.setHitsThresholdReached(); totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; @@ -93,7 +91,7 @@ boolean thresholdCheck(int doc) throws IOException { // this document is larger than anything else in the queue, and // therefore not competitive. if (searchSortPartOfIndexSort) { - if (hitsThresholdChecker.isThresholdReached()) { + if (totalHits > totalHitsThreshold) { totalHitsRelation = Relation.GREATER_THAN_OR_EQUAL_TO; throw new CollectionTerminatedException(); } else { @@ -181,9 +179,9 @@ public SimpleFieldCollector( Sort sort, FieldValueHitQueue queue, int numHits, - HitsThresholdChecker hitsThresholdChecker, + int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) { - super(queue, numHits, hitsThresholdChecker, sort.needsScores(), minScoreAcc); + super(queue, numHits, totalHitsThreshold, sort.needsScores(), minScoreAcc); this.sort = sort; this.queue = queue; } @@ -236,9 +234,9 @@ public PagingFieldCollector( FieldValueHitQueue queue, FieldDoc after, int numHits, - HitsThresholdChecker hitsThresholdChecker, + int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) { - super(queue, numHits, hitsThresholdChecker, sort.needsScores(), minScoreAcc); + super(queue, numHits, totalHitsThreshold, sort.needsScores(), minScoreAcc); this.sort = sort; this.queue = queue; this.after = after; @@ -302,7 +300,7 @@ public void collect(int doc) throws IOException { private static final ScoreDoc[] EMPTY_SCOREDOCS = new ScoreDoc[0]; final int numHits; - final HitsThresholdChecker hitsThresholdChecker; + final int totalHitsThreshold; final FieldComparator firstComparator; final boolean canSetMinScore; @@ -328,25 +326,25 @@ public void collect(int doc) throws IOException { private TopFieldCollector( FieldValueHitQueue pq, int numHits, - HitsThresholdChecker hitsThresholdChecker, + int totalHitsThreshold, boolean needsScores, MaxScoreAccumulator minScoreAcc) { super(pq); this.needsScores = needsScores; this.numHits = numHits; - this.hitsThresholdChecker = hitsThresholdChecker; + this.totalHitsThreshold = Math.max(totalHitsThreshold, numHits); this.numComparators = pq.getComparators().length; this.firstComparator = pq.getComparators()[0]; int reverseMul = pq.reverseMul[0]; if (firstComparator.getClass().equals(FieldComparator.RelevanceComparator.class) && reverseMul == 1 // if the natural sort is preserved (sort by descending relevance) - && hitsThresholdChecker.getHitsThreshold() != Integer.MAX_VALUE) { + && totalHitsThreshold != Integer.MAX_VALUE) { scoreMode = ScoreMode.TOP_SCORES; canSetMinScore = true; } else { canSetMinScore = false; - if (hitsThresholdChecker.getHitsThreshold() != Integer.MAX_VALUE) { + if (totalHitsThreshold != Integer.MAX_VALUE) { scoreMode = needsScores ? ScoreMode.TOP_DOCS_WITH_SCORES : ScoreMode.TOP_DOCS; } else { scoreMode = needsScores ? 
ScoreMode.COMPLETE : ScoreMode.COMPLETE_NO_SCORES; @@ -362,21 +360,23 @@ public ScoreMode scoreMode() { protected void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOException { assert minScoreAcc != null; - if (canSetMinScore && hitsThresholdChecker.isThresholdReached()) { - // we can start checking the global maximum score even - // if the local queue is not full because the threshold - // is reached. - DocAndScore maxMinScore = minScoreAcc.get(); - if (maxMinScore != null && maxMinScore.score > minCompetitiveScore) { - scorer.setMinCompetitiveScore(maxMinScore.score); - minCompetitiveScore = maxMinScore.score; + if (canSetMinScore) { + // we can start checking the global maximum score even if the local queue is not full or if + // the threshold is not reached on the local competitor: the fact that there is a shared min + // competitive score implies that one of the collectors hit its totalHitsThreshold already + long maxMinScore = minScoreAcc.getRaw(); + float score; + if (maxMinScore != Long.MIN_VALUE + && (score = MaxScoreAccumulator.toScore(maxMinScore)) > minCompetitiveScore) { + scorer.setMinCompetitiveScore(score); + minCompetitiveScore = score; totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; } } } protected void updateMinCompetitiveScore(Scorable scorer) throws IOException { - if (canSetMinScore && queueFull && hitsThresholdChecker.isThresholdReached()) { + if (canSetMinScore && queueFull && totalHits > totalHitsThreshold) { assert bottom != null; float minScore = (float) firstComparator.value(bottom.slot); if (minScore > minCompetitiveScore) { diff --git a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollectorManager.java index d09a589be975..1f37a82864eb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollectorManager.java @@ -32,11 +32,9 @@ public class TopFieldCollectorManager implements CollectorManager collectors; - private final boolean supportsConcurrency; - private boolean collectorCreated; /** * Creates a new {@link TopFieldCollectorManager} from the given arguments. @@ -53,9 +51,32 @@ public class TopFieldCollectorManager implements CollectorManagerNOTE: The instances returned by this method pre-allocate a full array of length + * numHits. + * + * @param sort the sort criteria (SortFields). + * @param numHits the number of results to collect. + * @param after the previous doc after which matching docs will be collected. + * @param totalHitsThreshold the number of docs to count accurately. If the query matches more + * than {@code totalHitsThreshold} hits then its hit count will be a lower bound. On the other + * hand if the query matches less than or exactly {@code totalHitsThreshold} hits then the hit + * count of the result will be accurate. {@link Integer#MAX_VALUE} may be used to make the hit + * count accurate, but this will also make query processing slower. + */ + public TopFieldCollectorManager(Sort sort, int numHits, FieldDoc after, int totalHitsThreshold) { if (totalHitsThreshold < 0) { throw new IllegalArgumentException( "totalHitsThreshold must be >= 0, got " + totalHitsThreshold); @@ -88,35 +109,11 @@ public TopFieldCollectorManager( this.sort = sort; this.numHits = numHits; this.after = after; - this.supportsConcurrency = supportsConcurrency; - this.hitsThresholdChecker = - supportsConcurrency - ? 
HitsThresholdChecker.createShared(Math.max(totalHitsThreshold, numHits)) - : HitsThresholdChecker.create(Math.max(totalHitsThreshold, numHits)); - this.minScoreAcc = supportsConcurrency ? new MaxScoreAccumulator() : null; + this.totalHitsThreshold = totalHitsThreshold; + this.minScoreAcc = totalHitsThreshold != Integer.MAX_VALUE ? new MaxScoreAccumulator() : null; this.collectors = new ArrayList<>(); } - /** - * Creates a new {@link TopFieldCollectorManager} from the given arguments, with thread-safe - * internal states. - * - *
    NOTE: The instances returned by this method pre-allocate a full array of length - * numHits. - * - * @param sort the sort criteria (SortFields). - * @param numHits the number of results to collect. - * @param after the previous doc after which matching docs will be collected. - * @param totalHitsThreshold the number of docs to count accurately. If the query matches more - * than {@code totalHitsThreshold} hits then its hit count will be a lower bound. On the other - * hand if the query matches less than or exactly {@code totalHitsThreshold} hits then the hit - * count of the result will be accurate. {@link Integer#MAX_VALUE} may be used to make the hit - * count accurate, but this will also make query processing slower. - */ - public TopFieldCollectorManager(Sort sort, int numHits, FieldDoc after, int totalHitsThreshold) { - this(sort, numHits, after, totalHitsThreshold, true); - } - /** * Creates a new {@link TopFieldCollectorManager} from the given arguments, with thread-safe * internal states. @@ -138,13 +135,6 @@ public TopFieldCollectorManager(Sort sort, int numHits, int totalHitsThreshold) @Override public TopFieldCollector newCollector() { - if (collectorCreated && supportsConcurrency == false) { - throw new IllegalStateException( - "This TopFieldCollectorManager was created without concurrency (supportsConcurrency=false), but multiple collectors are being created"); - } else { - collectorCreated = true; - } - FieldValueHitQueue queue = FieldValueHitQueue.create(sort.getSort(), numHits); @@ -159,7 +149,7 @@ public TopFieldCollector newCollector() { } collector = new TopFieldCollector.SimpleFieldCollector( - sort, queue, numHits, hitsThresholdChecker, minScoreAcc); + sort, queue, numHits, totalHitsThreshold, minScoreAcc); } else { if (after.fields == null) { throw new IllegalArgumentException( @@ -175,7 +165,7 @@ public TopFieldCollector newCollector() { } collector = new TopFieldCollector.PagingFieldCollector( - sort, queue, after, numHits, hitsThresholdChecker, minScoreAcc); + sort, queue, after, numHits, totalHitsThreshold, minScoreAcc); } collectors.add(collector); diff --git a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java index f74f087b7d5e..4e299cbb618f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java @@ -18,7 +18,6 @@ import java.io.IOException; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.MaxScoreAccumulator.DocAndScore; /** * A {@link Collector} implementation that collects the top-scoring hits, returning them as a {@link @@ -46,8 +45,8 @@ public void setScorer(Scorable scorer) throws IOException { static class SimpleTopScoreDocCollector extends TopScoreDocCollector { SimpleTopScoreDocCollector( - int numHits, HitsThresholdChecker hitsThresholdChecker, MaxScoreAccumulator minScoreAcc) { - super(numHits, hitsThresholdChecker, minScoreAcc); + int numHits, int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) { + super(numHits, totalHitsThreshold, minScoreAcc); } @Override @@ -71,18 +70,16 @@ public void setScorer(Scorable scorer) throws IOException { public void collect(int doc) throws IOException { float score = scorer.score(); - // This collector relies on the fact that scorers produce positive values: - assert score >= 0; // NOTE: false for NaN + int hitCountSoFar = ++totalHits; - totalHits++; - 
hitsThresholdChecker.incrementHitCount(); - - if (minScoreAcc != null && (totalHits & minScoreAcc.modInterval) == 0) { + if (minScoreAcc != null && (hitCountSoFar & minScoreAcc.modInterval) == 0) { updateGlobalMinCompetitiveScore(scorer); } if (score <= pqTop.score) { - if (totalHitsRelation == TotalHits.Relation.EQUAL_TO) { + // Note: for queries that match lots of hits, this is the common case: most hits are not + // competitive. + if (hitCountSoFar == totalHitsThreshold + 1) { // we just reached totalHitsThreshold, we can start setting the min // competitive score now updateMinCompetitiveScore(scorer); @@ -90,8 +87,12 @@ public void collect(int doc) throws IOException { // Since docs are returned in-order (i.e., increasing doc Id), a document // with equal score to pqTop.score cannot compete since HitQueue favors // documents with lower doc Ids. Therefore reject those docs too. - return; + } else { + collectCompetitiveHit(doc, score); } + } + + private void collectCompetitiveHit(int doc, float score) throws IOException { pqTop.doc = doc + docBase; pqTop.score = score; pqTop = pq.updateTop(); @@ -104,21 +105,24 @@ public void collect(int doc) throws IOException { static class PagingTopScoreDocCollector extends TopScoreDocCollector { private final ScoreDoc after; - private int collectedHits; PagingTopScoreDocCollector( - int numHits, - ScoreDoc after, - HitsThresholdChecker hitsThresholdChecker, - MaxScoreAccumulator minScoreAcc) { - super(numHits, hitsThresholdChecker, minScoreAcc); + int numHits, ScoreDoc after, int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) { + super(numHits, totalHitsThreshold, minScoreAcc); this.after = after; - this.collectedHits = 0; } @Override protected int topDocsSize() { - return collectedHits < pq.size() ? collectedHits : pq.size(); + // Note: this relies on sentinel values having Integer.MAX_VALUE as a doc ID. + int[] validTopHitCount = new int[1]; + pq.forEach( + scoreDoc -> { + if (scoreDoc.doc != Integer.MAX_VALUE) { + validTopHitCount[0]++; + } + }); + return validTopHitCount[0]; } @Override @@ -149,17 +153,14 @@ public void setScorer(Scorable scorer) throws IOException { public void collect(int doc) throws IOException { float score = scorer.score(); - // This collector relies on the fact that scorers produce positive values: - assert score >= 0; // NOTE: false for NaN + int hitCountSoFar = ++totalHits; - totalHits++; - hitsThresholdChecker.incrementHitCount(); - - if (minScoreAcc != null && (totalHits & minScoreAcc.modInterval) == 0) { + if (minScoreAcc != null && (hitCountSoFar & minScoreAcc.modInterval) == 0) { updateGlobalMinCompetitiveScore(scorer); } - if (score > after.score || (score == after.score && doc <= afterDoc)) { + float afterScore = after.score; + if (score > afterScore || (score == afterScore && doc <= afterDoc)) { // hit was collected on a previous page if (totalHitsRelation == TotalHits.Relation.EQUAL_TO) { // we just reached totalHitsThreshold, we can start setting the min @@ -170,8 +171,10 @@ public void collect(int doc) throws IOException { } if (score <= pqTop.score) { - if (totalHitsRelation == TotalHits.Relation.EQUAL_TO) { - // we just reached totalHitsThreshold, we can start setting the min + // Note: for queries that match lots of hits, this is the common case: most hits are not + // competitive. 
+ if (hitCountSoFar == totalHitsThreshold + 1) { + // we just exceeded totalHitsThreshold, we can start setting the min // competitive score now updateMinCompetitiveScore(scorer); } @@ -179,9 +182,12 @@ public void collect(int doc) throws IOException { // Since docs are returned in-order (i.e., increasing doc Id), a document // with equal score to pqTop.score cannot compete since HitQueue favors // documents with lower doc Ids. Therefore reject those docs too. - return; + } else { + collectCompetitiveHit(doc, score); } - collectedHits++; + } + + private void collectCompetitiveHit(int doc, float score) throws IOException { pqTop.doc = doc + docBase; pqTop.score = score; pqTop = pq.updateTop(); @@ -193,20 +199,18 @@ public void collect(int doc) throws IOException { int docBase; ScoreDoc pqTop; - final HitsThresholdChecker hitsThresholdChecker; + final int totalHitsThreshold; final MaxScoreAccumulator minScoreAcc; float minCompetitiveScore; // prevents instantiation - TopScoreDocCollector( - int numHits, HitsThresholdChecker hitsThresholdChecker, MaxScoreAccumulator minScoreAcc) { + TopScoreDocCollector(int numHits, int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) { super(new HitQueue(numHits, true)); - assert hitsThresholdChecker != null; // HitQueue implements getSentinelObject to return a ScoreDoc, so we know // that at this point top() is already initialized. pqTop = pq.top(); - this.hitsThresholdChecker = hitsThresholdChecker; + this.totalHitsThreshold = totalHitsThreshold; this.minScoreAcc = minScoreAcc; } @@ -221,20 +225,19 @@ protected TopDocs newTopDocs(ScoreDoc[] results, int start) { @Override public ScoreMode scoreMode() { - return hitsThresholdChecker.scoreMode(); + return totalHitsThreshold == Integer.MAX_VALUE ? ScoreMode.COMPLETE : ScoreMode.TOP_SCORES; } protected void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOException { assert minScoreAcc != null; - DocAndScore maxMinScore = minScoreAcc.get(); - if (maxMinScore != null) { + long maxMinScore = minScoreAcc.getRaw(); + if (maxMinScore != Long.MIN_VALUE) { // since we tie-break on doc id and collect in doc id order we can require // the next float if the global minimum score is set on a document id that is // smaller than the ids in the current leaf - float score = - docBase >= maxMinScore.docBase ? Math.nextUp(maxMinScore.score) : maxMinScore.score; + float score = MaxScoreAccumulator.toScore(maxMinScore); + score = docBase >= MaxScoreAccumulator.docId(maxMinScore) ? 
Math.nextUp(score) : score; if (score > minCompetitiveScore) { - assert hitsThresholdChecker.isThresholdReached(); scorer.setMinCompetitiveScore(score); minCompetitiveScore = score; totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; @@ -243,21 +246,22 @@ protected void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOExcepti } protected void updateMinCompetitiveScore(Scorable scorer) throws IOException { - if (hitsThresholdChecker.isThresholdReached() - && pqTop != null - && pqTop.score != Float.NEGATIVE_INFINITY) { // -Infinity is the score of sentinels + if (totalHits > totalHitsThreshold) { // since we tie-break on doc id and collect in doc id order, we can require // the next float + // pqTop is never null since TopScoreDocCollector fills the priority queue with sentinel + // values + // if the top element is a sentinel value, its score will be -Infty and the below logic is + // still valid float localMinScore = Math.nextUp(pqTop.score); if (localMinScore > minCompetitiveScore) { scorer.setMinCompetitiveScore(localMinScore); totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; minCompetitiveScore = localMinScore; if (minScoreAcc != null) { - // we don't use the next float but we register the document - // id so that other leaves can require it if they are after - // the current maximum - minScoreAcc.accumulate(docBase, pqTop.score); + // we don't use the next float but we register the document id so that other leaves or + // leaf partitions can require it if they are after the current maximum + minScoreAcc.accumulate(pqTop.doc, pqTop.score); } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollectorManager.java index 4e3181abdf7c..6a206088013b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollectorManager.java @@ -29,10 +29,8 @@ public class TopScoreDocCollectorManager implements CollectorManager { private final int numHits; private final ScoreDoc after; - private final HitsThresholdChecker hitsThresholdChecker; + private final int totalHitsThreshold; private final MaxScoreAccumulator minScoreAcc; - private final boolean supportsConcurrency; - private boolean collectorCreated; /** * Creates a new {@link TopScoreDocCollectorManager} given the number of hits to collect and the @@ -54,28 +52,13 @@ public class TopScoreDocCollectorManager * hand if the query matches less than or exactly {@code totalHitsThreshold} hits then the hit * count of the result will be accurate. {@link Integer#MAX_VALUE} may be used to make the hit * count accurate, but this will also make query processing slower. - * @param supportsConcurrency to use thread-safe and slower internal states for count tracking. + * @deprecated Use {@link #TopScoreDocCollectorManager(int, ScoreDoc, int)}, the + * supportsConcurrency parameter is now a no-op. 
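As context for the deprecation above, a minimal, hypothetical usage sketch of the surviving three-argument constructor; the method name firstPage and the searcher/query objects are assumptions, not part of this patch. The manager now decides on its own whether a shared MaxScoreAccumulator is needed, so callers no longer pass supportsConcurrency:

    // Collect the top 10 hits, counting hits accurately up to 1000.
    static TopDocs firstPage(IndexSearcher searcher, Query query) throws IOException {
      TopScoreDocCollectorManager manager = new TopScoreDocCollectorManager(10, null, 1000);
      return searcher.search(query, manager);
    }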
*/ + @Deprecated public TopScoreDocCollectorManager( int numHits, ScoreDoc after, int totalHitsThreshold, boolean supportsConcurrency) { - if (totalHitsThreshold < 0) { - throw new IllegalArgumentException( - "totalHitsThreshold must be >= 0, got " + totalHitsThreshold); - } - - if (numHits <= 0) { - throw new IllegalArgumentException( - "numHits must be > 0; please use TotalHitCountCollectorManager if you just need the total hit count"); - } - - this.numHits = numHits; - this.after = after; - this.supportsConcurrency = supportsConcurrency; - this.hitsThresholdChecker = - supportsConcurrency - ? HitsThresholdChecker.createShared(Math.max(totalHitsThreshold, numHits)) - : HitsThresholdChecker.create(Math.max(totalHitsThreshold, numHits)); - this.minScoreAcc = supportsConcurrency ? new MaxScoreAccumulator() : null; + this(numHits, after, totalHitsThreshold); } /** @@ -100,7 +83,20 @@ public TopScoreDocCollectorManager( * count accurate, but this will also make query processing slower. */ public TopScoreDocCollectorManager(int numHits, ScoreDoc after, int totalHitsThreshold) { - this(numHits, after, totalHitsThreshold, true); + if (totalHitsThreshold < 0) { + throw new IllegalArgumentException( + "totalHitsThreshold must be >= 0, got " + totalHitsThreshold); + } + + if (numHits <= 0) { + throw new IllegalArgumentException( + "numHits must be > 0; please use TotalHitCountCollectorManager if you just need the total hit count"); + } + + this.numHits = numHits; + this.after = after; + this.totalHitsThreshold = Math.max(totalHitsThreshold, numHits); + this.minScoreAcc = totalHitsThreshold != Integer.MAX_VALUE ? new MaxScoreAccumulator() : null; } /** @@ -129,19 +125,12 @@ public TopScoreDocCollectorManager(int numHits, int totalHitsThreshold) { @Override public TopScoreDocCollector newCollector() { - if (collectorCreated && supportsConcurrency == false) { - throw new IllegalStateException( - "This TopScoreDocCollectorManager was created without concurrency (supportsConcurrency=false), but multiple collectors are being created"); - } else { - collectorCreated = true; - } - if (after == null) { return new TopScoreDocCollector.SimpleTopScoreDocCollector( - numHits, hitsThresholdChecker, minScoreAcc); + numHits, totalHitsThreshold, minScoreAcc); } else { return new TopScoreDocCollector.PagingTopScoreDocCollector( - numHits, after, hitsThresholdChecker, minScoreAcc); + numHits, after, totalHitsThreshold, minScoreAcc); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java index fe95a2fb3e31..491f1aaf57ac 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java @@ -17,7 +17,6 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.Comparator; import java.util.HashMap; import java.util.Map; import java.util.PriorityQueue; @@ -160,7 +159,7 @@ public boolean collect(BytesRef bytes) throws IOException { final B b = getTopLevelBuilder(); final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]); - ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp); + ArrayUtil.timSort(scoreTerms, (st1, st2) -> st1.bytes.get().compareTo(st2.bytes.get())); for (final ScoreTerm st : scoreTerms) { final Term term = new Term(query.field, st.bytes.toBytesRef()); @@ -188,14 +187,6 @@ public boolean equals(Object obj) { return true; } - private static final Comparator scoreTermSortByTermComp 
= - new Comparator() { - @Override - public int compare(ScoreTerm st1, ScoreTerm st2) { - return st1.bytes.get().compareTo(st2.bytes.get()); - } - }; - static final class ScoreTerm implements Comparable { public final BytesRefBuilder bytes = new BytesRefBuilder(); public float boost; diff --git a/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollector.java b/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollector.java index 0dcf7af5d01f..55182dd1360a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollector.java @@ -50,13 +50,17 @@ public LeafCollector getLeafCollector(LeafReaderContext context) throws IOExcept totalHits += leafCount; throw new CollectionTerminatedException(); } + return createLeafCollector(); + } + + protected final LeafCollector createLeafCollector() { return new LeafCollector() { @Override - public void setScorer(Scorable scorer) throws IOException {} + public void setScorer(Scorable scorer) {} @Override - public void collect(int doc) throws IOException { + public void collect(int doc) { totalHits++; } diff --git a/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollectorManager.java index 664602a4e5e4..ac77fc96595e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollectorManager.java @@ -18,6 +18,13 @@ import java.io.IOException; import java.util.Collection; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.util.ThreadInterruptedException; /** * Collector manager based on {@link TotalHitCountCollector} that allows users to parallelize @@ -28,17 +35,112 @@ */ public class TotalHitCountCollectorManager implements CollectorManager { + + private final boolean hasSegmentPartitions; + + /** + * Creates a new total hit count collector manager, providing the array of leaf slices that search + * targets, which can be retrieved via {@link IndexSearcher#getSlices()} for the searcher. + * + * @param leafSlices the slices that the searcher targets. Used to optimize the collection + * depending on whether segments have been partitioned into partitions or not. + */ + public TotalHitCountCollectorManager(IndexSearcher.LeafSlice[] leafSlices) { + this.hasSegmentPartitions = hasSegmentPartitions(leafSlices); + } + + private static boolean hasSegmentPartitions(IndexSearcher.LeafSlice[] leafSlices) { + for (IndexSearcher.LeafSlice leafSlice : leafSlices) { + for (IndexSearcher.LeafReaderContextPartition leafPartition : leafSlice.partitions) { + if (leafPartition.minDocId > 0 + || leafPartition.maxDocId < leafPartition.ctx.reader().maxDoc()) { + return true; + } + } + } + return false; + } + + /** + * Internal state shared across the different collectors that this collector manager creates. This + * is necessary to support intra-segment concurrency. 
We track leaves seen as an argument of + * {@link Collector#getLeafCollector(LeafReaderContext)} calls, to ensure correctness: if the + * first partition of a segment early terminates, count has been already retrieved for the entire + * segment hence subsequent partitions of the same segment should also early terminate without + * further incrementing hit count. If the first partition of a segment computes hit counts, + * subsequent partitions of the same segment should do the same, to prevent their counts from + * being retrieved from {@link LRUQueryCache} (which returns counts for the entire segment while + * we'd need only that of the current leaf partition). + */ + private final Map> earlyTerminatedMap = new ConcurrentHashMap<>(); + @Override public TotalHitCountCollector newCollector() throws IOException { + if (hasSegmentPartitions) { + return new LeafPartitionAwareTotalHitCountCollector(earlyTerminatedMap); + } return new TotalHitCountCollector(); } @Override public Integer reduce(Collection collectors) throws IOException { + // Make the same collector manager instance reusable across multiple searches. It isn't a strict + // requirement but it is generally supported as collector managers normally don't hold state, as + // opposed to collectors. + assert hasSegmentPartitions || earlyTerminatedMap.isEmpty(); + if (hasSegmentPartitions) { + earlyTerminatedMap.clear(); + } int totalHits = 0; for (TotalHitCountCollector collector : collectors) { totalHits += collector.getTotalHits(); } return totalHits; } + + private static class LeafPartitionAwareTotalHitCountCollector extends TotalHitCountCollector { + private final Map> earlyTerminatedMap; + + LeafPartitionAwareTotalHitCountCollector(Map> earlyTerminatedMap) { + this.earlyTerminatedMap = earlyTerminatedMap; + } + + @Override + public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { + Future earlyTerminated = earlyTerminatedMap.get(context.id()); + if (earlyTerminated == null) { + CompletableFuture firstEarlyTerminated = new CompletableFuture<>(); + Future previousEarlyTerminated = + earlyTerminatedMap.putIfAbsent(context.id(), firstEarlyTerminated); + if (previousEarlyTerminated == null) { + // first thread for a given leaf gets to decide what the next threads targeting the same + // leaf do + try { + LeafCollector leafCollector = super.getLeafCollector(context); + firstEarlyTerminated.complete(false); + return leafCollector; + } catch (CollectionTerminatedException e) { + firstEarlyTerminated.complete(true); + throw e; + } + } + earlyTerminated = previousEarlyTerminated; + } + + try { + if (earlyTerminated.get()) { + // first partition of the same leaf early terminated, do the same for subsequent ones + throw new CollectionTerminatedException(); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new ThreadInterruptedException(e); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } + + // first partition of the same leaf computed hit counts, do the same for subsequent ones + return createLeafCollector(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TotalHits.java b/lucene/core/src/java/org/apache/lucene/search/TotalHits.java index 8d3e9f1ed852..7d82654b9e30 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TotalHits.java +++ b/lucene/core/src/java/org/apache/lucene/search/TotalHits.java @@ -24,8 +24,14 @@ * documents. 
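A minimal counting sketch against the TotalHitCountCollectorManager constructor shown above; the countHits helper and the searcher/query objects are assumptions. The slices come straight from the searcher, which is what lets the manager detect intra-segment partitions:

    static int countHits(IndexSearcher searcher, Query query) throws IOException {
      TotalHitCountCollectorManager manager =
          new TotalHitCountCollectorManager(searcher.getSlices());
      return searcher.search(query, manager); // Integer result, unboxed to int
    }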
Given that it is often enough to have a lower bounds of the number of hits, such as * "there are more than 1000 hits", Lucene has options to stop counting as soon as a threshold has * been reached in order to improve query times. + * + * @param value The value of the total hit count. Must be interpreted in the context of {@link + * #relation}. + * @param relation Whether {@link #value} is the exact hit count, in which case {@link #relation} is + * equal to {@link Relation#EQUAL_TO}, or a lower bound of the total hit count, in which case + * {@link #relation} is equal to {@link Relation#GREATER_THAN_OR_EQUAL_TO}. */ -public final class TotalHits { +public record TotalHits(long value, Relation relation) { /** How the {@link TotalHits#value} should be interpreted. */ public enum Relation { @@ -35,40 +41,12 @@ public enum Relation { GREATER_THAN_OR_EQUAL_TO } - /** The value of the total hit count. Must be interpreted in the context of {@link #relation}. */ - public final long value; - - /** - * Whether {@link #value} is the exact hit count, in which case {@link #relation} is equal to - * {@link Relation#EQUAL_TO}, or a lower bound of the total hit count, in which case {@link - * #relation} is equal to {@link Relation#GREATER_THAN_OR_EQUAL_TO}. - */ - public final Relation relation; - /** Sole constructor. */ - public TotalHits(long value, Relation relation) { + public TotalHits { if (value < 0) { throw new IllegalArgumentException("value must be >= 0, got " + value); } - this.value = value; - this.relation = Objects.requireNonNull(relation); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - TotalHits totalHits = (TotalHits) o; - return value == totalHits.value && relation == totalHits.relation; - } - - @Override - public int hashCode() { - return Objects.hash(value, relation); + Objects.requireNonNull(relation); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java index fe5e8803b4cb..f910db30a26b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java @@ -229,7 +229,11 @@ private boolean ensureConsistent() { } for (DisiWrapper w : head) { - assert w.doc > doc; + if (lead == null) { // After calling advance() but before matches() + assert w.doc >= doc; + } else { + assert w.doc > doc; + } } return true; @@ -284,20 +288,21 @@ public int advance(int target) throws IOException { // Move 'lead' iterators back to the tail pushBackLeads(target); - // Advance 'head' as well - advanceHead(target); - - // Pop the new 'lead' from 'head' - moveToNextCandidate(target); + // Make sure `head` is also on or beyond `target` + DisiWrapper headTop = advanceHead(target); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - return DocIdSetIterator.NO_MORE_DOCS; + if (scoreMode == ScoreMode.TOP_SCORES && (headTop == null || headTop.doc > upTo)) { + // Update score bounds if necessary + moveToNextBlock(target); + assert upTo >= target; + headTop = head.top(); } - assert ensureConsistent(); - - // Advance to the next possible match - return doNextCompetitiveCandidate(); + if (headTop == null) { + return doc = DocIdSetIterator.NO_MORE_DOCS; + } else { + return doc = headTop.doc; + } } @Override @@ -309,6 +314,9 @@ public long cost() { @Override public boolean matches() throws IOException { + assert lead == null; + 
moveToNextCandidate(); + while (leadMaxScore < minCompetitiveScore || freq < minShouldMatch) { if (leadMaxScore + tailMaxScore < minCompetitiveScore || freq + tailSize < minShouldMatch) { @@ -353,7 +361,7 @@ private void pushBackLeads(int target) throws IOException { } /** Make sure all disis in 'head' are on or after 'target'. */ - private void advanceHead(int target) throws IOException { + private DisiWrapper advanceHead(int target) throws IOException { DisiWrapper headTop = head.top(); while (headTop != null && headTop.doc < target) { final DisiWrapper evicted = insertTailWithOverFlow(headTop); @@ -365,6 +373,7 @@ private void advanceHead(int target) throws IOException { headTop = head.top(); } } + return headTop; } private void advanceTail(DisiWrapper disi) throws IOException { @@ -429,7 +438,7 @@ private void updateMaxScores(int target) throws IOException { * Update {@code upTo} and maximum scores of sub scorers so that {@code upTo} is greater than or * equal to the next candidate after {@code target}, i.e. the top of `head`. */ - private void updateMaxScoresIfNecessary(int target) throws IOException { + private void moveToNextBlock(int target) throws IOException { assert lead == null; while (upTo < DocIdSetIterator.NO_MORE_DOCS) { @@ -460,48 +469,19 @@ private void updateMaxScoresIfNecessary(int target) throws IOException { * Set 'doc' to the next potential match, and move all disis of 'head' that are on this doc into * 'lead'. */ - private void moveToNextCandidate(int target) throws IOException { - if (scoreMode == ScoreMode.TOP_SCORES) { - // Update score bounds if necessary so - updateMaxScoresIfNecessary(target); - assert upTo >= target; - - // updateMaxScores tries to move forward until a block with matches is found - // so if the head is empty it means there are no matches at all anymore - if (head.size() == 0) { - assert upTo == DocIdSetIterator.NO_MORE_DOCS; - doc = DocIdSetIterator.NO_MORE_DOCS; - return; - } - } - + private void moveToNextCandidate() throws IOException { // The top of `head` defines the next potential match // pop all documents which are on this doc lead = head.pop(); + assert doc == lead.doc; lead.next = null; leadMaxScore = lead.scaledMaxScore; freq = 1; - doc = lead.doc; while (head.size() > 0 && head.top().doc == doc) { addLead(head.pop()); } } - /** Move iterators to the tail until there is a potential match. */ - private int doNextCompetitiveCandidate() throws IOException { - while (leadMaxScore + tailMaxScore < minCompetitiveScore || freq + tailSize < minShouldMatch) { - // no match on doc is possible, move to the next potential match - pushBackLeads(doc + 1); - moveToNextCandidate(doc + 1); - assert ensureConsistent(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - break; - } - } - - return doc; - } - /** Advance all entries from the tail to know about all matches on the current doc. */ private void advanceAllTail() throws IOException { // we return the next doc when the sum of the scores of the potential diff --git a/lucene/core/src/java/org/apache/lucene/search/Weight.java b/lucene/core/src/java/org/apache/lucene/search/Weight.java index 5c0e1f45eb41..10d41e4a3fee 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Weight.java +++ b/lucene/core/src/java/org/apache/lucene/search/Weight.java @@ -113,7 +113,8 @@ public final Query getQuery() { * Optional method that delegates to scorerSupplier. * *

<p>
    Returns a {@link Scorer} which can iterate in order over all matching documents and assign - * them a score. + * them a score. A scorer for the same {@link LeafReaderContext} instance may be requested + * multiple times as part of a single search call. * *

<p>
    NOTE: null can be returned if no documents will be scored by this query. * @@ -135,7 +136,8 @@ public final Scorer scorer(LeafReaderContext context) throws IOException { /** * Get a {@link ScorerSupplier}, which allows knowing the cost of the {@link Scorer} before - * building it. + * building it. A scorer supplier for the same {@link LeafReaderContext} instance may be requested + * multiple times as part of a single search call. * *

<p>
    Note: It must return null if the scorer is null. * @@ -161,6 +163,9 @@ public final Scorer scorer(LeafReaderContext context) throws IOException { * scorerSupplier.setTopLevelScoringClause(); * return scorerSupplier.bulkScorer(); * + * + * A bulk scorer for the same {@link LeafReaderContext} instance may be requested multiple times + * as part of a single search call. */ public final BulkScorer bulkScorer(LeafReaderContext context) throws IOException { ScorerSupplier scorerSupplier = scorerSupplier(context); diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java index 3d1a84ee645e..2bd594cc731c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java @@ -18,9 +18,6 @@ package org.apache.lucene.search.comparators; import java.io.IOException; -import java.util.ArrayDeque; -import java.util.Deque; -import java.util.function.Consumer; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.LeafReaderContext; @@ -32,12 +29,7 @@ import org.apache.lucene.search.Pruning; import org.apache.lucene.search.Scorable; import org.apache.lucene.search.Scorer; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.IntArrayDocIdSet; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.LSBRadixSorter; -import org.apache.lucene.util.PriorityQueue; -import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.DocIdSetBuilder; /** * Abstract numeric comparator for comparing numeric values. This comparator provides a skipping @@ -50,6 +42,9 @@ */ public abstract class NumericComparator extends FieldComparator { + // MIN_SKIP_INTERVAL and MAX_SKIP_INTERVAL both should be powers of 2 + private static final int MIN_SKIP_INTERVAL = 32; + private static final int MAX_SKIP_INTERVAL = 8192; protected final T missingValue; private final long missingValueAsLong; protected final String field; @@ -97,10 +92,11 @@ public void disableSkipping() { /** Leaf comparator for {@link NumericComparator} that provides skipping functionality */ public abstract class NumericLeafComparator implements LeafFieldComparator { - private static final long MAX_DISJUNCTION_CLAUSE = 128; private final LeafReaderContext context; protected final NumericDocValues docValues; private final PointValues pointValues; + // lazily constructed to avoid performance overhead when this is not used + private PointValues.PointTree pointTree; // if skipping functionality should be enabled on this segment private final boolean enableSkipping; private final int maxDoc; @@ -110,11 +106,14 @@ public abstract class NumericLeafComparator implements LeafFieldComparator { private long minValueAsLong = Long.MIN_VALUE; private long maxValueAsLong = Long.MAX_VALUE; - private Long thresholdAsLong; private DocIdSetIterator competitiveIterator; - private long leadCost = -1; + private long iteratorCost = -1; private int maxDocVisited = -1; + private int updateCounter = 0; + private int currentSkipInterval = MIN_SKIP_INTERVAL; + // helps to be conservative about increasing the sampling interval + private int tryUpdateFailCount = 0; public NumericLeafComparator(LeafReaderContext context) throws IOException { this.context = context; @@ -183,12 +182,12 @@ public void copy(int slot, int doc) throws IOException { @Override 
public void setScorer(Scorable scorer) throws IOException { - if (leadCost == -1) { + if (iteratorCost == -1) { if (scorer instanceof Scorer) { - leadCost = + iteratorCost = ((Scorer) scorer).iterator().cost(); // starting iterator cost is the scorer's cost } else { - leadCost = maxDoc; + iteratorCost = maxDoc; } updateCompetitiveIterator(); // update an iterator when we have a new segment } @@ -207,91 +206,110 @@ private void updateCompetitiveIterator() throws IOException { || hitsThresholdReached == false || (leafTopSet == false && queueFull == false)) return; // if some documents have missing points, check that missing values prohibits optimization - boolean dense = pointValues.getDocCount() == maxDoc; - if (dense == false && isMissingValueCompetitive()) { + if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) { return; // we can't filter out documents, as documents with missing values are competitive } - if (competitiveIterator instanceof CompetitiveIterator iter) { - if (queueFull) { - encodeBottom(); - } - // CompetitiveIterator already built, try to reduce clause. - tryReduceDisjunctionClause(iter); + updateCounter++; + // Start sampling if we get called too much + if (updateCounter > 256 + && (updateCounter & (currentSkipInterval - 1)) != currentSkipInterval - 1) { return; } - if (thresholdAsLong == null) { - if (dense == false) { - competitiveIterator = getNumericDocValues(context, field); - leadCost = Math.min(leadCost, competitiveIterator.cost()); - } - long threshold = Math.min(leadCost >> 3, maxDoc >> 5); - thresholdAsLong = intersectThresholdValue(threshold); + if (queueFull) { + encodeBottom(); } - if ((reverse == false && bottomAsComparableLong() <= thresholdAsLong) - || (reverse && bottomAsComparableLong() >= thresholdAsLong)) { - if (queueFull) { - encodeBottom(); - } - DisjunctionBuildVisitor visitor = new DisjunctionBuildVisitor(); - competitiveIterator = visitor.generateCompetitiveIterator(); - } - } + DocIdSetBuilder result = new DocIdSetBuilder(maxDoc); + PointValues.IntersectVisitor visitor = + new PointValues.IntersectVisitor() { + DocIdSetBuilder.BulkAdder adder; - private void tryReduceDisjunctionClause(CompetitiveIterator iter) { - int originalSize = iter.disis.size(); + @Override + public void grow(int count) { + adder = result.grow(count); + } - while (iter.disis.isEmpty() == false - && (iter.disis.getFirst().mostCompetitiveValue > maxValueAsLong - || iter.disis.getFirst().mostCompetitiveValue < minValueAsLong)) { - iter.disis.removeFirst(); - } + @Override + public void visit(int docID) { + if (docID <= maxDocVisited) { + return; // Already visited or skipped + } + adder.add(docID); + } + + @Override + public void visit(int docID, byte[] packedValue) { + if (docID <= maxDocVisited) { + return; // already visited or skipped + } + long l = sortableBytesToLong(packedValue); + if (l >= minValueAsLong && l <= maxValueAsLong) { + adder.add(docID); // doc is competitive + } + } + + @Override + public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + long min = sortableBytesToLong(minPackedValue); + long max = sortableBytesToLong(maxPackedValue); + + if (min > maxValueAsLong || max < minValueAsLong) { + // 1. cmp ==0 and pruning==Pruning.GREATER_THAN_OR_EQUAL_TO : if the sort is + // ascending then maxValueAsLong is bottom's next less value, so it is competitive + // 2. 
cmp ==0 and pruning==Pruning.GREATER_THAN: maxValueAsLong equals to + // bottom, but there are multiple comparators, so it could be competitive + return PointValues.Relation.CELL_OUTSIDE_QUERY; + } - if (originalSize != iter.disis.size()) { - iter.disjunction.clear(); - iter.disjunction.addAll(iter.disis); + if (min < minValueAsLong || max > maxValueAsLong) { + return PointValues.Relation.CELL_CROSSES_QUERY; + } + return PointValues.Relation.CELL_INSIDE_QUERY; + } + }; + + final long threshold = iteratorCost >>> 3; + + if (PointValues.isEstimatedPointCountGreaterThanOrEqualTo( + visitor, getPointTree(), threshold)) { + // the new range is not selective enough to be worth materializing, it doesn't reduce number + // of docs at least 8x + updateSkipInterval(false); + if (pointValues.getDocCount() < iteratorCost) { + // Use the set of doc with values to help drive iteration + competitiveIterator = getNumericDocValues(context, field); + iteratorCost = pointValues.getDocCount(); + } + return; } + pointValues.intersect(visitor); + competitiveIterator = result.build().iterator(); + iteratorCost = competitiveIterator.cost(); + updateSkipInterval(true); } - /** Find out the value that threshold docs away from topValue/infinite. */ - private long intersectThresholdValue(long threshold) throws IOException { - long thresholdValuePos; - if (leafTopSet) { - long topValue = topAsComparableLong(); - PointValues.IntersectVisitor visitor = new RangeVisitor(Long.MIN_VALUE, topValue, -1); - long topValuePos = pointValues.estimatePointCount(visitor); - thresholdValuePos = reverse == false ? topValuePos + threshold : topValuePos - threshold; - } else { - thresholdValuePos = reverse == false ? threshold : pointValues.size() - threshold; - } - if (thresholdValuePos <= 0) { - return sortableBytesToLong(pointValues.getMinPackedValue()); - } else if (thresholdValuePos >= pointValues.size()) { - return sortableBytesToLong(pointValues.getMaxPackedValue()); - } else { - return intersectValueByPos(pointValues.getPointTree(), thresholdValuePos); + private PointValues.PointTree getPointTree() throws IOException { + if (pointTree == null) { + pointTree = pointValues.getPointTree(); } + return pointTree; } - /** Get the point value by a left-to-right position. */ - private long intersectValueByPos(PointValues.PointTree pointTree, long pos) throws IOException { - assert pos > 0 : pos; - while (pointTree.size() < pos) { - pos -= pointTree.size(); - pointTree.moveToSibling(); - } - if (pointTree.size() == pos) { - return sortableBytesToLong(pointTree.getMaxPackedValue()); - } else if (pos == 0) { - return sortableBytesToLong(pointTree.getMinPackedValue()); - } else if (pointTree.moveToChild()) { - return intersectValueByPos(pointTree, pos); - } else { - return reverse == false - ? 
sortableBytesToLong(pointTree.getMaxPackedValue()) - : sortableBytesToLong(pointTree.getMinPackedValue()); + private void updateSkipInterval(boolean success) { + if (updateCounter > 256) { + if (success) { + currentSkipInterval = Math.max(currentSkipInterval / 2, MIN_SKIP_INTERVAL); + tryUpdateFailCount = 0; + } else { + if (tryUpdateFailCount >= 3) { + currentSkipInterval = Math.min(currentSkipInterval * 2, MAX_SKIP_INTERVAL); + tryUpdateFailCount = 0; + } else { + tryUpdateFailCount++; + } + } } } @@ -342,25 +360,31 @@ private void encodeTop() { } private boolean isMissingValueCompetitive() { - // if queue is full, always compare with bottom, - // if not, check if we can compare with topValue + // if queue is full, compare with bottom first, + // if competitive, then check if we can compare with topValue if (queueFull) { int result = Long.compare(missingValueAsLong, bottomAsComparableLong()); // in reverse (desc) sort missingValue is competitive when it's greater or equal to bottom, // in asc sort missingValue is competitive when it's smaller or equal to bottom - return reverse - ? (pruning == Pruning.GREATER_THAN_OR_EQUAL_TO ? result > 0 : result >= 0) - : (pruning == Pruning.GREATER_THAN_OR_EQUAL_TO ? result < 0 : result <= 0); - } else if (leafTopSet) { + final boolean competitive = + reverse + ? (pruning == Pruning.GREATER_THAN_OR_EQUAL_TO ? result > 0 : result >= 0) + : (pruning == Pruning.GREATER_THAN_OR_EQUAL_TO ? result < 0 : result <= 0); + if (competitive == false) { + return false; + } + } + + if (leafTopSet) { int result = Long.compare(missingValueAsLong, topAsComparableLong()); // in reverse (desc) sort missingValue is competitive when it's smaller or equal to // topValue, // in asc sort missingValue is competitive when it's greater or equal to topValue return reverse ? (result <= 0) : (result >= 0); - } else { - // by default competitive - return true; } + + // by default competitive + return true; } @Override @@ -394,276 +418,5 @@ public int advance(int target) throws IOException { protected abstract long bottomAsComparableLong(); protected abstract long topAsComparableLong(); - - class DisjunctionBuildVisitor extends RangeVisitor { - - final Deque disis = new ArrayDeque<>(); - // most competitive entry stored last. - final Consumer adder = - reverse == false ? 
disis::addFirst : disis::addLast; - - final int minBlockLength = minBlockLength(); - - final LSBRadixSorter sorter = new LSBRadixSorter(); - int[] docs = IntsRef.EMPTY_INTS; - int index = 0; - int blockMaxDoc = -1; - boolean docsInOrder = true; - long blockMinValue = Long.MAX_VALUE; - long blockMaxValue = Long.MIN_VALUE; - - private DisjunctionBuildVisitor() { - super(minValueAsLong, maxValueAsLong, maxDocVisited); - } - - @Override - public void grow(int count) { - docs = ArrayUtil.grow(docs, index + count + 1); - } - - @Override - protected void consumeDoc(int doc) { - docs[index++] = doc; - if (doc >= blockMaxDoc) { - blockMaxDoc = doc; - } else { - docsInOrder = false; - } - } - - void intersectLeaves(PointValues.PointTree pointTree) throws IOException { - PointValues.Relation r = - compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue()); - switch (r) { - case CELL_INSIDE_QUERY, CELL_CROSSES_QUERY -> { - if (pointTree.moveToChild()) { - do { - intersectLeaves(pointTree); - } while (pointTree.moveToSibling()); - pointTree.moveToParent(); - } else { - if (r == PointValues.Relation.CELL_CROSSES_QUERY) { - pointTree.visitDocValues(this); - } else { - pointTree.visitDocIDs(this); - } - updateMinMax( - sortableBytesToLong(pointTree.getMinPackedValue()), - sortableBytesToLong(pointTree.getMaxPackedValue())); - } - } - case CELL_OUTSIDE_QUERY -> {} - default -> throw new IllegalStateException("unreachable code"); - } - } - - void updateMinMax(long leafMinValue, long leafMaxValue) throws IOException { - this.blockMinValue = Math.min(blockMinValue, leafMinValue); - this.blockMaxValue = Math.max(blockMaxValue, leafMaxValue); - if (index >= minBlockLength) { - update(); - this.blockMinValue = Long.MAX_VALUE; - this.blockMaxValue = Long.MIN_VALUE; - } - } - - void update() throws IOException { - if (blockMinValue > blockMaxValue) { - return; - } - long mostCompetitiveValue = - reverse == false - ? Math.max(blockMinValue, minValueAsLong) - : Math.min(blockMaxValue, maxValueAsLong); - - if (docsInOrder == false) { - sorter.sort(PackedInts.bitsRequired(blockMaxDoc), docs, index); - } - docs[index] = DocIdSetIterator.NO_MORE_DOCS; - DocIdSetIterator iter = new IntArrayDocIdSet(docs, index).iterator(); - adder.accept(new DisiAndMostCompetitiveValue(iter, mostCompetitiveValue)); - docs = IntsRef.EMPTY_INTS; - index = 0; - blockMaxDoc = -1; - docsInOrder = true; - } - - DocIdSetIterator generateCompetitiveIterator() throws IOException { - intersectLeaves(pointValues.getPointTree()); - update(); - - if (disis.isEmpty()) { - return DocIdSetIterator.empty(); - } - assert assertMostCompetitiveValuesSorted(disis); - - PriorityQueue disjunction = - new PriorityQueue<>(disis.size()) { - @Override - protected boolean lessThan( - DisiAndMostCompetitiveValue a, DisiAndMostCompetitiveValue b) { - return a.disi.docID() < b.disi.docID(); - } - }; - disjunction.addAll(disis); - - return new CompetitiveIterator(maxDoc, disis, disjunction); - } - - /** - * Used for assert. When reverse is false, smaller values are more competitive, so - * mostCompetitiveValues should be in desc order. - */ - private boolean assertMostCompetitiveValuesSorted(Deque deque) { - long lastValue = reverse == false ? 
Long.MAX_VALUE : Long.MIN_VALUE; - for (DisiAndMostCompetitiveValue value : deque) { - if (reverse == false) { - assert value.mostCompetitiveValue <= lastValue - : deque.stream().map(d -> d.mostCompetitiveValue).toList().toString(); - } else { - assert value.mostCompetitiveValue >= lastValue - : deque.stream().map(d -> d.mostCompetitiveValue).toList().toString(); - } - lastValue = value.mostCompetitiveValue; - } - return true; - } - - private int minBlockLength() { - // bottom value can be much more competitive than thresholdAsLong, recompute the cost. - long cost = - pointValues.estimatePointCount(new RangeVisitor(minValueAsLong, maxValueAsLong, -1)); - long disjunctionClause = Math.min(MAX_DISJUNCTION_CLAUSE, cost / 512 + 1); - return Math.toIntExact(cost / disjunctionClause); - } - } - } - - private class RangeVisitor implements PointValues.IntersectVisitor { - - private final long minInclusive; - private final long maxInclusive; - private final int docLowerBound; - - private RangeVisitor(long minInclusive, long maxInclusive, int docLowerBound) { - this.minInclusive = minInclusive; - this.maxInclusive = maxInclusive; - this.docLowerBound = docLowerBound; - } - - @Override - public void visit(int docID) throws IOException { - if (docID <= docLowerBound) { - return; // Already visited or skipped - } - consumeDoc(docID); - } - - @Override - public void visit(int docID, byte[] packedValue) throws IOException { - if (docID <= docLowerBound) { - return; // already visited or skipped - } - long l = sortableBytesToLong(packedValue); - if (l >= minInclusive && l <= maxInclusive) { - consumeDoc(docID); - } - } - - @Override - public void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOException { - long l = sortableBytesToLong(packedValue); - if (l >= minInclusive && l <= maxInclusive) { - int doc = docLowerBound >= 0 ? iterator.advance(docLowerBound) : iterator.nextDoc(); - while (doc != DocIdSetIterator.NO_MORE_DOCS) { - consumeDoc(doc); - doc = iterator.nextDoc(); - } - } - } - - @Override - public void visit(DocIdSetIterator iterator) throws IOException { - int doc = docLowerBound >= 0 ? iterator.advance(docLowerBound) : iterator.nextDoc(); - while (doc != DocIdSetIterator.NO_MORE_DOCS) { - consumeDoc(doc); - doc = iterator.nextDoc(); - } - } - - @Override - public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { - long min = sortableBytesToLong(minPackedValue); - long max = sortableBytesToLong(maxPackedValue); - - if (min > maxInclusive || max < minInclusive) { - // 1. cmp ==0 and pruning==Pruning.GREATER_THAN_OR_EQUAL_TO : if the sort is - // ascending then maxValueAsLong is bottom's next less value, so it is competitive - // 2. 
cmp ==0 and pruning==Pruning.GREATER_THAN: maxValueAsLong equals to - // bottom, but there are multiple comparators, so it could be competitive - return PointValues.Relation.CELL_OUTSIDE_QUERY; - } - - if (min < minInclusive || max > maxInclusive) { - return PointValues.Relation.CELL_CROSSES_QUERY; - } - return PointValues.Relation.CELL_INSIDE_QUERY; - } - - void consumeDoc(int doc) { - throw new UnsupportedOperationException(); - } - } - - private record DisiAndMostCompetitiveValue(DocIdSetIterator disi, long mostCompetitiveValue) {} - - private static class CompetitiveIterator extends DocIdSetIterator { - - private final int maxDoc; - private int doc = -1; - private final Deque disis; - private final PriorityQueue disjunction; - - CompetitiveIterator( - int maxDoc, - Deque disis, - PriorityQueue disjunction) { - this.maxDoc = maxDoc; - this.disis = disis; - this.disjunction = disjunction; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(docID() + 1); - } - - @Override - public int advance(int target) throws IOException { - if (target >= maxDoc) { - return doc = NO_MORE_DOCS; - } else { - DisiAndMostCompetitiveValue top = disjunction.top(); - if (top == null) { - // priority queue is empty, none of the remaining documents are competitive - return doc = NO_MORE_DOCS; - } - while (top.disi.docID() < target) { - top.disi.advance(target); - top = disjunction.updateTop(); - } - return doc = top.disi.docID(); - } - } - - @Override - public long cost() { - return maxDoc; - } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java index 31199c3cd2c7..ebe4fb2f956a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java @@ -464,15 +464,7 @@ public DocIdSetIterator competitiveIterator() { } } - private static class PostingsEnumAndOrd { - private final PostingsEnum postings; - private final int ord; - - PostingsEnumAndOrd(PostingsEnum postings, int ord) { - this.postings = postings; - this.ord = ord; - } - } + private record PostingsEnumAndOrd(PostingsEnum postings, int ord) {} private class CompetitiveIterator extends DocIdSetIterator { diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java index 1ca979d67947..051cd9ed6339 100644 --- a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java @@ -33,17 +33,17 @@ public final class MultiLeafKnnCollector implements KnnCollector { // greediness of globally non-competitive search: (0,1] private static final float DEFAULT_GREEDINESS = 0.9f; + private static final int DEFAULT_INTERVAL = 0xff; // the global queue of the highest similarities collected so far across all segments private final BlockingFloatHeap globalSimilarityQueue; // the local queue of the highest similarities if we are not competitive globally // the size of this queue is defined by greediness private final FloatHeap nonCompetitiveQueue; - private final float greediness; // the queue of the local similarities to periodically update with the global queue private final FloatHeap updatesQueue; private final float[] updatesScratch; // 
interval to synchronize the local and global queues, as a number of visited vectors - private final int interval = 0xff; // 255 + private final int interval; private boolean kResultsCollected = false; private float cachedGlobalMinSim = Float.NEGATIVE_INFINITY; private final AbstractKnnCollector subCollector; @@ -58,7 +58,32 @@ public final class MultiLeafKnnCollector implements KnnCollector { */ public MultiLeafKnnCollector( int k, BlockingFloatHeap globalSimilarityQueue, AbstractKnnCollector subCollector) { - this.greediness = DEFAULT_GREEDINESS; + this(k, DEFAULT_GREEDINESS, DEFAULT_INTERVAL, globalSimilarityQueue, subCollector); + } + + /** + * Create a new MultiLeafKnnCollector. + * + * @param k the number of neighbors to collect + * @param greediness the greediness of the global search + * @param interval (by number of collected values) the interval to synchronize the local and + * global queues + * @param globalSimilarityQueue the global queue of the highest similarities collected so far + * @param subCollector the local collector + */ + public MultiLeafKnnCollector( + int k, + float greediness, + int interval, + BlockingFloatHeap globalSimilarityQueue, + AbstractKnnCollector subCollector) { + if (greediness < 0 || greediness > 1) { + throw new IllegalArgumentException("greediness must be in [0,1]"); + } + if (interval <= 0) { + throw new IllegalArgumentException("interval must be positive"); + } + this.interval = interval; this.subCollector = subCollector; this.globalSimilarityQueue = globalSimilarityQueue; this.nonCompetitiveQueue = new FloatHeap(Math.max(1, Math.round((1 - greediness) * k))); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java index 77f71782e315..b4546946acfd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java @@ -44,13 +44,26 @@ public abstract class Axiomatic extends SimilarityBase { protected final int queryLen; /** - * Constructor setting all Axiomatic hyperparameters + * Constructor setting all Axiomatic hyperparameters and using default discountOverlaps value. * * @param s hyperparam for the growth function * @param queryLen the query length * @param k hyperparam for the primitive weighting function */ public Axiomatic(float s, int queryLen, float k) { + this(true, s, queryLen, k); + } + + /** + * Constructor setting all Axiomatic hyperparameters + * + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. 
+ * @param s hyperparam for the growth function + * @param queryLen the query length + * @param k hyperparam for the primitive weighting function + */ + public Axiomatic(boolean discountOverlaps, float s, int queryLen, float k) { + super(discountOverlaps); if (Float.isFinite(s) == false || Float.isNaN(s) || s < 0 || s > 1) { throw new IllegalArgumentException("illegal s value: " + s + ", must be between 0 and 1"); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index f6dacd54c1e6..274bb475d6c7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -18,8 +18,6 @@ import java.util.ArrayList; import java.util.List; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -33,7 +31,6 @@ public class BM25Similarity extends Similarity { private final float k1; private final float b; - private final boolean discountOverlaps; /** * BM25 with the supplied parameter values. @@ -46,6 +43,7 @@ public class BM25Similarity extends Similarity { * within the range {@code [0..1]} */ public BM25Similarity(float k1, float b, boolean discountOverlaps) { + super(discountOverlaps); if (Float.isFinite(k1) == false || k1 < 0) { throw new IllegalArgumentException( "illegal k1 value: " + k1 + ", must be a non-negative finite value"); @@ -55,7 +53,6 @@ public BM25Similarity(float k1, float b, boolean discountOverlaps) { } this.k1 = k1; this.b = b; - this.discountOverlaps = discountOverlaps; } /** @@ -110,15 +107,6 @@ protected float avgFieldLength(CollectionStatistics collectionStats) { return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount()); } - /** - * Returns true if overlap tokens are discounted from the document's length. - * - * @see #BM25Similarity(float, float, boolean) - */ - public boolean getDiscountOverlaps() { - return discountOverlaps; - } - /** Cache of decoded bytes. */ private static final float[] LENGTH_TABLE = new float[256]; @@ -128,19 +116,6 @@ public boolean getDiscountOverlaps() { } } - @Override - public final long computeNorm(FieldInvertState state) { - final int numTerms; - if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) { - numTerms = state.getUniqueTermCount(); - } else if (discountOverlaps) { - numTerms = state.getLength() - state.getNumOverlap(); - } else { - numTerms = state.getLength(); - } - return SmallFloat.intToByte4(numTerms); - } - /** * Computes a score factor for a simple term and returns an explanation for that score factor. 
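Since computeNorm and getDiscountOverlaps move out of BM25Similarity, the discountOverlaps flag is now passed up to the base Similarity constructor via super(discountOverlaps). A hedged sketch of how a caller wires it in; the k1/b values are arbitrary and the writerConfig/searcher objects are assumptions:

    Similarity sim = new BM25Similarity(1.2f, 0.75f, false); // keep overlap tokens in the field length
    writerConfig.setSimilarity(sim); // index-time norm encoding
    searcher.setSimilarity(sim);     // search-time scoring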
* diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java index ca1d579c93c7..db28e8454488 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.search.similarities; -import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -25,22 +24,15 @@ * Simple similarity that gives terms a score that is equal to their query boost. This similarity is * typically used with disabled norms since neither document statistics nor index statistics are * used for scoring. That said, if norms are enabled, they will be computed the same way as {@link - * SimilarityBase} and {@link BM25Similarity} with {@link - * SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps} so that the {@link Similarity} - * can be changed after the index has been created. + * SimilarityBase} and {@link BM25Similarity} with {@link SimilarityBase#getDiscountOverlaps() + * discounted overlaps} so that the {@link Similarity} can be changed after the index has been + * created. */ public class BooleanSimilarity extends Similarity { - private static final Similarity BM25_SIM = new BM25Similarity(); - /** Sole constructor */ public BooleanSimilarity() {} - @Override - public long computeNorm(FieldInvertState state) { - return BM25_SIM.computeNorm(state); - } - @Override public SimScorer scorer( float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java index 84b688feda7b..5fad406c5856 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java @@ -26,8 +26,15 @@ */ public class ClassicSimilarity extends TFIDFSimilarity { - /** Sole constructor: parameter-free */ - public ClassicSimilarity() {} + /** Default constructor: parameter-free */ + public ClassicSimilarity() { + super(); + } + + /** Primary constructor. */ + public ClassicSimilarity(boolean discountOverlaps) { + super(discountOverlaps); + } /** * Implemented as 1/sqrt(length). 
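The same discountOverlaps plumbing shows up as an extra constructor on ClassicSimilarity above and, in the hunks that follow, on DFISimilarity, DFRSimilarity, IBSimilarity and the LM similarities. A small sketch of the ClassicSimilarity overload; the no-argument form presumably keeps the prior default of discounting overlaps, matching the this(true, ...) delegations elsewhere in this patch:

    Similarity keepOverlaps = new ClassicSimilarity(false); // count overlap tokens in the length
    Similarity defaults = new ClassicSimilarity();          // parameter-free, prior behaviour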
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java index b9c651008ccd..34d619ea69f3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java @@ -46,11 +46,23 @@ public class DFISimilarity extends SimilarityBase { private final Independence independence; /** - * Create DFI with the specified divergence from independence measure + * Create DFI with the specified divergence from independence measure and using default + * discountOverlaps value * * @param independenceMeasure measure of divergence from independence */ public DFISimilarity(Independence independenceMeasure) { + this(independenceMeasure, true); + } + + /** + * Create DFI with the specified parameters + * + * @param independenceMeasure measure of divergence from independence + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + */ + public DFISimilarity(Independence independenceMeasure, boolean discountOverlaps) { + super(discountOverlaps); this.independence = independenceMeasure; } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java index f47c05be1f02..08e424b32303 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -83,7 +83,7 @@ public class DFRSimilarity extends SimilarityBase { protected final Normalization normalization; /** - * Creates DFRSimilarity from the three components. + * Creates DFRSimilarity from the three components and using default discountOverlaps value. * *

<p>
    Note that null values are not allowed: if you want no normalization, instead * pass {@link NoNormalization}. @@ -94,6 +94,27 @@ public class DFRSimilarity extends SimilarityBase { */ public DFRSimilarity( BasicModel basicModel, AfterEffect afterEffect, Normalization normalization) { + this(basicModel, afterEffect, normalization, true); + } + + /** + * Creates DFRSimilarity from the three components and with the specified discountOverlaps value. + * + *

<p>
    Note that null values are not allowed: if you want no normalization, instead + * pass {@link NoNormalization}. + * + * @param basicModel Basic model of information content + * @param afterEffect First normalization of information gain + * @param normalization Second (length) normalization + * @param discountOverlaps True if overlap tokens (tokens with a position of increment of zero) + * are discounted from the document's length. + */ + public DFRSimilarity( + BasicModel basicModel, + AfterEffect afterEffect, + Normalization normalization, + boolean discountOverlaps) { + super(discountOverlaps); if (basicModel == null || afterEffect == null || normalization == null) { throw new NullPointerException("null parameters not allowed."); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java index 5b0e93571b12..d2325d200335 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java @@ -76,7 +76,7 @@ public class IBSimilarity extends SimilarityBase { protected final Normalization normalization; /** - * Creates IBSimilarity from the three components. + * Creates IBSimilarity from the three components and using default discountOverlaps value. * *

    Note that null values are not allowed: if you want no normalization, instead * pass {@link NoNormalization}. @@ -86,6 +86,26 @@ public class IBSimilarity extends SimilarityBase { * @param normalization term frequency normalization */ public IBSimilarity(Distribution distribution, Lambda lambda, Normalization normalization) { + this(distribution, lambda, normalization, true); + } + + /** + * Creates IBSimilarity from the three components and with the specified discountOverlaps value. + * + *

    Note that null values are not allowed: if you want no normalization, instead + * pass {@link NoNormalization}. + * + * @param distribution probabilistic distribution modeling term occurrence + * @param lambda distribution's λw parameter + * @param normalization term frequency normalization + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + */ + public IBSimilarity( + Distribution distribution, + Lambda lambda, + Normalization normalization, + boolean discountOverlaps) { + super(discountOverlaps); this.distribution = distribution; this.lambda = lambda; this.normalization = normalization; diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java index 9f708362bb5f..b3994c5dc46e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java @@ -37,6 +37,13 @@ public class IndriDirichletSimilarity extends LMSimilarity { /** The μ parameter. */ private final float mu; + /** Instantiates the similarity with the provided parameters. */ + public IndriDirichletSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float mu) { + super(collectionModel, discountOverlaps); + this.mu = mu; + } + /** Instantiates the similarity with the provided μ parameter. */ public IndriDirichletSimilarity(CollectionModel collectionModel, float mu) { super(collectionModel); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java index 51b1604aef1c..ab80d0d337e5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java @@ -39,7 +39,13 @@ public class LMDirichletSimilarity extends LMSimilarity { /** Instantiates the similarity with the provided μ parameter. */ public LMDirichletSimilarity(CollectionModel collectionModel, float mu) { - super(collectionModel); + this(collectionModel, true, mu); + } + + /** Instantiates the similarity with the provided parameters. */ + public LMDirichletSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float mu) { + super(collectionModel, discountOverlaps); if (Float.isFinite(mu) == false || mu < 0) { throw new IllegalArgumentException( "illegal mu value: " + mu + ", must be a non-negative finite value"); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java index e1990f34b0b6..7029fa8e133c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java @@ -43,7 +43,13 @@ public class LMJelinekMercerSimilarity extends LMSimilarity { /** Instantiates with the specified collectionModel and λ parameter. */ public LMJelinekMercerSimilarity(CollectionModel collectionModel, float lambda) { - super(collectionModel); + this(collectionModel, true, lambda); + } + + /** Instantiates with the specified collectionModel and parameters. 
*/ + public LMJelinekMercerSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float lambda) { + super(collectionModel, discountOverlaps); if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) { throw new IllegalArgumentException("lambda must be in the range (0 .. 1]"); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java index e1536db268fd..5bd48f37a34e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java @@ -43,6 +43,12 @@ public abstract class LMSimilarity extends SimilarityBase { /** Creates a new instance with the specified collection language model. */ public LMSimilarity(CollectionModel collectionModel) { + this(collectionModel, true); + } + + /** Creates a new instance with the specified collection language model and discountOverlaps. */ + public LMSimilarity(CollectionModel collectionModel, boolean discountOverlaps) { + super(discountOverlaps); this.collectionModel = collectionModel; } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/RawTFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/RawTFSimilarity.java new file mode 100644 index 000000000000..043d831dbe19 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/RawTFSimilarity.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.TermStatistics; + +/** Similarity that returns the raw TF as score. */ +public class RawTFSimilarity extends Similarity { + + /** Default constructor: parameter-free */ + public RawTFSimilarity() { + super(); + } + + /** Primary constructor. */ + public RawTFSimilarity(boolean discountOverlaps) { + super(discountOverlaps); + } + + @Override + public SimScorer scorer( + float boost, CollectionStatistics collectionStats, TermStatistics... 
termStats) { + return new SimScorer() { + @Override + public float score(float freq, long norm) { + return boost * freq; + } + }; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java index 24022e832774..83582e44e25f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java @@ -17,8 +17,10 @@ package org.apache.lucene.search.similarities; import java.util.Collections; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; @@ -45,7 +47,7 @@ * is in this norm, but it is most useful for encoding length normalization information. * *
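
Editorial aside: a minimal, hedged sketch of how the discountOverlaps constructors added above might be used; the example class name is made up, and IndexWriterConfig, StandardAnalyzer and IndependenceChiSquared are pre-existing Lucene classes rather than part of this change.

```java
// Illustrative only: wire a similarity built with the new discountOverlaps flag into indexing.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.IndependenceChiSquared;
import org.apache.lucene.search.similarities.Similarity;

public class DiscountOverlapsExample {
  public static void main(String[] args) {
    // discountOverlaps is now fixed at construction time; there is no setter anymore.
    Similarity similarity = new DFISimilarity(new IndependenceChiSquared(), false);

    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    config.setSimilarity(similarity); // affects the norms written at index time

    // At search time, set the same instance on the IndexSearcher so index-time and
    // query-time settings stay consistent; changing the flag requires re-indexing.
  }
}
```
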

    Implementations should carefully consider how the normalization is encoded: while Lucene's - * {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a + * default implementation encodes length normalization information with {@link SmallFloat} into a * single byte, this might not be suitable for all purposes. * *

    Many formulas require the use of average document length, which can be computed via a @@ -88,13 +90,49 @@ * @lucene.experimental */ public abstract class Similarity { - /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ - // Explicitly declared so that we have non-empty javadoc - protected Similarity() {} + /** + * True if overlap tokens (tokens with a position increment of zero) are discounted from the + * document's length. + */ + private final boolean discountOverlaps; + + /** + * Returns true if overlap tokens are discounted from the document's length. + * + * @see #computeNorm + */ + public final boolean getDiscountOverlaps() { + return discountOverlaps; + } + + /** Default constructor. (For invocation by subclass constructors, typically implicit.) */ + protected Similarity() { + this(true); + } + + /** + * Expert constructor that allows adjustment of {@link #getDiscountOverlaps()} at index-time. + * + *

    Overlap tokens are tokens, such as synonyms, that have a {@link PositionIncrementAttribute} + * of zero from the analysis chain. + * + *

    NOTE: If you modify this parameter, you'll need to re-index for it to take effect. + * + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + */ + protected Similarity(boolean discountOverlaps) { + this.discountOverlaps = discountOverlaps; + } /** - * Computes the normalization value for a field, given the accumulated state of term processing - * for this field (see {@link FieldInvertState}). + * Computes the normalization value for a field at index-time. + * + *

    The default implementation uses {@link SmallFloat#intToByte4} to encode the number of terms + * as a single byte. + * + *

    WARNING: The default implementation is used by Lucene's supplied Similarity classes, + * which means you can change the Similarity at runtime without reindexing. If you override this + * method, you'll need to re-index documents for it to take effect. * *

    Matches in longer fields are less precise, so implementations of this method usually set * smaller values when state.getLength() is large, and larger values when @@ -108,10 +146,20 @@ protected Similarity() {} *

    {@code 0} is not a legal norm, so {@code 1} is the norm that produces the highest scores. * * @lucene.experimental - * @param state current processing state for this field + * @param state accumulated state of term processing for this field * @return computed norm value */ - public abstract long computeNorm(FieldInvertState state); + public long computeNorm(FieldInvertState state) { + final int numTerms; + if (state.getIndexOptions() == IndexOptions.DOCS) { + numTerms = state.getUniqueTermCount(); + } else if (discountOverlaps) { + numTerms = state.getLength() - state.getNumOverlap(); + } else { + numTerms = state.getLength(); + } + return SmallFloat.intToByte4(numTerms); + } /** * Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java index ef5366b60380..af63b7bebeb6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java @@ -18,8 +18,6 @@ import java.util.ArrayList; import java.util.List; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -43,33 +41,14 @@ public abstract class SimilarityBase extends Similarity { /** For {@link #log2(double)}. Precomputed for efficiency reasons. */ private static final double LOG_2 = Math.log(2); - /** - * True if overlap tokens (tokens with a position of increment of zero) are discounted from the - * document's length. - */ - protected boolean discountOverlaps = true; - - /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ - public SimilarityBase() {} - - /** - * Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing - * norm. By default this is true, meaning overlap tokens do not count when computing norms. - * - * @lucene.experimental - * @see #computeNorm - */ - public void setDiscountOverlaps(boolean v) { - discountOverlaps = v; + /** Default constructor: parameter-free */ + public SimilarityBase() { + super(); } - /** - * Returns true if overlap tokens are discounted from the document's length. - * - * @see #setDiscountOverlaps - */ - public boolean getDiscountOverlaps() { - return discountOverlaps; + /** Primary constructor. */ + public SimilarityBase(boolean discountOverlaps) { + super(discountOverlaps); } @Override @@ -179,20 +158,6 @@ protected Explanation explain(BasicStats stats, Explanation freq, double docLen) } } - /** Encodes the document length in the same way as {@link BM25Similarity}. */ - @Override - public final long computeNorm(FieldInvertState state) { - final int numTerms; - if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) { - numTerms = state.getUniqueTermCount(); - } else if (discountOverlaps) { - numTerms = state.getLength() - state.getNumOverlap(); - } else { - numTerms = state.getLength(); - } - return SmallFloat.intToByte4(numTerms); - } - // ----------------------------- Static methods ------------------------------ /** Returns the base two logarithm of {@code x}. 
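
Editorial aside: with computeNorm given a default implementation in Similarity (and the final overrides removed from SimilarityBase and TFIDFSimilarity), a minimal custom similarity only has to supply a SimScorer. A hedged sketch; the class name ConstantBoostSimilarity is hypothetical.

```java
// Sketch of a custom similarity under the refactor above: norms are encoded by the
// inherited computeNorm, so only scoring needs to be provided.
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;

public class ConstantBoostSimilarity extends Similarity {

  public ConstantBoostSimilarity() {
    super(false); // keep overlap tokens in the indexed document length
  }

  @Override
  public SimScorer scorer(
      float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return new SimScorer() {
      @Override
      public float score(float freq, long norm) {
        return boost; // ignore term frequency and length normalization entirely
      }
    };
  }
}
```
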
*/ diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java index 906c7dd2875c..c81ef763862d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java @@ -18,8 +18,6 @@ import java.util.ArrayList; import java.util.List; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; @@ -326,33 +324,14 @@ */ public abstract class TFIDFSimilarity extends Similarity { - /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ - public TFIDFSimilarity() {} - - /** - * True if overlap tokens (tokens with a position of increment of zero) are discounted from the - * document's length. - */ - protected boolean discountOverlaps = true; - - /** - * Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing - * norm. By default this is true, meaning overlap tokens do not count when computing norms. - * - * @lucene.experimental - * @see #computeNorm - */ - public void setDiscountOverlaps(boolean v) { - discountOverlaps = v; + /** Default constructor: parameter-free */ + public TFIDFSimilarity() { + super(); } - /** - * Returns true if overlap tokens are discounted from the document's length. - * - * @see #setDiscountOverlaps - */ - public boolean getDiscountOverlaps() { - return discountOverlaps; + /** Primary constructor. */ + public TFIDFSimilarity(boolean discountOverlaps) { + super(discountOverlaps); } /** @@ -438,7 +417,7 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti /** * Compute an index-time normalization value for this field instance. * - * @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean) + * @param length the number of terms in the field, optionally {@link #getDiscountOverlaps() * discounting overlaps} * @return a length normalization value */ @@ -453,19 +432,6 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti } } - @Override - public final long computeNorm(FieldInvertState state) { - final int numTerms; - if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) { - numTerms = state.getUniqueTermCount(); - } else if (discountOverlaps) { - numTerms = state.getLength() - state.getNumOverlap(); - } else { - numTerms = state.getLength(); - } - return SmallFloat.intToByte4(numTerms); - } - @Override public final SimScorer scorer( float boost, CollectionStatistics collectionStats, TermStatistics... 
termStats) { diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 13151692bc06..1738259fa2fb 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -151,7 +151,7 @@ public final int readInt() throws IOException { } @Override - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(int[] dst, int offset) throws IOException { final int len = GroupVIntUtil.readGroupVInt( this, buffer.remaining(), p -> buffer.getInt((int) p), buffer.position(), dst, offset); diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java index 4b722b61689c..39e920616209 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java @@ -204,7 +204,7 @@ public long readLong() throws IOException { } @Override - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(int[] dst, int offset) throws IOException { final ByteBuffer block = blocks[blockIndex(pos)]; final int blockOffset = blockOffset(pos); // We MUST save the return value to local variable, could not use pos += readGroupVInt(...). diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java index eaa0929848db..1c6bcd636299 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java @@ -31,7 +31,6 @@ import java.util.function.IntFunction; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BitUtil; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.UnicodeUtil; @@ -415,12 +414,17 @@ public void writeLong(long v) { @Override public void writeString(String v) { try { - if (v.length() <= MAX_CHARS_PER_WINDOW) { - final BytesRef utf8 = new BytesRef(v); - writeVInt(utf8.length); - writeBytes(utf8.bytes, utf8.offset, utf8.length); + final int charCount = v.length(); + final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount); + writeVInt(byteLen); + ByteBuffer currentBlock = this.currentBlock; + if (currentBlock.hasArray() && currentBlock.remaining() >= byteLen) { + int startingPos = currentBlock.position(); + UnicodeUtil.UTF16toUTF8( + v, 0, charCount, currentBlock.array(), currentBlock.arrayOffset() + startingPos); + currentBlock.position(startingPos + byteLen); } else { - writeLongString(v); + writeLongString(byteLen, v); } } catch (IOException e) { throw new UncheckedIOException(e); @@ -541,9 +545,7 @@ private static int computeBlockSizeBitsFor(long bytes) { } /** Writes a long string in chunks */ - private void writeLongString(final String s) throws IOException { - final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length()); - writeVInt(byteLen); + private void writeLongString(int byteLen, final String s) throws IOException { final byte[] buf = new byte[Math.min(byteLen, UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW)]; for (int i = 0, end = s.length(); i < end; ) { diff --git 
a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java index c66d864d570f..959f429ecfce 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java @@ -206,7 +206,7 @@ public void readLongs(long[] dst, int offset, int length) throws IOException { } @Override - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(int[] dst, int offset) throws IOException { ensureOpen(); in.readGroupVInt(dst, offset); } diff --git a/lucene/core/src/java/org/apache/lucene/store/DataInput.java b/lucene/core/src/java/org/apache/lucene/store/DataInput.java index 427e81f2df24..369b631426bd 100644 --- a/lucene/core/src/java/org/apache/lucene/store/DataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/DataInput.java @@ -100,28 +100,12 @@ public int readInt() throws IOException { } /** - * Read all the group varints, including the tail vints. we need a long[] because this is what - * postings are using, all longs are actually required to be integers. + * Override if you have an efficient implementation. In general this is when the input supports + * random access. * - * @param dst the array to read ints into. - * @param limit the number of int values to read. * @lucene.experimental */ - public final void readGroupVInts(long[] dst, int limit) throws IOException { - int i; - for (i = 0; i <= limit - 4; i += 4) { - readGroupVInt(dst, i); - } - for (; i < limit; ++i) { - dst[i] = readVInt() & 0xFFFFFFFFL; - } - } - - /** - * Override if you have a efficient implementation. In general this is when the input supports - * random access. - */ - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(int[] dst, int offset) throws IOException { GroupVIntUtil.readGroupVInt(this, dst, offset); } diff --git a/lucene/core/src/java/org/apache/lucene/store/DataOutput.java b/lucene/core/src/java/org/apache/lucene/store/DataOutput.java index 7b97dcd8a7b0..b312a693eba5 100644 --- a/lucene/core/src/java/org/apache/lucene/store/DataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/store/DataOutput.java @@ -340,4 +340,19 @@ public void writeGroupVInts(long[] values, int limit) throws IOException { } GroupVIntUtil.writeGroupVInts(this, groupVIntBytes, values, limit); } + + /** + * Encode integers using group-varint. It uses {@link DataOutput#writeVInt VInt} to encode tail + * values that are not enough for a group. + * + * @param values the values to write + * @param limit the number of values to write. 
+ * @lucene.experimental + */ + public void writeGroupVInts(int[] values, int limit) throws IOException { + if (groupVIntBytes == null) { + groupVIntBytes = new byte[GroupVIntUtil.MAX_LENGTH_PER_GROUP]; + } + GroupVIntUtil.writeGroupVInts(this, groupVIntBytes, values, limit); + } } diff --git a/lucene/core/src/java/org/apache/lucene/store/IOContext.java b/lucene/core/src/java/org/apache/lucene/store/IOContext.java index f318b3a90157..91f3822dbc13 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IOContext.java +++ b/lucene/core/src/java/org/apache/lucene/store/IOContext.java @@ -68,10 +68,10 @@ public enum Context { Objects.requireNonNull(context, "context must not be null"); Objects.requireNonNull(readAdvice, "readAdvice must not be null"); switch (context) { - case MERGE -> Objects.requireNonNull( - mergeInfo, "mergeInfo must not be null if context is MERGE"); - case FLUSH -> Objects.requireNonNull( - flushInfo, "flushInfo must not be null if context is FLUSH"); + case MERGE -> + Objects.requireNonNull(mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> + Objects.requireNonNull(flushInfo, "flushInfo must not be null if context is FLUSH"); } if ((context == Context.FLUSH || context == Context.MERGE) && readAdvice != ReadAdvice.SEQUENTIAL) { diff --git a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java index ee84d9088382..38eb1dcbceeb 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java @@ -127,6 +127,10 @@ public abstract IndexInput slice(String sliceDescription, long offset, long leng * CompoundFormat} implementations to honor the {@link ReadAdvice} of each file within the * compound file. * + *

    NOTE: it is only legal to call this method if this {@link IndexInput} has been opened + * with {@link ReadAdvice#NORMAL}. However, this method accepts any {@link ReadAdvice} value but + * {@code null} as a read advice for the slice. + * *

    The default implementation delegates to {@link #slice(String, long, long)} and ignores the * {@link ReadAdvice}. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/Accountables.java b/lucene/core/src/java/org/apache/lucene/util/Accountables.java index d822114452d2..b5c0f92c6810 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Accountables.java +++ b/lucene/core/src/java/org/apache/lucene/util/Accountables.java @@ -19,7 +19,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Comparator; import java.util.List; import java.util.Map; @@ -98,14 +97,7 @@ public static Collection namedAccountables( for (Map.Entry kv : in.entrySet()) { resources.add(namedAccountable(prefix + " '" + kv.getKey() + "'", kv.getValue())); } - Collections.sort( - resources, - new Comparator() { - @Override - public int compare(Accountable o1, Accountable o2) { - return o1.toString().compareTo(o2.toString()); - } - }); + resources.sort((o1, o2) -> o1.toString().compareTo(o2.toString())); return Collections.unmodifiableList(resources); } diff --git a/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java b/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java index 722df2ff6879..2cd3cb63cfe5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java @@ -174,42 +174,34 @@ public static int oversize(int minTargetSize, int bytesPerElement) { if (Constants.JRE_IS_64BIT) { // round up to 8 byte alignment in 64bit env - switch (bytesPerElement) { - case 4: - // round up to multiple of 2 - return (newSize + 1) & 0x7ffffffe; - case 2: - // round up to multiple of 4 - return (newSize + 3) & 0x7ffffffc; - case 1: - // round up to multiple of 8 - return (newSize + 7) & 0x7ffffff8; - case 8: - // no rounding - default: - // odd (invalid?) size - return newSize; - } + return switch (bytesPerElement) { + // round up to multiple of 2 + case 4 -> (newSize + 1) & 0x7ffffffe; + // round up to multiple of 4 + case 2 -> (newSize + 3) & 0x7ffffffc; + // round up to multiple of 8 + case 1 -> (newSize + 7) & 0x7ffffff8; + // no rounding + case 8 -> newSize; + // odd (invalid?) size + default -> newSize; + }; } else { // In 32bit jvm, it's still 8-byte aligned, // but the array header is 12 bytes, not a multiple of 8. // So saving 4,12,20,28... bytes of data is the most cost-effective. - switch (bytesPerElement) { - case 1: - // align with size of 4,12,20,28... - return ((newSize + 3) & 0x7ffffff8) + 4; - case 2: - // align with size of 6,10,14,18... - return ((newSize + 1) & 0x7ffffffc) + 2; - case 4: - // align with size of 5,7,9,11... - return (newSize & 0x7ffffffe) + 1; - case 8: - // no processing required - default: - // odd (invalid?) size - return newSize; - } + return switch (bytesPerElement) { + // align with size of 4,12,20,28... + case 1 -> ((newSize + 3) & 0x7ffffff8) + 4; + // align with size of 6,10,14,18... + case 2 -> ((newSize + 1) & 0x7ffffffc) + 2; + // align with size of 5,7,9,11... + case 4 -> (newSize & 0x7ffffffe) + 1; + // no processing required + case 8 -> newSize; + // odd (invalid?) 
size + default -> newSize; + }; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java index 21adf0c60da7..27fc4337d256 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java @@ -100,8 +100,10 @@ public void clear() { * #clear()} and then {@link #append(byte[], int, int)}. */ public void copyBytes(byte[] b, int off, int len) { - clear(); - append(b, off, len); + assert ref.offset == 0; + ref.length = len; + growNoCopy(len); + System.arraycopy(b, off, ref.bytes, 0, len); } /** @@ -109,8 +111,7 @@ public void copyBytes(byte[] b, int off, int len) { * #clear()} and then {@link #append(BytesRef)}. */ public void copyBytes(BytesRef ref) { - clear(); - append(ref); + copyBytes(ref.bytes, ref.offset, ref.length); } /** @@ -118,8 +119,7 @@ public void copyBytes(BytesRef ref) { * #clear()} and then {@link #append(BytesRefBuilder)}. */ public void copyBytes(BytesRefBuilder builder) { - clear(); - append(builder); + copyBytes(builder.get()); } /** @@ -135,7 +135,7 @@ public void copyChars(CharSequence text) { * text. */ public void copyChars(CharSequence text, int off, int len) { - grow(UnicodeUtil.maxUTF8Length(len)); + growNoCopy(UnicodeUtil.maxUTF8Length(len)); ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes); } @@ -144,7 +144,7 @@ public void copyChars(CharSequence text, int off, int len) { * text. */ public void copyChars(char[] text, int off, int len) { - grow(UnicodeUtil.maxUTF8Length(len)); + growNoCopy(UnicodeUtil.maxUTF8Length(len)); ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes); } diff --git a/lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java b/lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java index e1b5466342a0..e95e2eee4db0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java @@ -30,12 +30,49 @@ public final class GroupVIntUtil { public static final int MAX_LENGTH_PER_GROUP = 17; // we use long array instead of int array to make negative integer to be read as positive long. - private static final long[] MASKS = new long[] {0xFFL, 0xFFFFL, 0xFFFFFFL, 0xFFFFFFFFL}; + private static final long[] LONG_MASKS = new long[] {0xFFL, 0xFFFFL, 0xFFFFFFL, 0xFFFFFFFFL}; + private static final int[] INT_MASKS = new int[] {0xFF, 0xFFFF, 0xFFFFFF, ~0}; + + /** + * Read all the group varints, including the tail vints. we need a long[] because this is what + * postings are using, all longs are actually required to be integers. + * + * @param dst the array to read ints into. + * @param limit the number of int values to read. + * @lucene.experimental + */ + public static void readGroupVInts(DataInput in, long[] dst, int limit) throws IOException { + int i; + for (i = 0; i <= limit - 4; i += 4) { + readGroupVInt(in, dst, i); + } + for (; i < limit; ++i) { + dst[i] = in.readVInt() & 0xFFFFFFFFL; + } + } + + /** + * Read all the group varints, including the tail vints. + * + * @param dst the array to read ints into. + * @param limit the number of int values to read. 
+ * @lucene.experimental + */ + public static void readGroupVInts(DataInput in, int[] dst, int limit) throws IOException { + int i; + for (i = 0; i <= limit - 4; i += 4) { + in.readGroupVInt(dst, i); + } + for (; i < limit; ++i) { + dst[i] = in.readVInt(); + } + } /** * Default implementation of read single group, for optimal performance, you should use {@link - * DataInput#readGroupVInts(long[], int)} instead. + * GroupVIntUtil#readGroupVInts(DataInput, long[], int)} instead. * + * @param in the input to use to read data. * @param dst the array to read ints into. * @param offset the offset in the array to start storing ints. */ @@ -47,22 +84,44 @@ public static void readGroupVInt(DataInput in, long[] dst, int offset) throws IO final int n3Minus1 = (flag >> 2) & 0x03; final int n4Minus1 = flag & 0x03; - dst[offset] = readLongInGroup(in, n1Minus1); - dst[offset + 1] = readLongInGroup(in, n2Minus1); - dst[offset + 2] = readLongInGroup(in, n3Minus1); - dst[offset + 3] = readLongInGroup(in, n4Minus1); + dst[offset] = readIntInGroup(in, n1Minus1) & 0xFFFFFFFFL; + dst[offset + 1] = readIntInGroup(in, n2Minus1) & 0xFFFFFFFFL; + dst[offset + 2] = readIntInGroup(in, n3Minus1) & 0xFFFFFFFFL; + dst[offset + 3] = readIntInGroup(in, n4Minus1) & 0xFFFFFFFFL; } - private static long readLongInGroup(DataInput in, int numBytesMinus1) throws IOException { + /** + * Default implementation of read single group, for optimal performance, you should use {@link + * GroupVIntUtil#readGroupVInts(DataInput, int[], int)} instead. + * + * @param in the input to use to read data. + * @param dst the array to read ints into. + * @param offset the offset in the array to start storing ints. + */ + public static void readGroupVInt(DataInput in, int[] dst, int offset) throws IOException { + final int flag = in.readByte() & 0xFF; + + final int n1Minus1 = flag >> 6; + final int n2Minus1 = (flag >> 4) & 0x03; + final int n3Minus1 = (flag >> 2) & 0x03; + final int n4Minus1 = flag & 0x03; + + dst[offset] = readIntInGroup(in, n1Minus1); + dst[offset + 1] = readIntInGroup(in, n2Minus1); + dst[offset + 2] = readIntInGroup(in, n3Minus1); + dst[offset + 3] = readIntInGroup(in, n4Minus1); + } + + private static int readIntInGroup(DataInput in, int numBytesMinus1) throws IOException { switch (numBytesMinus1) { case 0: - return in.readByte() & 0xFFL; + return in.readByte() & 0xFF; case 1: - return in.readShort() & 0xFFFFL; + return in.readShort() & 0xFFFF; case 2: - return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16); + return (in.readShort() & 0xFFFF) | ((in.readByte() & 0xFF) << 16); default: - return in.readInt() & 0xFFFFFFFFL; + return in.readInt(); } } @@ -104,13 +163,53 @@ public static int readGroupVInt( final int n4Minus1 = flag & 0x03; // This code path has fewer conditionals and tends to be significantly faster in benchmarks - dst[offset] = reader.read(pos) & MASKS[n1Minus1]; + dst[offset] = reader.read(pos) & LONG_MASKS[n1Minus1]; pos += 1 + n1Minus1; - dst[offset + 1] = reader.read(pos) & MASKS[n2Minus1]; + dst[offset + 1] = reader.read(pos) & LONG_MASKS[n2Minus1]; pos += 1 + n2Minus1; - dst[offset + 2] = reader.read(pos) & MASKS[n3Minus1]; + dst[offset + 2] = reader.read(pos) & LONG_MASKS[n3Minus1]; pos += 1 + n3Minus1; - dst[offset + 3] = reader.read(pos) & MASKS[n4Minus1]; + dst[offset + 3] = reader.read(pos) & LONG_MASKS[n4Minus1]; + pos += 1 + n4Minus1; + return (int) (pos - posStart); + } + + /** + * Faster implementation of read single group, It read values from the buffer that would not cross + * 
boundaries. + * + * @param in the input to use to read data. + * @param remaining the number of remaining bytes allowed to read for current block/segment. + * @param reader the supplier of read int. + * @param pos the start pos to read from the reader. + * @param dst the array to read ints into. + * @param offset the offset in the array to start storing ints. + * @return the number of bytes read excluding the flag. this indicates the number of positions + * should to be increased for caller, it is 0 or positive number and less than {@link + * #MAX_LENGTH_PER_GROUP} + */ + public static int readGroupVInt( + DataInput in, long remaining, IntReader reader, long pos, int[] dst, int offset) + throws IOException { + if (remaining < MAX_LENGTH_PER_GROUP) { + readGroupVInt(in, dst, offset); + return 0; + } + final int flag = in.readByte() & 0xFF; + final long posStart = ++pos; // exclude the flag bytes, the position has updated via readByte(). + final int n1Minus1 = flag >> 6; + final int n2Minus1 = (flag >> 4) & 0x03; + final int n3Minus1 = (flag >> 2) & 0x03; + final int n4Minus1 = flag & 0x03; + + // This code path has fewer conditionals and tends to be significantly faster in benchmarks + dst[offset] = reader.read(pos) & INT_MASKS[n1Minus1]; + pos += 1 + n1Minus1; + dst[offset + 1] = reader.read(pos) & INT_MASKS[n2Minus1]; + pos += 1 + n2Minus1; + dst[offset + 2] = reader.read(pos) & INT_MASKS[n3Minus1]; + pos += 1 + n3Minus1; + dst[offset + 3] = reader.read(pos) & INT_MASKS[n4Minus1]; pos += 1 + n4Minus1; return (int) (pos - posStart); } @@ -161,4 +260,39 @@ public static void writeGroupVInts(DataOutput out, byte[] scratch, long[] values out.writeVInt(toInt(values[readPos])); } } + + /** + * The implementation for group-varint encoding, It uses a maximum of {@link + * #MAX_LENGTH_PER_GROUP} bytes scratch buffer. + */ + public static void writeGroupVInts(DataOutput out, byte[] scratch, int[] values, int limit) + throws IOException { + int readPos = 0; + + // encode each group + while ((limit - readPos) >= 4) { + int writePos = 0; + final int n1Minus1 = numBytes(values[readPos]) - 1; + final int n2Minus1 = numBytes(values[readPos + 1]) - 1; + final int n3Minus1 = numBytes(values[readPos + 2]) - 1; + final int n4Minus1 = numBytes(values[readPos + 3]) - 1; + int flag = (n1Minus1 << 6) | (n2Minus1 << 4) | (n3Minus1 << 2) | (n4Minus1); + scratch[writePos++] = (byte) flag; + BitUtil.VH_LE_INT.set(scratch, writePos, values[readPos++]); + writePos += n1Minus1 + 1; + BitUtil.VH_LE_INT.set(scratch, writePos, values[readPos++]); + writePos += n2Minus1 + 1; + BitUtil.VH_LE_INT.set(scratch, writePos, values[readPos++]); + writePos += n3Minus1 + 1; + BitUtil.VH_LE_INT.set(scratch, writePos, values[readPos++]); + writePos += n4Minus1 + 1; + + out.writeBytes(scratch, writePos); + } + + // tail vints + for (; readPos < limit; readPos++) { + out.writeVInt(values[readPos]); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java b/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java index eb4b93f499ef..d44cc7839233 100644 --- a/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java @@ -21,12 +21,7 @@ import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; -/** - * A doc id set based on sorted int array. 
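
Editorial aside: an illustrative round trip of the int[]-based group-varint helpers introduced above, DataOutput#writeGroupVInts(int[], int) and GroupVIntUtil#readGroupVInts(DataInput, int[], int). ByteBuffersDataOutput and ByteBuffersDataInput are existing classes used here only for convenience; the example class name is made up.

```java
import java.io.IOException;
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.GroupVIntUtil;

public class GroupVIntRoundTrip {
  public static void main(String[] args) throws IOException {
    int[] values = {3, 1_000, 70_000, 5, 42};

    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    // Encodes complete groups of four values, then the remaining tail values as plain vints.
    out.writeGroupVInts(values, values.length);

    int[] restored = new int[values.length];
    ByteBuffersDataInput in = out.toDataInput();
    GroupVIntUtil.readGroupVInts(in, restored, restored.length);
    // restored now holds the same five integers as values.
  }
}
```
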
- * - * @lucene.internal - */ -public final class IntArrayDocIdSet extends DocIdSet { +final class IntArrayDocIdSet extends DocIdSet { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(IntArrayDocIdSet.class); @@ -34,23 +29,15 @@ public final class IntArrayDocIdSet extends DocIdSet { private final int[] docs; private final int length; - /** - * Build an IntArrayDocIdSet by an int array and len. - * - * @param docs A docs array whose length need to be greater than the param len. It needs to be - * sorted from 0(inclusive) to the len(exclusive), and the len-th doc in docs need to be - * {@link DocIdSetIterator#NO_MORE_DOCS}. - * @param len The valid docs length in array. - */ - public IntArrayDocIdSet(int[] docs, int len) { - if (docs[len] != DocIdSetIterator.NO_MORE_DOCS) { + IntArrayDocIdSet(int[] docs, int length) { + if (docs[length] != DocIdSetIterator.NO_MORE_DOCS) { throw new IllegalArgumentException(); } - assert assertArraySorted(docs, len) - : "IntArrayDocIdSet need docs to be sorted" - + Arrays.toString(ArrayUtil.copyOfSubArray(docs, 0, len)); this.docs = docs; - this.length = len; + assert assertArraySorted(docs, length) + : "IntArrayDocIdSet need docs to be sorted" + + Arrays.toString(ArrayUtil.copyOfSubArray(docs, 0, length)); + this.length = length; } private static boolean assertArraySorted(int[] docs, int length) { diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 5dcc3a6ee53d..b95ba73f449c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -62,18 +62,16 @@ public class QueryBuilder { protected boolean enableGraphQueries = true; protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false; - /** Wraps a term and boost */ - public static class TermAndBoost { - /** the term */ - public final BytesRef term; - - /** the boost */ - public final float boost; - + /** + * Wraps a term and boost + * + * @param term the term + * @param boost the boost + */ + public record TermAndBoost(BytesRef term, float boost) { /** Creates a new TermAndBoost */ - public TermAndBoost(BytesRef term, float boost) { - this.term = BytesRef.deepCopyOf(term); - this.boost = boost; + public TermAndBoost { + term = BytesRef.deepCopyOf(term); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/StringHelper.java b/lucene/core/src/java/org/apache/lucene/util/StringHelper.java index 20a8cce420b4..5580f5fbe8e1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/StringHelper.java +++ b/lucene/core/src/java/org/apache/lucene/util/StringHelper.java @@ -180,10 +180,10 @@ public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) switch (len & 0x03) { case 3: k1 = (data[roundedEnd + 2] & 0xff) << 16; - // fallthrough + // fallthrough case 2: k1 |= (data[roundedEnd + 1] & 0xff) << 8; - // fallthrough + // fallthrough case 1: k1 |= (data[roundedEnd] & 0xff); k1 *= c1; @@ -209,6 +209,156 @@ public static int murmurhash3_x86_32(BytesRef bytes, int seed) { return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed); } + /** + * Generates 128-bit hash from the byte array with the given offset, length and seed. + * + *
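
Editorial aside: since TermAndBoost is now a record, callers read its components through the generated accessors instead of public fields, while the compact constructor still deep-copies the incoming term. A brief illustrative sketch; the example class name is made up.

```java
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;

public class TermAndBoostExample {
  public static void main(String[] args) {
    QueryBuilder.TermAndBoost tb = new QueryBuilder.TermAndBoost(new BytesRef("lucene"), 2.0f);
    BytesRef term = tb.term(); // previously the public field tb.term
    float boost = tb.boost(); // previously the public field tb.boost
    System.out.println(term.utf8ToString() + " boosted by " + boost);
  }
}
```
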

    The code is adopted from Apache Commons (link) + * + * @param data The input byte array + * @param offset The first element of array + * @param length The length of array + * @param seed The initial seed value + * @return The 128-bit hash (2 longs) + */ + public static long[] murmurhash3_x64_128( + final byte[] data, final int offset, final int length, final int seed) { + // Use an unsigned 32-bit integer as the seed + return murmurhash3_x64_128(data, offset, length, seed & 0xFFFFFFFFL); + } + + @SuppressWarnings("fallthrough") + private static long[] murmurhash3_x64_128( + final byte[] data, final int offset, final int length, final long seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // Constants for 128-bit variant + final long C1 = 0x87c37b91114253d5L; + final long C2 = 0x4cf5ad432745937fL; + final int R1 = 31; + final int R2 = 27; + final int R3 = 33; + final int M = 5; + final int N1 = 0x52dce729; + final int N2 = 0x38495ab5; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 4); + long k1 = (long) BitUtil.VH_LE_LONG.get(data, index); + long k2 = (long) BitUtil.VH_LE_LONG.get(data, index + 8); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + final int index = offset + (nblocks << 4); + switch (length & 0x0F) { + case 15: + k2 ^= ((long) data[index + 14] & 0xff) << 48; + case 14: + k2 ^= ((long) data[index + 13] & 0xff) << 40; + case 13: + k2 ^= ((long) data[index + 12] & 0xff) << 32; + case 12: + k2 ^= ((long) data[index + 11] & 0xff) << 24; + case 11: + k2 ^= ((long) data[index + 10] & 0xff) << 16; + case 10: + k2 ^= ((long) data[index + 9] & 0xff) << 8; + case 9: + k2 ^= data[index + 8] & 0xff; + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= ((long) data[index + 7] & 0xff) << 56; + case 7: + k1 ^= ((long) data[index + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[index + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[index + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[index + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[index + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[index + 1] & 0xff) << 8; + case 1: + k1 ^= data[index] & 0xff; + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return new long[] {h1, h2}; + } + + /** + * Performs the final avalanche mix step of the 64-bit hash function. + * + * @param hash The current hash + * @return The final hash + */ + private static long fmix64(long hash) { + hash ^= (hash >>> 33); + hash *= 0xff51afd7ed558ccdL; + hash ^= (hash >>> 33); + hash *= 0xc4ceb9fe1a85ec53L; + hash ^= (hash >>> 33); + return hash; + } + + /** + * Generates 128-bit hash from the byte array with the given offset, length and seed. + * + *
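
Editorial aside: an illustrative use of the new 128-bit MurmurHash3 entry points added to StringHelper above. The example class name is hypothetical; the BytesRef overload uses the fixed 104729 seed shown below.

```java
import java.nio.charset.StandardCharsets;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

public class Murmur128Example {
  public static void main(String[] args) {
    byte[] data = "lucene".getBytes(StandardCharsets.UTF_8);

    long[] seeded = StringHelper.murmurhash3_x64_128(data, 0, data.length, 42);
    long[] viaBytesRef = StringHelper.murmurhash3_x64_128(new BytesRef(data));

    // Each result holds the two 64-bit halves of the 128-bit hash.
    System.out.printf(
        "seeded: %x %x, default seed: %x %x%n",
        seeded[0], seeded[1], viaBytesRef[0], viaBytesRef[1]);
  }
}
```
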

    The code is adopted from Apache Commons (link) + * + * @param data The input data + * @return The 128-bit hash (2 longs) + */ + public static long[] murmurhash3_x64_128(BytesRef data) { + return murmurhash3_x64_128(data.bytes, data.offset, data.length, 104729); + } + // Holds 128 bit unsigned value: private static BigInteger nextId; private static final BigInteger mask128; diff --git a/lucene/core/src/java/org/apache/lucene/util/TermAndVector.java b/lucene/core/src/java/org/apache/lucene/util/TermAndVector.java index 1ade19a19803..af544c591a3c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/TermAndVector.java +++ b/lucene/core/src/java/org/apache/lucene/util/TermAndVector.java @@ -24,37 +24,17 @@ * * @lucene.experimental */ -public class TermAndVector { - - private final BytesRef term; - private final float[] vector; - - public TermAndVector(BytesRef term, float[] vector) { - this.term = term; - this.vector = vector; - } - - public BytesRef getTerm() { - return this.term; - } - - public float[] getVector() { - return this.vector; - } +public record TermAndVector(BytesRef term, float[] vector) { public int size() { return vector.length; } - public void normalizeVector() { - float vectorLength = 0; - for (int i = 0; i < vector.length; i++) { - vectorLength += vector[i] * vector[i]; - } - vectorLength = (float) Math.sqrt(vectorLength); - for (int i = 0; i < vector.length; i++) { - vector[i] /= vectorLength; - } + /** Return a {@link TermAndVector} whose vector is normalized according to the L2 norm. */ + public TermAndVector normalizeVector() { + float[] vector = this.vector.clone(); + VectorUtil.l2normalize(vector); + return new TermAndVector(term, vector); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java index c3182fe1ed79..dcf6b0c26cbb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -511,8 +511,9 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus case 2 -> v = leadByte & 31; // 5 useful bits case 3 -> v = leadByte & 15; // 4 useful bits case 4 -> v = leadByte & 7; // 3 useful bits - default -> throw new IllegalArgumentException( - "Invalid UTF8 header byte: 0x" + Integer.toHexString(leadByte)); + default -> + throw new IllegalArgumentException( + "Invalid UTF8 header byte: 0x" + Integer.toHexString(leadByte)); } // TODO: this may read past utf8's limit. diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java index e1c3978cff30..250c65448703 100644 --- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java @@ -17,6 +17,7 @@ package org.apache.lucene.util; +import java.util.stream.IntStream; import org.apache.lucene.internal.vectorization.VectorUtilSupport; import org.apache.lucene.internal.vectorization.VectorizationProvider; @@ -307,4 +308,15 @@ public static float[] checkFinite(float[] v) { } return v; } + + /** + * Given an array {@code buffer} that is sorted between indexes {@code 0} inclusive and {@code to} + * exclusive, find the first array index whose value is greater than or equal to {@code target}. + * This index is guaranteed to be at least {@code from}. If there is no such array index, {@code + * to} is returned. 
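
Editorial aside: a small worked example of the findNextGEQ contract documented above (illustrative only; the example class name is made up).

```java
import org.apache.lucene.util.VectorUtil;

public class FindNextGEQExample {
  public static void main(String[] args) {
    int[] sorted = {2, 5, 5, 9, 14};

    // First index in [1, 5) whose value is >= 6: index 3 (value 9).
    int idx = VectorUtil.findNextGEQ(sorted, 6, 1, 5);

    // No value is >= 20, so the exclusive upper bound (5) is returned.
    int none = VectorUtil.findNextGEQ(sorted, 20, 0, 5);

    System.out.println(idx + " " + none); // prints "3 5"
  }
}
```
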
+ */ + public static int findNextGEQ(int[] buffer, int target, int from, int to) { + assert IntStream.range(0, to - 1).noneMatch(i -> buffer[i] > buffer[i + 1]); + return IMPL.findNextGEQ(buffer, target, from, to); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/Version.java b/lucene/core/src/java/org/apache/lucene/util/Version.java index 91eb4649efcc..e232f1ab6d27 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Version.java +++ b/lucene/core/src/java/org/apache/lucene/util/Version.java @@ -32,140 +32,23 @@ public final class Version { /** - * Match settings and bugs in Lucene's 9.0.0 release. - * - * @deprecated (9.1.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_0_0 = new Version(9, 0, 0); - - /** - * Match settings and bugs in Lucene's 9.1.0 release. - * - * @deprecated (9.2.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_1_0 = new Version(9, 1, 0); - - /** - * Match settings and bugs in Lucene's 9.2.0 release. - * - * @deprecated (9.3.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_2_0 = new Version(9, 2, 0); - - /** - * Match settings and bugs in Lucene's 9.3.0 release. - * - * @deprecated (9.4.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_3_0 = new Version(9, 3, 0); - - /** - * Match settings and bugs in Lucene's 9.4.0 release. - * - * @deprecated Use latest - */ - @Deprecated public static final Version LUCENE_9_4_0 = new Version(9, 4, 0); - - /** - * Match settings and bugs in Lucene's 9.4.1 release. - * - * @deprecated Use latest - * @deprecated (9.4.2) Use latest - */ - @Deprecated public static final Version LUCENE_9_4_1 = new Version(9, 4, 1); - - /** - * Match settings and bugs in Lucene's 9.4.2 release. - * - * @deprecated Use latest - */ - @Deprecated public static final Version LUCENE_9_4_2 = new Version(9, 4, 2); - - /** - * Match settings and bugs in Lucene's 9.5.0 release. - * - * @deprecated (9.6.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_5_0 = new Version(9, 5, 0); - - /** - * Match settings and bugs in Lucene's 9.6.0 release. - * - * @deprecated (9.7.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_6_0 = new Version(9, 6, 0); - - /** - * Match settings and bugs in Lucene's 9.7.0 release. - * - * @deprecated (9.8.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_7_0 = new Version(9, 7, 0); - - /** - * Match settings and bugs in Lucene's 9.8.0 release. - * - * @deprecated (9.9.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_8_0 = new Version(9, 8, 0); - - /** - * Match settings and bugs in Lucene's 9.9.0 release. - * - * @deprecated (9.9.1) Use latest - */ - @Deprecated public static final Version LUCENE_9_9_0 = new Version(9, 9, 0); - - /** - * Match settings and bugs in Lucene's 9.9.1 release. - * - * @deprecated (9.9.2) Use latest - */ - @Deprecated public static final Version LUCENE_9_9_1 = new Version(9, 9, 1); - - /** - * Match settings and bugs in Lucene's 9.9.2 release. - * - * @deprecated (9.10.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_9_2 = new Version(9, 9, 2); - - /** - * Match settings and bugs in Lucene's 9.10.0 release. - * - * @deprecated (9.11.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_10_0 = new Version(9, 10, 0); - - /** - * Match settings and bugs in Lucene's 9.11.0 release. 
- * - * @deprecated Use latest - * @deprecated (9.12.0) Use latest - * @deprecated (9.11.1) Use latest - */ - @Deprecated public static final Version LUCENE_9_11_0 = new Version(9, 11, 0); - - /** - * Match settings and bugs in Lucene's 9.11.1 release. - * * @deprecated Use latest */ - @Deprecated public static final Version LUCENE_9_11_1 = new Version(9, 11, 1); + @Deprecated public static final Version LUCENE_10_0_0 = new Version(10, 0, 0); /** - * Match settings and bugs in Lucene's 9.12.0 release. + * Match settings and bugs in Lucene's 10.1.0 release. * * @deprecated Use latest */ - @Deprecated public static final Version LUCENE_9_12_0 = new Version(9, 12, 0); + @Deprecated public static final Version LUCENE_10_1_0 = new Version(10, 1, 0); /** - * Match settings and bugs in Lucene's 10.0.0 release. + * Match settings and bugs in Lucene's 11.0.0 release. * *

    Use this to get the latest & greatest settings, bug fixes, etc, for Lucene. */ - public static final Version LUCENE_10_0_0 = new Version(10, 0, 0); + public static final Version LUCENE_11_0_0 = new Version(11, 0, 0); // To add a new version: // * Only add above this comment @@ -181,7 +64,7 @@ public final class Version { * re-test your entire application to ensure it behaves as expected, as some defaults may * have changed and may break functionality in your application. */ - public static final Version LATEST = LUCENE_10_0_0; + public static final Version LATEST = LUCENE_11_0_0; /** * Constant for backwards compatibility. diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java index 7ad9eedda6f3..34a184461911 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java @@ -339,6 +339,7 @@ public int getNumTransitions() { @Override public int getNumTransitions(int state) { assert state >= 0; + assert state < getNumStates(); int count = states[2 * state + 1]; if (count == -1) { return 0; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java index c10ff4f28de8..22ea583ee08c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java @@ -510,6 +510,7 @@ public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((runAutomaton == null) ? 0 : runAutomaton.hashCode()); + result = prime * result + ((nfaRunAutomaton == null) ? 0 : nfaRunAutomaton.hashCode()); result = prime * result + ((term == null) ? 0 : term.hashCode()); result = prime * result + ((type == null) ? 0 : type.hashCode()); return result; @@ -538,6 +539,7 @@ public long ramBytesUsed() { + RamUsageEstimator.sizeOfObject(automaton) + RamUsageEstimator.sizeOfObject(commonSuffixRef) + RamUsageEstimator.sizeOfObject(runAutomaton) + + RamUsageEstimator.sizeOfObject(nfaRunAutomaton) + RamUsageEstimator.sizeOfObject(term) + RamUsageEstimator.sizeOfObject(transition); } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/FiniteStringsIterator.java b/lucene/core/src/java/org/apache/lucene/util/automaton/FiniteStringsIterator.java index b9030c19cf03..e141b4b95cff 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/FiniteStringsIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/FiniteStringsIterator.java @@ -86,7 +86,7 @@ public FiniteStringsIterator(Automaton a, int startState, int endState) { this.emitEmptyString = a.isAccept(0); // Start iteration with node startState. 
- if (a.getNumTransitions(startState) > 0) { + if (a.getNumStates() > startState && a.getNumTransitions(startState) > 0) { pathStates.set(startState); nodes[0].resetState(a, startState); string.append(startState); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java index 9737098b37c2..7180e37d7051 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java @@ -21,7 +21,9 @@ import java.util.HashMap; import java.util.Map; import org.apache.lucene.internal.hppc.BitMixer; +import org.apache.lucene.util.Accountable; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; /** * A RunAutomaton that does not require DFA. It will lazily determinize on-demand, memorizing the @@ -31,13 +33,16 @@ * * @lucene.internal */ -public class NFARunAutomaton implements ByteRunnable, TransitionAccessor { +public class NFARunAutomaton implements ByteRunnable, TransitionAccessor, Accountable { /** state ordinal of "no such state" */ - public static final int MISSING = -1; + private static final int MISSING = -1; private static final int NOT_COMPUTED = -2; + private static final long BASE_RAM_BYTES = + RamUsageEstimator.shallowSizeOfInstance(NFARunAutomaton.class); + private final Automaton automaton; private final int[] points; private final Map dStateToOrd = new HashMap<>(); // could init lazily? @@ -229,7 +234,17 @@ public void getTransition(int state, int index, Transition t) { setTransitionAccordingly(t); } - private class DState { + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES + + RamUsageEstimator.sizeOfObject(automaton) + + RamUsageEstimator.sizeOfObject(points) + + RamUsageEstimator.sizeOfMap(dStateToOrd) + + RamUsageEstimator.sizeOfObject(dStates) + + RamUsageEstimator.sizeOfObject(classmap); + } + + private class DState implements Accountable { private final int[] nfaStates; // this field is lazily init'd when first time caller wants to add a new transition private int[] transitions; @@ -426,5 +441,17 @@ public boolean equals(Object o) { DState dState = (DState) o; return hashCode == dState.hashCode && Arrays.equals(nfaStates, dState.nfaStates); } + + @Override + public long ramBytesUsed() { + return RamUsageEstimator.alignObjectSize( + Integer.BYTES * 3 + + 1 + + Transition.BYTES_USED * 2 + + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + + RamUsageEstimator.NUM_BYTES_OBJECT_REF * 4L) + + RamUsageEstimator.sizeOfObject(nfaStates) + + RamUsageEstimator.sizeOfObject(transitions); + } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 2052b1c50bf5..7c2b164aa107 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -35,7 +35,6 @@ import java.util.BitSet; import java.util.Collection; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -160,6 +159,37 @@ public static Automaton concatenate(List l) { *
<p>
    Complexity: linear in number of states. */ public static Automaton optional(Automaton a) { + if (a.isAccept(0)) { + // If the initial state is accepted, then the empty string is already accepted. + return a; + } + + boolean hasTransitionsToInitialState = false; + Transition t = new Transition(); + outer: + for (int state = 0; state < a.getNumStates(); ++state) { + int count = a.initTransition(state, t); + for (int i = 0; i < count; ++i) { + a.getNextTransition(t); + if (t.dest == 0) { + hasTransitionsToInitialState = true; + break outer; + } + } + } + + if (hasTransitionsToInitialState == false) { + // If the automaton has no transition to the initial state, we can simply mark the initial + // state as accepted. + Automaton result = new Automaton(); + result.copy(a); + if (result.getNumStates() == 0) { + result.createState(); + } + result.setAccept(0, true); + return result; + } + Automaton result = new Automaton(); result.createState(); result.setAccept(0, true); @@ -182,30 +212,65 @@ public static Automaton repeat(Automaton a) { // Repeating the empty automata will still only accept the empty automata. return a; } + + if (a.isAccept(0) && a.getAcceptStates().cardinality() == 1) { + // If state 0 is the only accept state, then this automaton already repeats itself. + return a; + } + Automaton.Builder builder = new Automaton.Builder(); + // Create the initial state, which is accepted builder.createState(); builder.setAccept(0, true); - builder.copy(a); - Transition t = new Transition(); + + int[] stateMap = new int[a.getNumStates()]; + for (int state = 0; state < a.getNumStates(); ++state) { + if (a.isAccept(state) == false) { + stateMap[state] = builder.createState(); + } else if (a.getNumTransitions(state) == 0) { + // Accept states that have no transitions get merged into state 0. + stateMap[state] = 0; + } else { + int newState = builder.createState(); + stateMap[state] = newState; + builder.setAccept(newState, true); + } + } + + // Now copy the automaton while renumbering states. + for (int state = 0; state < a.getNumStates(); ++state) { + int src = stateMap[state]; + int count = a.initTransition(state, t); + for (int i = 0; i < count; i++) { + a.getNextTransition(t); + int dest = stateMap[t.dest]; + builder.addTransition(src, dest, t.min, t.max); + } + } + + // Now copy transitions of the initial state to our new initial state. int count = a.initTransition(0, t); for (int i = 0; i < count; i++) { a.getNextTransition(t); - builder.addTransition(0, t.dest + 1, t.min, t.max); + builder.addTransition(0, stateMap[t.dest], t.min, t.max); } - int numStates = a.getNumStates(); - for (int s = 0; s < numStates; s++) { - if (a.isAccept(s)) { + // Now copy transitions of the initial state to final states to make the automaton repeat + // itself. + for (int s = a.getAcceptStates().nextSetBit(0); + s != -1; + s = a.getAcceptStates().nextSetBit(s + 1)) { + if (stateMap[s] != 0) { count = a.initTransition(0, t); for (int i = 0; i < count; i++) { a.getNextTransition(t); - builder.addTransition(s + 1, t.dest + 1, t.min, t.max); + builder.addTransition(stateMap[s], stateMap[t.dest], t.min, t.max); } } } - return builder.finish(); + return removeDeadStates(builder.finish()); } /** @@ -374,17 +439,6 @@ public static Automaton intersection(Automaton a1, Automaton a2) { return removeDeadStates(c); } - /** - * Returns true if these two automata accept exactly the same language. This is a costly - * computation! Both automata must be determinized and have no dead states! 
- */ - public static boolean sameLanguage(Automaton a1, Automaton a2) { - if (a1 == a2) { - return true; - } - return subsetOf(a2, a1) && subsetOf(a1, a2); - } - // TODO: move to test-framework? /** * Returns true if this automaton has any states that cannot be reached from the initial state or @@ -417,73 +471,6 @@ public static boolean hasDeadStatesToAccept(Automaton a) { return reachableFromAccept.isEmpty() == false; } - /** - * Returns true if the language of a1 is a subset of the language of a2. - * Both automata must be determinized and must have no dead states. - * - *
<p>
    Complexity: quadratic in number of states. - */ - public static boolean subsetOf(Automaton a1, Automaton a2) { - if (a1.isDeterministic() == false) { - throw new IllegalArgumentException("a1 must be deterministic"); - } - if (a2.isDeterministic() == false) { - throw new IllegalArgumentException("a2 must be deterministic"); - } - assert hasDeadStatesFromInitial(a1) == false; - assert hasDeadStatesFromInitial(a2) == false; - if (a1.getNumStates() == 0) { - // Empty language is alwyas a subset of any other language - return true; - } else if (a2.getNumStates() == 0) { - return isEmpty(a1); - } - - // TODO: cutover to iterators instead - Transition[][] transitions1 = a1.getSortedTransitions(); - Transition[][] transitions2 = a2.getSortedTransitions(); - ArrayDeque worklist = new ArrayDeque<>(); - HashSet visited = new HashSet<>(); - StatePair p = new StatePair(0, 0); - worklist.add(p); - visited.add(p); - while (worklist.size() > 0) { - p = worklist.removeFirst(); - if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { - return false; - } - Transition[] t1 = transitions1[p.s1]; - Transition[] t2 = transitions2[p.s2]; - for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { - while (b2 < t2.length && t2[b2].max < t1[n1].min) { - b2++; - } - int min1 = t1[n1].min, max1 = t1[n1].max; - - for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { - if (t2[n2].min > min1) { - return false; - } - if (t2[n2].max < Character.MAX_CODE_POINT) { - min1 = t2[n2].max + 1; - } else { - min1 = Character.MAX_CODE_POINT; - max1 = Character.MIN_CODE_POINT; - } - StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); - if (!visited.contains(q)) { - worklist.add(q); - visited.add(q); - } - } - if (min1 <= max1) { - return false; - } - } - } - return true; - } - /** * Returns an automaton that accepts the union of the languages of the given automata. * @@ -857,22 +844,48 @@ public static boolean isEmpty(Automaton a) { return true; } - /** Returns true if the given automaton accepts all strings. The automaton must be minimized. */ + /** + * Returns true if the given automaton accepts all strings. + * + *
<p>
    The automaton must be deterministic, or this method may return false. + * + *
<p>
    Complexity: linear in number of states and transitions. + */ public static boolean isTotal(Automaton a) { return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); } /** * Returns true if the given automaton accepts all strings for the specified min/max range of the - * alphabet. The automaton must be minimized. + * alphabet. + * + *
<p>
    The automaton must be deterministic, or this method may return false. + * + *
<p>
    Complexity: linear in number of states and transitions. */ public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) { - if (a.isAccept(0) && a.getNumTransitions(0) == 1) { - Transition t = new Transition(); - a.getTransition(0, 0, t); - return t.dest == 0 && t.min == minAlphabet && t.max == maxAlphabet; + BitSet states = getLiveStates(a); + Transition spare = new Transition(); + int seenStates = 0; + for (int state = states.nextSetBit(0); state >= 0; state = states.nextSetBit(state + 1)) { + // all reachable states must be accept states + if (a.isAccept(state) == false) return false; + // all reachable states must contain transitions covering minAlphabet-maxAlphabet + int previousLabel = minAlphabet - 1; + for (int transition = 0; transition < a.getNumTransitions(state); transition++) { + a.getTransition(state, transition, spare); + // no gaps are allowed + if (spare.min > previousLabel + 1) return false; + previousLabel = spare.max; + } + if (previousLabel < maxAlphabet) return false; + if (state == Integer.MAX_VALUE) { + break; // or (state+1) would overflow + } + seenStates++; } - return false; + // we've checked all the states, automaton is either total or empty + return seenStates > 0; } /** @@ -1004,6 +1017,9 @@ private static BitSet getLiveStatesToAccept(Automaton a) { public static Automaton removeDeadStates(Automaton a) { int numStates = a.getNumStates(); BitSet liveSet = getLiveStates(a); + if (liveSet.cardinality() == numStates) { + return a; + } int[] map = new int[numStates]; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java index 0d17a6fcab47..92bfe41b462b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -368,7 +368,7 @@ public enum Kind { REGEXP_REPEAT_MIN, /** An expression that repeats a minimum and maximum number of times */ REGEXP_REPEAT_MINMAX, - /** The complement of an expression */ + /** The complement of a character class */ REGEXP_COMPLEMENT, /** A Character */ REGEXP_CHAR, @@ -387,7 +387,14 @@ public enum Kind { /** An Interval expression */ REGEXP_INTERVAL, /** An expression for a pre-defined class e.g. \w */ - REGEXP_PRE_CLASS + REGEXP_PRE_CLASS, + /** + * The complement of an expression. + * + * @deprecated Will be removed in Lucene 11 + */ + @Deprecated + REGEXP_DEPRECATED_COMPLEMENT } // ----- Syntax flags ( <= 0xff ) ------ @@ -412,11 +419,23 @@ public enum Kind { /** Syntax flag, enables no optional regexp syntax. */ public static final int NONE = 0x0000; - // ----- Matching flags ( > 0xff ) ------ + // ----- Matching flags ( > 0xff <= 0xffff ) ------ /** Allows case insensitive matching of ASCII characters. */ public static final int ASCII_CASE_INSENSITIVE = 0x0100; + // ----- Deprecated flags ( > 0xffff ) ------ + + /** + * Allows regexp parsing of the complement (~). + * + *
<p>
    Note that processing the complement can require exponential time, but will be bounded by an + * internal limit. Regexes exceeding the limit will fail with TooComplexToDeterminizeException. + * + * @deprecated This method will be removed in Lucene 11 + */ + @Deprecated public static final int DEPRECATED_COMPLEMENT = 0x10000; + // Immutable parsed state /** The type of expression */ public final Kind kind; @@ -471,7 +490,7 @@ public RegExp(String s, int syntax_flags) throws IllegalArgumentException { * @exception IllegalArgumentException if an error occurred while parsing the regular expression */ public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumentException { - if (syntax_flags > ALL) { + if ((syntax_flags & ~DEPRECATED_COMPLEMENT) > ALL) { throw new IllegalArgumentException("Illegal syntax flag"); } @@ -621,6 +640,12 @@ private Automaton toAutomaton( a = exp1.toAutomaton(automata, automaton_provider); a = Operations.complement(a, Integer.MAX_VALUE); break; + case REGEXP_DEPRECATED_COMPLEMENT: + // to ease transitions for users only, support arbitrary complement + // but bounded by DEFAULT_DETERMINIZE_WORK_LIMIT: must not be configurable. + a = exp1.toAutomaton(automata, automaton_provider); + a = Operations.complement(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + break; case REGEXP_CHAR: if (check(ASCII_CASE_INSENSITIVE)) { a = toCaseInsensitiveChar(c); @@ -768,6 +793,7 @@ void toStringBuilder(StringBuilder b) { b.append("){").append(min).append(",").append(max).append("}"); break; case REGEXP_COMPLEMENT: + case REGEXP_DEPRECATED_COMPLEMENT: b.append("~("); exp1.toStringBuilder(b); b.append(")"); @@ -817,7 +843,7 @@ public String toStringTree() { void toStringTree(StringBuilder b, String indent) { switch (kind) { - // binary + // binary case REGEXP_UNION: case REGEXP_CONCATENATION: case REGEXP_INTERSECTION: @@ -827,10 +853,11 @@ void toStringTree(StringBuilder b, String indent) { exp1.toStringTree(b, indent + " "); exp2.toStringTree(b, indent + " "); break; - // unary + // unary case REGEXP_OPTIONAL: case REGEXP_REPEAT: case REGEXP_COMPLEMENT: + case REGEXP_DEPRECATED_COMPLEMENT: b.append(indent); b.append(kind); b.append('\n'); @@ -935,6 +962,7 @@ void getIdentifiers(Set set) { case REGEXP_REPEAT_MIN: case REGEXP_REPEAT_MINMAX: case REGEXP_COMPLEMENT: + case REGEXP_DEPRECATED_COMPLEMENT: exp1.getIdentifiers(set); break; case REGEXP_AUTOMATON: @@ -1011,6 +1039,16 @@ static RegExp makeComplement(int flags, RegExp exp) { return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null); } + /** + * Creates node that will compute complement of arbitrary expression. 
+ * + * @deprecated Will be removed in Lucene 11 + */ + @Deprecated + static RegExp makeDeprecatedComplement(int flags, RegExp exp) { + return newContainerNode(flags, Kind.REGEXP_DEPRECATED_COMPLEMENT, exp, null); + } + static RegExp makeChar(int flags, int c) { return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0); } @@ -1140,7 +1178,9 @@ else if (match('{')) { } final RegExp parseComplExp() throws IllegalArgumentException { - return parseCharClassExp(); + if (check(DEPRECATED_COMPLEMENT) && match('~')) + return makeDeprecatedComplement(flags, parseComplExp()); + else return parseCharClassExp(); } final RegExp parseCharClassExp() throws IllegalArgumentException { diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java index ad1be724f9cd..bd003507f2de 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java @@ -35,9 +35,14 @@ * @lucene.experimental */ public class StatePair { + // only mike knows what it does (do not expose) int s; - int s1; - int s2; + + /** first state */ + public final int s1; + + /** second state */ + public final int s2; StatePair(int s, int s1, int s2) { this.s = s; @@ -81,7 +86,7 @@ public boolean equals(Object obj) { @Override public int hashCode() { // Don't use s1 ^ s2 since it's vulnerable to the case where s1 == s2 always --> hashCode = 0, - // e.g. if you call Operations.sameLanguage, + // e.g. if you call AutomatonTestUtil.sameLanguage, // passing the same automaton against itself: return s1 * 31 + s2; } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java index a9e60b317607..c822a5028ead 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java @@ -16,12 +16,18 @@ */ package org.apache.lucene.util.automaton; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.RamUsageEstimator; + /** * Holds one transition from an {@link Automaton}. This is typically used temporarily when iterating * through transitions by invoking {@link Automaton#initTransition} and {@link * Automaton#getNextTransition}. */ -public class Transition { +public class Transition implements Accountable { + + /** static estimation of bytes used */ + public static final long BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Transition.class); /** Sole constructor. */ public Transition() {} @@ -48,4 +54,9 @@ public Transition() {} public String toString() { return source + " --> " + dest + " " + (char) min + "-" + (char) max; } + + @Override + public long ramBytesUsed() { + return BYTES_USED; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDConfig.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDConfig.java index 10fb96592ffa..c2f4a1dfc47c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDConfig.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDConfig.java @@ -19,8 +19,15 @@ import org.apache.lucene.util.ArrayUtil; -/** Basic parameters for indexing points on the BKD tree. */ -public final class BKDConfig { +/** + * Basic parameters for indexing points on the BKD tree. 
+ * + * @param numDims How many dimensions we are storing at the leaf (data) node + * @param numIndexDims How many dimensions we are indexing in the internal nodes + * @param bytesPerDim How many bytes each value in each dimension takes. + * @param maxPointsInLeafNode max points allowed on a Leaf block + */ +public record BKDConfig(int numDims, int numIndexDims, int bytesPerDim, int maxPointsInLeafNode) { /** Default maximum number of point in each leaf block */ public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 512; @@ -31,48 +38,7 @@ public final class BKDConfig { /** Maximum number of index dimensions */ public static final int MAX_INDEX_DIMS = 8; - /** How many dimensions we are storing at the leaf (data) nodes */ - public final int numDims; - - /** How many dimensions we are indexing in the internal nodes */ - public final int numIndexDims; - - /** How many bytes each value in each dimension takes. */ - public final int bytesPerDim; - - /** max points allowed on a Leaf block */ - public final int maxPointsInLeafNode; - - /** numDataDims * bytesPerDim */ - public final int packedBytesLength; - - /** numIndexDims * bytesPerDim */ - public final int packedIndexBytesLength; - - /** packedBytesLength plus docID size */ - public final int bytesPerDoc; - - public BKDConfig( - final int numDims, - final int numIndexDims, - final int bytesPerDim, - final int maxPointsInLeafNode) { - verifyParams(numDims, numIndexDims, bytesPerDim, maxPointsInLeafNode); - this.numDims = numDims; - this.numIndexDims = numIndexDims; - this.bytesPerDim = bytesPerDim; - this.maxPointsInLeafNode = maxPointsInLeafNode; - this.packedIndexBytesLength = numIndexDims * bytesPerDim; - this.packedBytesLength = numDims * bytesPerDim; - // dimensional values (numDims * bytesPerDim) + docID (int) - this.bytesPerDoc = this.packedBytesLength + Integer.BYTES; - } - - private static void verifyParams( - final int numDims, - final int numIndexDims, - final int bytesPerDim, - final int maxPointsInLeafNode) { + public BKDConfig { // Check inputs are on bounds if (numDims < 1 || numDims > MAX_DIMS) { throw new IllegalArgumentException( @@ -101,4 +67,19 @@ private static void verifyParams( + maxPointsInLeafNode); } } + + /** numDims * bytesPerDim */ + public int packedBytesLength() { + return numDims * bytesPerDim; + } + + /** numIndexDims * bytesPerDim */ + public int packedIndexBytesLength() { + return numIndexDims * bytesPerDim; + } + + /** (numDims * bytesPerDim) + Integer.BYTES (packedBytesLength plus docID size) */ + public int bytesPerDoc() { + return packedBytesLength() + Integer.BYTES; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java index 62586641e2cf..7d0bb3c6fd51 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java @@ -39,7 +39,7 @@ public final class BKDRadixSelector { private static final int MAX_SIZE_OFFLINE_BUFFER = 1024 * 8; // histogram array private final long[] histogram; - // number of bytes to be sorted: config.bytesPerDim + Integer.BYTES + // number of bytes to be sorted: config.bytesPerDim() + Integer.BYTES private final int bytesSorted; // flag to when we are moving to sort on heap private final int maxPointsSortInHeap; @@ -69,11 +69,11 @@ public BKDRadixSelector( // equal // we tie-break on the docID. Here we account for all bytes used in the process. 
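For orientation, not part of the diff: since BKDConfig is now a record, the byte sizes BKDRadixSelector works with here come from derived accessors rather than precomputed fields. A small sketch of that arithmetic under assumed, illustrative parameters (2 stored dimensions, 1 indexed dimension, 4-byte values):

import org.apache.lucene.util.bkd.BKDConfig;

public class BKDConfigSizes {
  public static void main(String[] args) {
    // Record components: numDims, numIndexDims, bytesPerDim, maxPointsInLeafNode.
    BKDConfig config = new BKDConfig(2, 1, 4, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
    int packedBytes = config.packedBytesLength();           // numDims * bytesPerDim = 8
    int packedIndexBytes = config.packedIndexBytesLength(); // numIndexDims * bytesPerDim = 4
    int bytesPerDoc = config.bytesPerDoc();                 // packedBytesLength() + Integer.BYTES = 12
    System.out.println(packedBytes + " / " + packedIndexBytes + " / " + bytesPerDoc);
  }
}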
this.bytesSorted = - config.bytesPerDim - + (config.numDims - config.numIndexDims) * config.bytesPerDim + config.bytesPerDim() + + (config.numDims() - config.numIndexDims()) * config.bytesPerDim() + Integer.BYTES; - final int numberOfPointsOffline = MAX_SIZE_OFFLINE_BUFFER / config.bytesPerDoc; - this.offlineBuffer = new byte[numberOfPointsOffline * config.bytesPerDoc]; + final int numberOfPointsOffline = MAX_SIZE_OFFLINE_BUFFER / config.bytesPerDoc(); + this.offlineBuffer = new byte[numberOfPointsOffline * config.bytesPerDoc()]; this.partitionBucket = new int[bytesSorted]; this.histogram = new long[HISTOGRAM_SIZE]; this.scratch = new byte[bytesSorted]; @@ -147,7 +147,7 @@ private int findCommonPrefixAndHistogram( throws IOException { // find common prefix int commonPrefixPosition = bytesSorted; - final int offset = dim * config.bytesPerDim; + final int offset = dim * config.bytesPerDim(); try (OfflinePointReader reader = points.getReader(from, to - from, offlineBuffer)) { assert commonPrefixPosition > dimCommonPrefix; reader.next(); @@ -155,14 +155,18 @@ private int findCommonPrefixAndHistogram( BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); // copy dimension System.arraycopy( - packedValueDocID.bytes, packedValueDocID.offset + offset, scratch, 0, config.bytesPerDim); + packedValueDocID.bytes, + packedValueDocID.offset + offset, + scratch, + 0, + config.bytesPerDim()); // copy data dimensions and docID System.arraycopy( packedValueDocID.bytes, - packedValueDocID.offset + config.packedIndexBytesLength, + packedValueDocID.offset + config.packedIndexBytesLength(), scratch, - config.bytesPerDim, - (config.numDims - config.numIndexDims) * config.bytesPerDim + Integer.BYTES); + config.bytesPerDim(), + (config.numDims() - config.numIndexDims()) * config.bytesPerDim() + Integer.BYTES); for (long i = from + 1; i < to; i++) { reader.next(); @@ -179,8 +183,8 @@ private int findCommonPrefixAndHistogram( break; } else { // Check common prefix and adjust histogram - final int startIndex = Math.min(dimCommonPrefix, config.bytesPerDim); - final int endIndex = Math.min(commonPrefixPosition, config.bytesPerDim); + final int startIndex = Math.min(dimCommonPrefix, config.bytesPerDim()); + final int endIndex = Math.min(commonPrefixPosition, config.bytesPerDim()); packedValueDocID = pointValue.packedValueDocIDBytes(); int j = Arrays.mismatch( @@ -191,20 +195,20 @@ private int findCommonPrefixAndHistogram( packedValueDocID.offset + offset + startIndex, packedValueDocID.offset + offset + endIndex); if (j == -1) { - if (commonPrefixPosition > config.bytesPerDim) { + if (commonPrefixPosition > config.bytesPerDim()) { // Tie-break on data dimensions + docID - final int startTieBreak = config.packedIndexBytesLength; - final int endTieBreak = startTieBreak + commonPrefixPosition - config.bytesPerDim; + final int startTieBreak = config.packedIndexBytesLength(); + final int endTieBreak = startTieBreak + commonPrefixPosition - config.bytesPerDim(); int k = Arrays.mismatch( scratch, - config.bytesPerDim, + config.bytesPerDim(), commonPrefixPosition, packedValueDocID.bytes, packedValueDocID.offset + startTieBreak, packedValueDocID.offset + endTieBreak); if (k != -1) { - commonPrefixPosition = config.bytesPerDim + k; + commonPrefixPosition = config.bytesPerDim() + k; Arrays.fill(histogram, 0); histogram[scratch[commonPrefixPosition] & 0xff] = i - from; } @@ -230,7 +234,7 @@ private int findCommonPrefixAndHistogram( private int getBucket(int offset, int commonPrefixPosition, PointValue pointValue) { int 
bucket; - if (commonPrefixPosition < config.bytesPerDim) { + if (commonPrefixPosition < config.bytesPerDim()) { BytesRef packedValue = pointValue.packedValue(); bucket = packedValue.bytes[packedValue.offset + offset + commonPrefixPosition] & 0xff; } else { @@ -239,9 +243,9 @@ private int getBucket(int offset, int commonPrefixPosition, PointValue pointValu packedValueDocID .bytes[ packedValueDocID.offset - + config.packedIndexBytesLength + + config.packedIndexBytesLength() + commonPrefixPosition - - config.bytesPerDim] + - config.bytesPerDim()] & 0xff; } return bucket; @@ -341,7 +345,7 @@ private void offlinePartition( long numDocsTiebreak) throws IOException { assert bytePosition == bytesSorted - 1 || deltaPoints != null; - int offset = dim * config.bytesPerDim; + int offset = dim * config.bytesPerDim(); long tiebreakCounter = 0; try (OfflinePointReader reader = points.getReader(from, to - from, offlineBuffer)) { while (reader.next()) { @@ -372,8 +376,8 @@ private void offlinePartition( } private byte[] partitionPointFromCommonPrefix() { - byte[] partition = new byte[config.bytesPerDim]; - for (int i = 0; i < config.bytesPerDim; i++) { + byte[] partition = new byte[config.bytesPerDim()]; + for (int i = 0; i < config.bytesPerDim(); i++) { partition[i] = (byte) partitionBucket[i]; } return partition; @@ -408,9 +412,9 @@ private byte[] heapRadixSelect( int to, int partitionPoint, int commonPrefixLength) { - final int dimOffset = dim * config.bytesPerDim + commonPrefixLength; - final int dimCmpBytes = config.bytesPerDim - commonPrefixLength; - final int dataOffset = config.packedIndexBytesLength - dimCmpBytes; + final int dimOffset = dim * config.bytesPerDim() + commonPrefixLength; + final int dimCmpBytes = config.bytesPerDim() - commonPrefixLength; + final int dataOffset = config.packedIndexBytesLength() - dimCmpBytes; new RadixSelector(bytesSorted - commonPrefixLength) { @Override @@ -427,7 +431,7 @@ protected int byteAt(int i, int k) { @Override protected Selector getFallbackSelector(int d) { final int skypedBytes = d + commonPrefixLength; - final int dimStart = dim * config.bytesPerDim; + final int dimStart = dim * config.bytesPerDim(); return new IntroSelector() { @Override @@ -437,15 +441,15 @@ protected void swap(int i, int j) { @Override protected void setPivot(int i) { - if (skypedBytes < config.bytesPerDim) { + if (skypedBytes < config.bytesPerDim()) { points.copyDim(i, dimStart, scratch, 0); } - points.copyDataDimsAndDoc(i, scratch, config.bytesPerDim); + points.copyDataDimsAndDoc(i, scratch, config.bytesPerDim()); } @Override protected int compare(int i, int j) { - if (skypedBytes < config.bytesPerDim) { + if (skypedBytes < config.bytesPerDim()) { int cmp = points.compareDim(i, j, dimStart); if (cmp != 0) { return cmp; @@ -456,36 +460,36 @@ protected int compare(int i, int j) { @Override protected int comparePivot(int j) { - if (skypedBytes < config.bytesPerDim) { + if (skypedBytes < config.bytesPerDim()) { int cmp = points.compareDim(j, scratch, 0, dimStart); if (cmp != 0) { return cmp; } } - return points.compareDataDimsAndDoc(j, scratch, config.bytesPerDim); + return points.compareDataDimsAndDoc(j, scratch, config.bytesPerDim()); } }; } }.select(from, to, partitionPoint); - byte[] partition = new byte[config.bytesPerDim]; + byte[] partition = new byte[config.bytesPerDim()]; PointValue pointValue = points.getPackedValueSlice(partitionPoint); BytesRef packedValue = pointValue.packedValue(); System.arraycopy( packedValue.bytes, - packedValue.offset + dim * config.bytesPerDim, + 
packedValue.offset + dim * config.bytesPerDim(), partition, 0, - config.bytesPerDim); + config.bytesPerDim()); return partition; } /** Sort the heap writer by the specified dim. It is used to sort the leaves of the tree */ public void heapRadixSort( final HeapPointWriter points, int from, int to, int dim, int commonPrefixLength) { - final int dimOffset = dim * config.bytesPerDim + commonPrefixLength; - final int dimCmpBytes = config.bytesPerDim - commonPrefixLength; - final int dataOffset = config.packedIndexBytesLength - dimCmpBytes; + final int dimOffset = dim * config.bytesPerDim() + commonPrefixLength; + final int dimCmpBytes = config.bytesPerDim() - commonPrefixLength; + final int dataOffset = config.packedIndexBytesLength() - dimCmpBytes; new MSBRadixSorter(bytesSorted - commonPrefixLength) { @Override @@ -502,7 +506,7 @@ protected void swap(int i, int j) { @Override protected Sorter getFallbackSorter(int k) { final int skypedBytes = k + commonPrefixLength; - final int dimStart = dim * config.bytesPerDim; + final int dimStart = dim * config.bytesPerDim(); return new IntroSorter() { @Override @@ -512,15 +516,15 @@ protected void swap(int i, int j) { @Override protected void setPivot(int i) { - if (skypedBytes < config.bytesPerDim) { + if (skypedBytes < config.bytesPerDim()) { points.copyDim(i, dimStart, scratch, 0); } - points.copyDataDimsAndDoc(i, scratch, config.bytesPerDim); + points.copyDataDimsAndDoc(i, scratch, config.bytesPerDim()); } @Override protected int compare(int i, int j) { - if (skypedBytes < config.bytesPerDim) { + if (skypedBytes < config.bytesPerDim()) { final int cmp = points.compareDim(i, j, dimStart); if (cmp != 0) { return cmp; @@ -531,13 +535,13 @@ protected int compare(int i, int j) { @Override protected int comparePivot(int j) { - if (skypedBytes < config.bytesPerDim) { + if (skypedBytes < config.bytesPerDim()) { int cmp = points.compareDim(j, scratch, 0, dimStart); if (cmp != 0) { return cmp; } } - return points.compareDataDimsAndDoc(j, scratch, config.bytesPerDim); + return points.compareDataDimsAndDoc(j, scratch, config.bytesPerDim()); } }; } @@ -578,20 +582,5 @@ PointWriter getPointWriter(long count, String desc) throws IOException { } /** Sliced reference to points in an PointWriter. 
*/ - public static final class PathSlice { - public final PointWriter writer; - public final long start; - public final long count; - - public PathSlice(PointWriter writer, long start, long count) { - this.writer = writer; - this.start = start; - this.count = count; - } - - @Override - public String toString() { - return "PathSlice(start=" + start + " count=" + count + " writer=" + writer + ")"; - } - } + public record PathSlice(PointWriter writer, long start, long count) {} } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index b2b109769f53..0efcc2ef4650 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -72,16 +72,19 @@ public BKDReader(IndexInput metaIn, IndexInput indexIn, IndexInput dataIn) throw numLeaves = metaIn.readVInt(); assert numLeaves > 0; - minPackedValue = new byte[config.packedIndexBytesLength]; - maxPackedValue = new byte[config.packedIndexBytesLength]; + minPackedValue = new byte[config.packedIndexBytesLength()]; + maxPackedValue = new byte[config.packedIndexBytesLength()]; - metaIn.readBytes(minPackedValue, 0, config.packedIndexBytesLength); - metaIn.readBytes(maxPackedValue, 0, config.packedIndexBytesLength); + metaIn.readBytes(minPackedValue, 0, config.packedIndexBytesLength()); + metaIn.readBytes(maxPackedValue, 0, config.packedIndexBytesLength()); final ArrayUtil.ByteArrayComparator comparator = - ArrayUtil.getUnsignedComparator(config.bytesPerDim); - for (int dim = 0; dim < config.numIndexDims; dim++) { + ArrayUtil.getUnsignedComparator(config.bytesPerDim()); + for (int dim = 0; dim < config.numIndexDims(); dim++) { if (comparator.compare( - minPackedValue, dim * config.bytesPerDim, maxPackedValue, dim * config.bytesPerDim) + minPackedValue, + dim * config.bytesPerDim(), + maxPackedValue, + dim * config.bytesPerDim()) > 0) { throw new CorruptIndexException( "minPackedValue " @@ -118,7 +121,7 @@ private boolean isTreeBalanced() throws IOException { // since lucene 8.6 all trees are unbalanced. return false; } - if (config.numDims > 1) { + if (config.numDims() > 1) { // high dimensional tree in pre-8.6 indices are balanced. 
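A review-only sketch (not in the patch): the corruption check above compares packed minimum and maximum values per index dimension with an unsigned byte comparator. The same pattern stands alone as follows; array sizes and contents are invented for illustration.

import java.util.Arrays;
import org.apache.lucene.util.ArrayUtil;

public class PackedMinMaxCheck {
  public static void main(String[] args) {
    int bytesPerDim = 4;
    int numIndexDims = 2;
    byte[] minPackedValue = new byte[numIndexDims * bytesPerDim]; // all zeros
    byte[] maxPackedValue = new byte[numIndexDims * bytesPerDim];
    Arrays.fill(maxPackedValue, (byte) 0xFF); // unsigned maximum in every byte
    ArrayUtil.ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
    for (int dim = 0; dim < numIndexDims; dim++) {
      int offset = dim * bytesPerDim;
      // Bytes are compared as unsigned values, one dimension at a time.
      if (comparator.compare(minPackedValue, offset, maxPackedValue, offset) > 0) {
        throw new IllegalStateException("minPackedValue > maxPackedValue for dim " + dim);
      }
    }
    System.out.println("packed bounds are consistent");
  }
}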
assert 1 << MathUtil.log(numLeaves, 2) == numLeaves; return true; @@ -128,7 +131,7 @@ private boolean isTreeBalanced() throws IOException { return false; } // count of the last node for unbalanced trees - final int lastLeafNodePointCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode); + final int lastLeafNodePointCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode()); // navigate to last node PointTree pointTree = getPointTree(); do { @@ -244,11 +247,11 @@ private BKDPointTree( 1, minPackedValue, maxPackedValue, - new BKDReaderDocIDSetIterator(config.maxPointsInLeafNode), - new byte[config.packedBytesLength], - new byte[config.packedIndexBytesLength], - new byte[config.packedIndexBytesLength], - new int[config.numDims], + new BKDReaderDocIDSetIterator(config.maxPointsInLeafNode()), + new byte[config.packedBytesLength()], + new byte[config.packedIndexBytesLength()], + new byte[config.packedIndexBytesLength()], + new int[config.numDims()], isTreeBalanced); // read root node readNodeData(false); @@ -286,18 +289,18 @@ private BKDPointTree( int treeDepth = getTreeDepth(numLeaves); splitDimValueStack = new byte[treeDepth][]; splitValuesStack = new byte[treeDepth][]; - splitValuesStack[0] = new byte[config.packedIndexBytesLength]; + splitValuesStack[0] = new byte[config.packedIndexBytesLength()]; leafBlockFPStack = new long[treeDepth + 1]; readNodeDataPositions = new int[treeDepth + 1]; rightNodePositions = new int[treeDepth]; splitDimsPos = new int[treeDepth]; - negativeDeltas = new boolean[config.numIndexDims * treeDepth]; + negativeDeltas = new boolean[config.numIndexDims() * treeDepth]; // information about the unbalance of the tree so we can report the exact size below a node this.pointCount = pointCount; rightMostLeafNode = (1 << treeDepth - 1) - 1; - int lastLeafNodePointCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode); + int lastLeafNodePointCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode()); this.lastLeafNodePointCount = - lastLeafNodePointCount == 0 ? config.maxPointsInLeafNode : lastLeafNodePointCount; + lastLeafNodePointCount == 0 ? config.maxPointsInLeafNode() : lastLeafNodePointCount; // scratch objects, reused between clones so NN search are not creating those objects // in every clone. 
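One more aside for reviewers, outside the patch: the unbalanced-tree bookkeeping above boils down to two formulas, the leaf count rounded up and the size of the right-most leaf (the remainder of the division, or a full leaf when that remainder is zero). A quick worked example with made-up numbers:

public class LeafCountMath {
  public static void main(String[] args) {
    long pointCount = 1_000_003L;
    int maxPointsInLeafNode = 512; // BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE
    // Every leaf holds maxPointsInLeafNode points except possibly the right-most one.
    int numLeaves =
        Math.toIntExact((pointCount + maxPointsInLeafNode - 1) / maxPointsInLeafNode);
    int lastLeafNodePointCount = Math.toIntExact(pointCount % maxPointsInLeafNode);
    if (lastLeafNodePointCount == 0) {
      lastLeafNodePointCount = maxPointsInLeafNode; // the right-most leaf is exactly full
    }
    System.out.println(numLeaves + " leaves; right-most leaf holds " + lastLeafNodePointCount);
  }
}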
this.scratchIterator = scratchIterator; @@ -336,10 +339,10 @@ public PointTree clone() { index.splitValuesStack[index.level] = splitValuesStack[level].clone(); System.arraycopy( negativeDeltas, - level * config.numIndexDims, + level * config.numIndexDims(), index.negativeDeltas, - level * config.numIndexDims, - config.numIndexDims); + level * config.numIndexDims(), + config.numIndexDims()); index.splitDimsPos[level] = splitDimsPos[level]; } return index; @@ -375,25 +378,25 @@ private void resetNodeDataPosition() throws IOException { private void pushBoundsLeft() { final int splitDimPos = splitDimsPos[level]; if (splitDimValueStack[level] == null) { - splitDimValueStack[level] = new byte[config.bytesPerDim]; + splitDimValueStack[level] = new byte[config.bytesPerDim()]; } // save the dimension we are going to change System.arraycopy( - maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim); - assert ArrayUtil.getUnsignedComparator(config.bytesPerDim) + maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim()); + assert ArrayUtil.getUnsignedComparator(config.bytesPerDim()) .compare(maxPackedValue, splitDimPos, splitValuesStack[level], splitDimPos) >= 0 - : "config.bytesPerDim=" - + config.bytesPerDim + : "config.bytesPerDim()=" + + config.bytesPerDim() + " splitDimPos=" + splitDimsPos[level] - + " config.numIndexDims=" - + config.numIndexDims - + " config.numDims=" - + config.numDims; + + " config.numIndexDims()=" + + config.numIndexDims() + + " config.numDims()=" + + config.numDims(); // add the split dim value: System.arraycopy( - splitValuesStack[level], splitDimPos, maxPackedValue, splitDimPos, config.bytesPerDim); + splitValuesStack[level], splitDimPos, maxPackedValue, splitDimPos, config.bytesPerDim()); } private void pushLeft() throws IOException { @@ -408,21 +411,21 @@ private void pushBoundsRight() { assert splitDimValueStack[level] != null; // save the dimension we are going to change System.arraycopy( - minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim); - assert ArrayUtil.getUnsignedComparator(config.bytesPerDim) + minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim()); + assert ArrayUtil.getUnsignedComparator(config.bytesPerDim()) .compare(minPackedValue, splitDimPos, splitValuesStack[level], splitDimPos) <= 0 - : "config.bytesPerDim=" - + config.bytesPerDim + : "config.bytesPerDim()=" + + config.bytesPerDim() + " splitDimPos=" + splitDimsPos[level] - + " config.numIndexDims=" - + config.numIndexDims - + " config.numDims=" - + config.numDims; + + " config.numIndexDims()=" + + config.numIndexDims() + + " config.numDims()=" + + config.numDims(); // add the split dim value: System.arraycopy( - splitValuesStack[level], splitDimPos, minPackedValue, splitDimPos, config.bytesPerDim); + splitValuesStack[level], splitDimPos, minPackedValue, splitDimPos, config.bytesPerDim()); } private void pushRight() throws IOException { @@ -456,7 +459,7 @@ private void pop() { private void popBounds(byte[] packedValue) { // restore the split dimension System.arraycopy( - splitDimValueStack[level], 0, packedValue, splitDimsPos[level], config.bytesPerDim); + splitDimValueStack[level], 0, packedValue, splitDimsPos[level], config.bytesPerDim()); } @Override @@ -517,14 +520,14 @@ public long size() { } // size for an unbalanced tree. return rightMostLeafNode == this.rightMostLeafNode - ? 
(long) (numLeaves - 1) * config.maxPointsInLeafNode + lastLeafNodePointCount - : (long) numLeaves * config.maxPointsInLeafNode; + ? (long) (numLeaves - 1) * config.maxPointsInLeafNode() + lastLeafNodePointCount + : (long) numLeaves * config.maxPointsInLeafNode(); } private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) { // number of points that need to be distributed between leaves, one per leaf final int extraPoints = - Math.toIntExact(((long) config.maxPointsInLeafNode * this.leafNodeOffset) - pointCount); + Math.toIntExact(((long) config.maxPointsInLeafNode() * this.leafNodeOffset) - pointCount); assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset"; // offset where we stop adding one point to the leaves final int nodeOffset = leafNodeOffset - extraPoints; @@ -532,9 +535,9 @@ private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) { for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) { // offsetPosition provides which extra point will be added to this node if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) { - count += config.maxPointsInLeafNode; + count += config.maxPointsInLeafNode(); } else { - count += config.maxPointsInLeafNode - 1; + count += config.maxPointsInLeafNode() - 1; } } return count; @@ -664,12 +667,12 @@ private void readNodeData(boolean isLeft) throws IOException { if (isLeafNode() == false) { System.arraycopy( negativeDeltas, - (level - 1) * config.numIndexDims, + (level - 1) * config.numIndexDims(), negativeDeltas, - level * config.numIndexDims, - config.numIndexDims); + level * config.numIndexDims(), + config.numIndexDims()); negativeDeltas[ - level * config.numIndexDims + (splitDimsPos[level - 1] / config.bytesPerDim)] = + level * config.numIndexDims() + (splitDimsPos[level - 1] / config.bytesPerDim())] = isLeft; if (splitValuesStack[level] == null) { @@ -680,20 +683,20 @@ private void readNodeData(boolean isLeft) throws IOException { 0, splitValuesStack[level], 0, - config.packedIndexBytesLength); + config.packedIndexBytesLength()); } // read split dim, prefix, firstDiffByteDelta encoded as int: int code = innerNodes.readVInt(); - final int splitDim = code % config.numIndexDims; - splitDimsPos[level] = splitDim * config.bytesPerDim; - code /= config.numIndexDims; - final int prefix = code % (1 + config.bytesPerDim); - final int suffix = config.bytesPerDim - prefix; + final int splitDim = code % config.numIndexDims(); + splitDimsPos[level] = splitDim * config.bytesPerDim(); + code /= config.numIndexDims(); + final int prefix = code % (1 + config.bytesPerDim()); + final int suffix = config.bytesPerDim() - prefix; if (suffix > 0) { - int firstDiffByteDelta = code / (1 + config.bytesPerDim); - if (negativeDeltas[level * config.numIndexDims + splitDim]) { + int firstDiffByteDelta = code / (1 + config.bytesPerDim()); + if (negativeDeltas[level * config.numIndexDims() + splitDim]) { firstDiffByteDelta = -firstDiffByteDelta; } final int startPos = splitDimsPos[level] + prefix; @@ -737,13 +740,13 @@ private void visitDocValuesNoCardinality( PointValues.IntersectVisitor visitor) throws IOException { readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in); - if (config.numIndexDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) { + if (config.numIndexDims() != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) { byte[] minPackedValue = scratchMinIndexPackedValue; System.arraycopy( - scratchDataPackedValue, 0, 
minPackedValue, 0, config.packedIndexBytesLength); + scratchDataPackedValue, 0, minPackedValue, 0, config.packedIndexBytesLength()); byte[] maxPackedValue = scratchMaxIndexPackedValue; // Copy common prefixes before reading adjusted box - System.arraycopy(minPackedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength); + System.arraycopy(minPackedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength()); readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in); // The index gives us range of values for each dimension, but the actual range of values @@ -801,13 +804,13 @@ private void visitDocValuesWithCardinality( visitor.grow(count); visitUniqueRawDocValues(scratchDataPackedValue, scratchIterator, count, visitor); } else { - if (config.numIndexDims != 1) { + if (config.numIndexDims() != 1) { byte[] minPackedValue = scratchMinIndexPackedValue; System.arraycopy( - scratchDataPackedValue, 0, minPackedValue, 0, config.packedIndexBytesLength); + scratchDataPackedValue, 0, minPackedValue, 0, config.packedIndexBytesLength()); byte[] maxPackedValue = scratchMaxIndexPackedValue; // Copy common prefixes before reading adjusted box - System.arraycopy(minPackedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength); + System.arraycopy(minPackedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength()); readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in); // The index gives us range of values for each dimension, but the actual range of values @@ -853,12 +856,12 @@ private void visitDocValuesWithCardinality( private void readMinMax( int[] commonPrefixLengths, byte[] minPackedValue, byte[] maxPackedValue, IndexInput in) throws IOException { - for (int dim = 0; dim < config.numIndexDims; dim++) { + for (int dim = 0; dim < config.numIndexDims(); dim++) { int prefix = commonPrefixLengths[dim]; in.readBytes( - minPackedValue, dim * config.bytesPerDim + prefix, config.bytesPerDim - prefix); + minPackedValue, dim * config.bytesPerDim() + prefix, config.bytesPerDim() - prefix); in.readBytes( - maxPackedValue, dim * config.bytesPerDim + prefix, config.bytesPerDim - prefix); + maxPackedValue, dim * config.bytesPerDim() + prefix, config.bytesPerDim() - prefix); } } @@ -874,10 +877,12 @@ private void visitSparseRawDocValues( int i; for (i = 0; i < count; ) { int length = in.readVInt(); - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { int prefix = commonPrefixLengths[dim]; in.readBytes( - scratchPackedValue, dim * config.bytesPerDim + prefix, config.bytesPerDim - prefix); + scratchPackedValue, + dim * config.bytesPerDim() + prefix, + config.bytesPerDim() - prefix); } scratchIterator.reset(i, length); visitor.visit(scratchIterator, scratchPackedValue); @@ -912,17 +917,19 @@ private void visitCompressedDocValues( // the byte at `compressedByteOffset` is compressed using run-length compression, // other suffix bytes are stored verbatim final int compressedByteOffset = - compressedDim * config.bytesPerDim + commonPrefixLengths[compressedDim]; + compressedDim * config.bytesPerDim() + commonPrefixLengths[compressedDim]; commonPrefixLengths[compressedDim]++; int i; for (i = 0; i < count; ) { scratchPackedValue[compressedByteOffset] = in.readByte(); final int runLen = Byte.toUnsignedInt(in.readByte()); for (int j = 0; j < runLen; ++j) { - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { int prefix = commonPrefixLengths[dim]; in.readBytes( - scratchPackedValue, dim * 
config.bytesPerDim + prefix, config.bytesPerDim - prefix); + scratchPackedValue, + dim * config.bytesPerDim() + prefix, + config.bytesPerDim() - prefix); } visitor.visit(scratchIterator.docIDs[i + j], scratchPackedValue); } @@ -937,7 +944,7 @@ private void visitCompressedDocValues( private int readCompressedDim(IndexInput in) throws IOException { int compressedDim = in.readByte(); if (compressedDim < -2 - || compressedDim >= config.numDims + || compressedDim >= config.numDims() || (version < BKDWriter.VERSION_LOW_CARDINALITY_LEAVES && compressedDim == -2)) { throw new CorruptIndexException("Got compressedDim=" + compressedDim, in); } @@ -946,11 +953,11 @@ private int readCompressedDim(IndexInput in) throws IOException { private void readCommonPrefixes( int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException { - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { int prefix = in.readVInt(); commonPrefixLengths[dim] = prefix; if (prefix > 0) { - in.readBytes(scratchPackedValue, dim * config.bytesPerDim, prefix); + in.readBytes(scratchPackedValue, dim * config.bytesPerDim(), prefix); } // System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix); } @@ -974,17 +981,17 @@ public byte[] getMaxPackedValue() { @Override public int getNumDimensions() throws IOException { - return config.numDims; + return config.numDims(); } @Override public int getNumIndexDimensions() throws IOException { - return config.numIndexDims; + return config.numIndexDims(); } @Override public int getBytesPerDimension() throws IOException { - return config.bytesPerDim; + return config.bytesPerDim(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index 1f4a7b294ac9..2af93ef5ca04 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -60,19 +60,19 @@ /** * Recursively builds a block KD-tree to assign all incoming points in N-dim space to smaller and * smaller N-dim rectangles (cells) until the number of points in a given rectangle is <= - * config.maxPointsInLeafNode. The tree is partially balanced, which means the leaf nodes - * will have the requested config.maxPointsInLeafNode except one that might have less. - * Leaf nodes may straddle the two bottom levels of the binary tree. Values that fall exactly on a - * cell boundary may be in either cell. + * config.maxPointsInLeafNode(). The tree is partially balanced, which means the leaf nodes + * will have the requested config.maxPointsInLeafNode() except one that might have + * less. Leaf nodes may straddle the two bottom levels of the binary tree. Values that fall exactly + * on a cell boundary may be in either cell. * *
<p>
    The number of dimensions can be 1 to 8, but every byte[] value is fixed length. * *
<p>
    This consumes heap during writing: it allocates a Long[numLeaves], a - * byte[numLeaves*(1+config.bytesPerDim)] and then uses up to the specified {@code + * byte[numLeaves*(1+config.bytesPerDim())] and then uses up to the specified {@code * maxMBSortInHeap} heap space for writing. * - *
<p>
    NOTE: This can write at most Integer.MAX_VALUE * config.maxPointsInLeafNode - * / config.bytesPerDim total points. + *
<p>
    NOTE: This can write at most Integer.MAX_VALUE * config.maxPointsInLeafNode() + * / config.bytesPerDim() total points. * * @lucene.experimental */ @@ -150,31 +150,31 @@ public BKDWriter( this.maxDoc = maxDoc; this.config = config; - this.comparator = ArrayUtil.getUnsignedComparator(config.bytesPerDim); - this.equalsPredicate = BKDUtil.getEqualsPredicate(config.bytesPerDim); - this.commonPrefixComparator = BKDUtil.getPrefixLengthComparator(config.bytesPerDim); + this.comparator = ArrayUtil.getUnsignedComparator(config.bytesPerDim()); + this.equalsPredicate = BKDUtil.getEqualsPredicate(config.bytesPerDim()); + this.commonPrefixComparator = BKDUtil.getPrefixLengthComparator(config.bytesPerDim()); docsSeen = new FixedBitSet(maxDoc); - scratchDiff = new byte[config.bytesPerDim]; - scratch = new byte[config.packedBytesLength]; - commonPrefixLengths = new int[config.numDims]; + scratchDiff = new byte[config.bytesPerDim()]; + scratch = new byte[config.packedBytesLength()]; + commonPrefixLengths = new int[config.numDims()]; - minPackedValue = new byte[config.packedIndexBytesLength]; - maxPackedValue = new byte[config.packedIndexBytesLength]; + minPackedValue = new byte[config.packedIndexBytesLength()]; + maxPackedValue = new byte[config.packedIndexBytesLength()]; // Maximum number of points we hold in memory at any time - maxPointsSortInHeap = (int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc)); - docIdsWriter = new DocIdsWriter(config.maxPointsInLeafNode); + maxPointsSortInHeap = (int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc())); + docIdsWriter = new DocIdsWriter(config.maxPointsInLeafNode()); // Finally, we must be able to hold at least the leaf node in heap during build: - if (maxPointsSortInHeap < config.maxPointsInLeafNode) { + if (maxPointsSortInHeap < config.maxPointsInLeafNode()) { throw new IllegalArgumentException( "maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap + ", but this is less than maxPointsInLeafNode=" - + config.maxPointsInLeafNode + + config.maxPointsInLeafNode() + "; " + "either increase maxMBSortInHeap or decrease maxPointsInLeafNode"); } @@ -204,10 +204,10 @@ private void initPointWriter() throws IOException { } public void add(byte[] packedValue, int docID) throws IOException { - if (packedValue.length != config.packedBytesLength) { + if (packedValue.length != config.packedBytesLength()) { throw new IllegalArgumentException( "packedValue should be length=" - + config.packedBytesLength + + config.packedBytesLength() + " (got: " + packedValue.length + ")"); @@ -222,15 +222,15 @@ public void add(byte[] packedValue, int docID) throws IOException { } if (pointCount == 0) { initPointWriter(); - System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength); - System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength); + System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength()); + System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength()); } else { - for (int dim = 0; dim < config.numIndexDims; dim++) { - int offset = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numIndexDims(); dim++) { + int offset = dim * config.bytesPerDim(); if (comparator.compare(packedValue, offset, minPackedValue, offset) < 0) { - System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim); + System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim()); } else if 
(comparator.compare(packedValue, offset, maxPackedValue, offset) > 0) { - System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim); + System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim()); } } } @@ -431,7 +431,7 @@ public IORunnable writeField( String fieldName, MutablePointTree reader) throws IOException { - if (config.numDims == 1) { + if (config.numDims() == 1) { return writeField1Dim(metaOut, indexOut, dataOut, fieldName, reader); } else { return writeFieldNDims(metaOut, indexOut, dataOut, fieldName, reader); @@ -450,14 +450,14 @@ private void computePackedValueBounds( } values.getValue(from, scratch); System.arraycopy( - scratch.bytes, scratch.offset, minPackedValue, 0, config.packedIndexBytesLength); + scratch.bytes, scratch.offset, minPackedValue, 0, config.packedIndexBytesLength()); System.arraycopy( - scratch.bytes, scratch.offset, maxPackedValue, 0, config.packedIndexBytesLength); + scratch.bytes, scratch.offset, maxPackedValue, 0, config.packedIndexBytesLength()); for (int i = from + 1; i < to; ++i) { values.getValue(i, scratch); - for (int dim = 0; dim < config.numIndexDims; dim++) { - final int startOffset = dim * config.bytesPerDim; - final int endOffset = startOffset + config.bytesPerDim; + for (int dim = 0; dim < config.numIndexDims(); dim++) { + final int startOffset = dim * config.bytesPerDim(); + final int endOffset = startOffset + config.bytesPerDim(); if (Arrays.compareUnsigned( scratch.bytes, scratch.offset + startOffset, @@ -471,7 +471,7 @@ private void computePackedValueBounds( scratch.offset + startOffset, minPackedValue, startOffset, - config.bytesPerDim); + config.bytesPerDim()); } else if (Arrays.compareUnsigned( scratch.bytes, scratch.offset + startOffset, @@ -485,7 +485,7 @@ private void computePackedValueBounds( scratch.offset + startOffset, maxPackedValue, startOffset, - config.bytesPerDim); + config.bytesPerDim()); } } } @@ -519,12 +519,13 @@ private IORunnable writeFieldNDims( } final int numLeaves = - Math.toIntExact((pointCount + config.maxPointsInLeafNode - 1) / config.maxPointsInLeafNode); + Math.toIntExact( + (pointCount + config.maxPointsInLeafNode() - 1) / config.maxPointsInLeafNode()); final int numSplits = numLeaves - 1; checkMaxLeafNodeCount(numLeaves); - final byte[] splitPackedValues = new byte[Math.multiplyExact(numSplits, config.bytesPerDim)]; + final byte[] splitPackedValues = new byte[Math.multiplyExact(numSplits, config.bytesPerDim())]; final byte[] splitDimensionValues = new byte[numSplits]; final long[] leafBlockFPs = new long[numLeaves]; @@ -536,7 +537,7 @@ private IORunnable writeFieldNDims( } final long dataStartFP = dataOut.getFilePointer(); - final int[] parentSplits = new int[config.numIndexDims]; + final int[] parentSplits = new int[config.numIndexDims()]; build( 0, numLeaves, @@ -550,10 +551,10 @@ private IORunnable writeFieldNDims( splitPackedValues, splitDimensionValues, leafBlockFPs, - new int[config.maxPointsInLeafNode]); - assert Arrays.equals(parentSplits, new int[config.numIndexDims]); + new int[config.maxPointsInLeafNode()]); + assert Arrays.equals(parentSplits, new int[config.numIndexDims()]); - scratchBytesRef1.length = config.bytesPerDim; + scratchBytesRef1.length = config.bytesPerDim(); scratchBytesRef1.bytes = splitPackedValues; return makeWriter(metaOut, indexOut, splitDimensionValues, leafBlockFPs, dataStartFP); @@ -609,13 +610,13 @@ public IORunnable merge( throws IOException { assert docMaps == null || readers.size() == docMaps.size(); - BKDMergeQueue queue = 
new BKDMergeQueue(config.bytesPerDim, readers.size()); + BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim(), readers.size()); for (int i = 0; i < readers.size(); i++) { PointValues pointValues = readers.get(i); - assert pointValues.getNumDimensions() == config.numDims - && pointValues.getBytesPerDimension() == config.bytesPerDim - && pointValues.getNumIndexDimensions() == config.numIndexDims; + assert pointValues.getNumDimensions() == config.numDims() + && pointValues.getBytesPerDimension() == config.bytesPerDim() + && pointValues.getNumIndexDimensions() == config.numIndexDims(); MergeState.DocMap docMap; if (docMaps == null) { docMap = null; @@ -653,16 +654,16 @@ private class OneDimensionBKDWriter { final long dataStartFP; final LongArrayList leafBlockFPs = new LongArrayList(); final List leafBlockStartValues = new ArrayList<>(); - final byte[] leafValues = new byte[config.maxPointsInLeafNode * config.packedBytesLength]; - final int[] leafDocs = new int[config.maxPointsInLeafNode]; + final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()]; + final int[] leafDocs = new int[config.maxPointsInLeafNode()]; private long valueCount; private int leafCount; private int leafCardinality; OneDimensionBKDWriter(IndexOutput metaOut, IndexOutput indexOut, IndexOutput dataOut) { - if (config.numIndexDims != 1) { + if (config.numIndexDims() != 1) { throw new UnsupportedOperationException( - "config.numIndexDims must be 1 but got " + config.numIndexDims); + "config.numIndexDims() must be 1 but got " + config.numIndexDims()); } if (pointCount != 0) { throw new IllegalStateException("cannot mix add and merge"); @@ -681,7 +682,7 @@ private class OneDimensionBKDWriter { this.dataOut = dataOut; this.dataStartFP = dataOut.getFilePointer(); - lastPackedValue = new byte[config.packedBytesLength]; + lastPackedValue = new byte[config.packedBytesLength()]; } // for asserts @@ -693,7 +694,8 @@ assert valueInOrder( config, valueCount + leafCount, 0, lastPackedValue, packedValue, 0, docID, lastDocID); if (leafCount == 0 - || equalsPredicate.test(leafValues, (leafCount - 1) * config.bytesPerDim, packedValue, 0) + || equalsPredicate.test( + leafValues, (leafCount - 1) * config.bytesPerDim(), packedValue, 0) == false) { leafCardinality++; } @@ -701,8 +703,8 @@ assert valueInOrder( packedValue, 0, leafValues, - leafCount * config.packedBytesLength, - config.packedBytesLength); + leafCount * config.packedBytesLength(), + config.packedBytesLength()); leafDocs[leafCount] = docID; docsSeen.set(docID); leafCount++; @@ -716,7 +718,7 @@ assert valueInOrder( + " values"); } - if (leafCount == config.maxPointsInLeafNode) { + if (leafCount == config.maxPointsInLeafNode()) { // We write a block once we hit exactly the max count ... 
this is different from // when we write N > 1 dimensional points where we write between max/2 and max per leaf // block @@ -741,7 +743,7 @@ public IORunnable finish() throws IOException { pointCount = valueCount; - scratchBytesRef1.length = config.bytesPerDim; + scratchBytesRef1.length = config.bytesPerDim(); scratchBytesRef1.offset = 0; assert leafBlockStartValues.size() + 1 == leafBlockFPs.size(); BKDTreeLeafNodes leafNodes = @@ -768,28 +770,29 @@ public int numLeaves() { } }; return () -> { - writeIndex(metaOut, indexOut, config.maxPointsInLeafNode, leafNodes, dataStartFP); + writeIndex(metaOut, indexOut, config.maxPointsInLeafNode(), leafNodes, dataStartFP); }; } private void writeLeafBlock(int leafCardinality) throws IOException { assert leafCount != 0; if (valueCount == 0) { - System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength); + System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength()); } System.arraycopy( leafValues, - (leafCount - 1) * config.packedBytesLength, + (leafCount - 1) * config.packedBytesLength(), maxPackedValue, 0, - config.packedIndexBytesLength); + config.packedIndexBytesLength()); valueCount += leafCount; if (leafBlockFPs.size() > 0) { // Save the first (minimum) value in each leaf block except the first, to build the split // value index in the end: - leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength)); + leafBlockStartValues.add( + ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength())); } leafBlockFPs.add(dataOut.getFilePointer()); checkMaxLeafNodeCount(leafBlockFPs.size()); @@ -797,28 +800,28 @@ private void writeLeafBlock(int leafCardinality) throws IOException { // Find per-dim common prefix: commonPrefixLengths[0] = commonPrefixComparator.compare( - leafValues, 0, leafValues, (leafCount - 1) * config.packedBytesLength); + leafValues, 0, leafValues, (leafCount - 1) * config.packedBytesLength()); writeLeafBlockDocs(dataOut, leafDocs, 0, leafCount); writeCommonPrefixes(dataOut, commonPrefixLengths, leafValues); - scratchBytesRef1.length = config.packedBytesLength; + scratchBytesRef1.length = config.packedBytesLength(); scratchBytesRef1.bytes = leafValues; final IntFunction packedValues = i -> { - scratchBytesRef1.offset = config.packedBytesLength * i; + scratchBytesRef1.offset = config.packedBytesLength() * i; return scratchBytesRef1; }; assert valuesInOrderAndBounds( config, leafCount, 0, - ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength), + ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()), ArrayUtil.copyOfSubArray( leafValues, - (leafCount - 1) * config.packedBytesLength, - leafCount * config.packedBytesLength), + (leafCount - 1) * config.packedBytesLength(), + leafCount * config.packedBytesLength()), packedValues, leafDocs, 0); @@ -857,7 +860,7 @@ private void printPathSlice(String desc, PathSlice slice, int dim) throws IOExce int count = 0; while (r.next()) { byte[] v = r.packedValue(); - System.out.println(" " + count + ": " + new BytesRef(v, dim*config.bytesPerDim, config.bytesPerDim)); + System.out.println(" " + count + ": " + new BytesRef(v, dim*config.bytesPerDim(), config.bytesPerDim())); count++; if (count == slice.count) { break; @@ -868,10 +871,10 @@ private void printPathSlice(String desc, PathSlice slice, int dim) throws IOExce */ private void checkMaxLeafNodeCount(int numLeaves) { - if (config.bytesPerDim * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) { + if (config.bytesPerDim() * (long) numLeaves > 
ArrayUtil.MAX_ARRAY_LENGTH) { throw new IllegalStateException( - "too many nodes; increase config.maxPointsInLeafNode (currently " - + config.maxPointsInLeafNode + "too many nodes; increase config.maxPointsInLeafNode() (currently " + + config.maxPointsInLeafNode() + ") and reindex"); } } @@ -907,7 +910,8 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput pointWriter = null; final int numLeaves = - Math.toIntExact((pointCount + config.maxPointsInLeafNode - 1) / config.maxPointsInLeafNode); + Math.toIntExact( + (pointCount + config.maxPointsInLeafNode() - 1) / config.maxPointsInLeafNode()); final int numSplits = numLeaves - 1; checkMaxLeafNodeCount(numLeaves); @@ -918,7 +922,7 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput // Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each // recursion says which dim we split on. - byte[] splitPackedValues = new byte[Math.multiplyExact(numSplits, config.bytesPerDim)]; + byte[] splitPackedValues = new byte[Math.multiplyExact(numSplits, config.bytesPerDim())]; byte[] splitDimensionValues = new byte[numSplits]; // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. @@ -926,13 +930,13 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput long[] leafBlockFPs = new long[numLeaves]; // Make sure the math above "worked": - assert pointCount / numLeaves <= config.maxPointsInLeafNode + assert pointCount / numLeaves <= config.maxPointsInLeafNode() : "pointCount=" + pointCount + " numLeaves=" + numLeaves - + " config.maxPointsInLeafNode=" - + config.maxPointsInLeafNode; + + " config.maxPointsInLeafNode()=" + + config.maxPointsInLeafNode(); // We re-use the selector so we do not need to create an object every time. 
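For illustration only — the following sketch is not part of the patch and the numbers are made up: the ceiling division above is the usual integer trick for deriving the leaf count, and the split-value buffer is then sized off it.

    static int numLeaves(long pointCount, int maxPointsInLeafNode) {
      // ceil(pointCount / maxPointsInLeafNode) without going through floating point
      return Math.toIntExact((pointCount + maxPointsInLeafNode - 1) / maxPointsInLeafNode);
    }
    // e.g. numLeaves(1_000_001, 512) == 1954, so numSplits == 1953 and splitPackedValues
    // needs 1953 * bytesPerDim bytes: one packed split value per internal node of the tree.
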
BKDRadixSelector radixSelector = @@ -942,7 +946,7 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput boolean success = false; try { - final int[] parentSplits = new int[config.numIndexDims]; + final int[] parentSplits = new int[config.numIndexDims()]; build( 0, numLeaves, @@ -955,8 +959,8 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput splitPackedValues, splitDimensionValues, leafBlockFPs, - new int[config.maxPointsInLeafNode]); - assert Arrays.equals(parentSplits, new int[config.numIndexDims]); + new int[config.maxPointsInLeafNode()]); + assert Arrays.equals(parentSplits, new int[config.numIndexDims()]); // If no exception, we should have cleaned everything up: assert tempDir.getCreatedFiles().isEmpty(); @@ -971,7 +975,7 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput } scratchBytesRef1.bytes = splitPackedValues; - scratchBytesRef1.length = config.bytesPerDim; + scratchBytesRef1.length = config.bytesPerDim(); return makeWriter(metaOut, indexOut, splitDimensionValues, leafBlockFPs, dataStartFP); } @@ -990,7 +994,7 @@ public long getLeafLP(int index) { @Override public BytesRef getSplitValue(int index) { - scratchBytesRef1.offset = index * config.bytesPerDim; + scratchBytesRef1.offset = index * config.bytesPerDim(); return scratchBytesRef1; } @@ -1007,7 +1011,7 @@ public int numLeaves() { return () -> { // Write index: - writeIndex(metaOut, indexOut, config.maxPointsInLeafNode, leafNodes, dataStartFP); + writeIndex(metaOut, indexOut, config.maxPointsInLeafNode(), leafNodes, dataStartFP); }; } @@ -1021,7 +1025,7 @@ private byte[] packIndex(BKDTreeLeafNodes leafNodes) throws IOException { // This is the "file" we append the byte[] to: List blocks = new ArrayList<>(); - byte[] lastSplitValues = new byte[config.bytesPerDim * config.numIndexDims]; + byte[] lastSplitValues = new byte[config.bytesPerDim() * config.numIndexDims()]; // System.out.println("\npack index"); int totalSize = recursePackIndex( @@ -1030,7 +1034,7 @@ private byte[] packIndex(BKDTreeLeafNodes leafNodes) throws IOException { 0l, blocks, lastSplitValues, - new boolean[config.numIndexDims], + new boolean[config.numIndexDims()], false, 0, leafNodes.numLeaves()); @@ -1104,25 +1108,25 @@ private int recursePackIndex( int address = splitValue.offset; // System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " - // splitValue=" + new BytesRef(splitPackedValues, address, config.bytesPerDim)); + // splitValue=" + new BytesRef(splitPackedValues, address, config.bytesPerDim())); // find common prefix with last split value in this dim: int prefix = commonPrefixComparator.compare( - splitValue.bytes, address, lastSplitValues, splitDim * config.bytesPerDim); + splitValue.bytes, address, lastSplitValues, splitDim * config.bytesPerDim()); // System.out.println("writeNodeData nodeID=" + nodeID + " splitDim=" + splitDim + " numDims=" - // + numDims + " config.bytesPerDim=" + config.bytesPerDim + " prefix=" + prefix); + // + numDims + " config.bytesPerDim()=" + config.bytesPerDim() + " prefix=" + prefix); int firstDiffByteDelta; - if (prefix < config.bytesPerDim) { + if (prefix < config.bytesPerDim()) { // System.out.println(" delta byte cur=" + // Integer.toHexString(splitPackedValues[address+prefix]&0xFF) + " prev=" + - // Integer.toHexString(lastSplitValues[splitDim * config.bytesPerDim + prefix]&0xFF) + " + // Integer.toHexString(lastSplitValues[splitDim * config.bytesPerDim() + prefix]&0xFF) + " // negated?=" + 
negativeDeltas[splitDim]); firstDiffByteDelta = (splitValue.bytes[address + prefix] & 0xFF) - - (lastSplitValues[splitDim * config.bytesPerDim + prefix] & 0xFF); + - (lastSplitValues[splitDim * config.bytesPerDim() + prefix] & 0xFF); if (negativeDeltas[splitDim]) { firstDiffByteDelta = -firstDiffByteDelta; } @@ -1134,16 +1138,17 @@ private int recursePackIndex( // pack the prefix, splitDim and delta first diff byte into a single vInt: int code = - (firstDiffByteDelta * (1 + config.bytesPerDim) + prefix) * config.numIndexDims + splitDim; + (firstDiffByteDelta * (1 + config.bytesPerDim()) + prefix) * config.numIndexDims() + + splitDim; // System.out.println(" code=" + code); // System.out.println(" splitValue=" + new BytesRef(splitPackedValues, address, - // config.bytesPerDim)); + // config.bytesPerDim())); writeBuffer.writeVInt(code); // write the split value, prefix coded vs. our parent's split value: - int suffix = config.bytesPerDim - prefix; + int suffix = config.bytesPerDim() - prefix; byte[] savSplitValue = new byte[suffix]; if (suffix > 1) { writeBuffer.writeBytes(splitValue.bytes, address + prefix + 1, suffix - 1); @@ -1152,14 +1157,14 @@ private int recursePackIndex( byte[] cmp = lastSplitValues.clone(); System.arraycopy( - lastSplitValues, splitDim * config.bytesPerDim + prefix, savSplitValue, 0, suffix); + lastSplitValues, splitDim * config.bytesPerDim() + prefix, savSplitValue, 0, suffix); // copy our split value into lastSplitValues for our children to prefix-code against System.arraycopy( splitValue.bytes, address + prefix, lastSplitValues, - splitDim * config.bytesPerDim + prefix, + splitDim * config.bytesPerDim() + prefix, suffix); int numBytes = appendBlock(writeBuffer, blocks); @@ -1213,7 +1218,7 @@ private int recursePackIndex( // restore lastSplitValues to what caller originally passed us: System.arraycopy( - savSplitValue, 0, lastSplitValues, splitDim * config.bytesPerDim + prefix, suffix); + savSplitValue, 0, lastSplitValues, splitDim * config.bytesPerDim() + prefix, suffix); assert Arrays.equals(lastSplitValues, cmp); @@ -1241,15 +1246,15 @@ private void writeIndex( long dataStartFP) throws IOException { CodecUtil.writeHeader(metaOut, CODEC_NAME, VERSION_CURRENT); - metaOut.writeVInt(config.numDims); - metaOut.writeVInt(config.numIndexDims); + metaOut.writeVInt(config.numDims()); + metaOut.writeVInt(config.numIndexDims()); metaOut.writeVInt(countPerLeaf); - metaOut.writeVInt(config.bytesPerDim); + metaOut.writeVInt(config.bytesPerDim()); assert numLeaves > 0; metaOut.writeVInt(numLeaves); - metaOut.writeBytes(minPackedValue, 0, config.packedIndexBytesLength); - metaOut.writeBytes(maxPackedValue, 0, config.packedIndexBytesLength); + metaOut.writeBytes(minPackedValue, 0, config.packedIndexBytesLength()); + metaOut.writeBytes(maxPackedValue, 0, config.packedIndexBytesLength()); metaOut.writeVLong(pointCount); metaOut.writeVInt(docsSeen.cardinality()); @@ -1264,7 +1269,7 @@ private void writeIndex( private void writeLeafBlockDocs(DataOutput out, int[] docIDs, int start, int count) throws IOException { - assert count > 0 : "config.maxPointsInLeafNode=" + config.maxPointsInLeafNode; + assert count > 0 : "config.maxPointsInLeafNode()=" + config.maxPointsInLeafNode(); out.writeVInt(count); docIdsWriter.writeDocIds(docIDs, start, count, out); } @@ -1278,13 +1283,13 @@ private void writeLeafBlockPackedValues( int leafCardinality) throws IOException { int prefixLenSum = Arrays.stream(commonPrefixLengths).sum(); - if (prefixLenSum == config.packedBytesLength) { + if 
(prefixLenSum == config.packedBytesLength()) { // all values in this block are equal out.writeByte((byte) -1); } else { - assert commonPrefixLengths[sortedDim] < config.bytesPerDim; + assert commonPrefixLengths[sortedDim] < config.bytesPerDim(); // estimate if storing the values with cardinality is cheaper than storing all values. - int compressedByteOffset = sortedDim * config.bytesPerDim + commonPrefixLengths[sortedDim]; + int compressedByteOffset = sortedDim * config.bytesPerDim() + commonPrefixLengths[sortedDim]; int highCardinalityCost; int lowCardinalityCost; if (count == leafCardinality) { @@ -1303,9 +1308,9 @@ private void writeLeafBlockPackedValues( } // Add cost of runLen compression highCardinalityCost = - count * (config.packedBytesLength - prefixLenSum - 1) + 2 * numRunLens; + count * (config.packedBytesLength() - prefixLenSum - 1) + 2 * numRunLens; // +1 is the byte needed for storing the cardinality - lowCardinalityCost = leafCardinality * (config.packedBytesLength - prefixLenSum + 1); + lowCardinalityCost = leafCardinality * (config.packedBytesLength() - prefixLenSum + 1); } if (lowCardinalityCost <= highCardinalityCost) { out.writeByte((byte) -2); @@ -1321,38 +1326,38 @@ private void writeLeafBlockPackedValues( private void writeLowCardinalityLeafBlockPackedValues( DataOutput out, int[] commonPrefixLengths, int count, IntFunction packedValues) throws IOException { - if (config.numIndexDims != 1) { + if (config.numIndexDims() != 1) { writeActualBounds(out, commonPrefixLengths, count, packedValues); } BytesRef value = packedValues.apply(0); - System.arraycopy(value.bytes, value.offset, scratch, 0, config.packedBytesLength); + System.arraycopy(value.bytes, value.offset, scratch, 0, config.packedBytesLength()); int cardinality = 1; for (int i = 1; i < count; i++) { value = packedValues.apply(i); - for (int dim = 0; dim < config.numDims; dim++) { - final int start = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numDims(); dim++) { + final int start = dim * config.bytesPerDim(); if (equalsPredicate.test(value.bytes, value.offset + start, scratch, start) == false) { out.writeVInt(cardinality); - for (int j = 0; j < config.numDims; j++) { + for (int j = 0; j < config.numDims(); j++) { out.writeBytes( scratch, - j * config.bytesPerDim + commonPrefixLengths[j], - config.bytesPerDim - commonPrefixLengths[j]); + j * config.bytesPerDim() + commonPrefixLengths[j], + config.bytesPerDim() - commonPrefixLengths[j]); } - System.arraycopy(value.bytes, value.offset, scratch, 0, config.packedBytesLength); + System.arraycopy(value.bytes, value.offset, scratch, 0, config.packedBytesLength()); cardinality = 1; break; - } else if (dim == config.numDims - 1) { + } else if (dim == config.numDims() - 1) { cardinality++; } } } out.writeVInt(cardinality); - for (int i = 0; i < config.numDims; i++) { + for (int i = 0; i < config.numDims(); i++) { out.writeBytes( scratch, - i * config.bytesPerDim + commonPrefixLengths[i], - config.bytesPerDim - commonPrefixLengths[i]); + i * config.bytesPerDim() + commonPrefixLengths[i], + config.bytesPerDim() - commonPrefixLengths[i]); } } @@ -1364,7 +1369,7 @@ private void writeHighCardinalityLeafBlockPackedValues( IntFunction packedValues, int compressedByteOffset) throws IOException { - if (config.numIndexDims != 1) { + if (config.numIndexDims() != 1) { writeActualBounds(out, commonPrefixLengths, count, packedValues); } commonPrefixLengths[sortedDim]++; @@ -1385,13 +1390,13 @@ private void writeHighCardinalityLeafBlockPackedValues( private void 
writeActualBounds( DataOutput out, int[] commonPrefixLengths, int count, IntFunction packedValues) throws IOException { - for (int dim = 0; dim < config.numIndexDims; ++dim) { + for (int dim = 0; dim < config.numIndexDims(); ++dim) { int commonPrefixLength = commonPrefixLengths[dim]; - int suffixLength = config.bytesPerDim - commonPrefixLength; + int suffixLength = config.bytesPerDim() - commonPrefixLength; if (suffixLength > 0) { BytesRef[] minMax = computeMinMax( - count, packedValues, dim * config.bytesPerDim + commonPrefixLength, suffixLength); + count, packedValues, dim * config.bytesPerDim() + commonPrefixLength, suffixLength); BytesRef min = minMax[0]; BytesRef max = minMax[1]; out.writeBytes(min.bytes, min.offset, min.length); @@ -1446,12 +1451,14 @@ private void writeLeafBlockPackedValuesRange( throws IOException { for (int i = start; i < end; ++i) { BytesRef ref = packedValues.apply(i); - assert ref.length == config.packedBytesLength; + assert ref.length == config.packedBytesLength(); - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { int prefix = commonPrefixLengths[dim]; out.writeBytes( - ref.bytes, ref.offset + dim * config.bytesPerDim + prefix, config.bytesPerDim - prefix); + ref.bytes, + ref.offset + dim * config.bytesPerDim() + prefix, + config.bytesPerDim() - prefix); } } } @@ -1473,10 +1480,10 @@ private static int runLen( private void writeCommonPrefixes(DataOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException { - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { out.writeVInt(commonPrefixes[dim]); - // System.out.println(commonPrefixes[dim] + " of " + config.bytesPerDim); - out.writeBytes(packedValue, dim * config.bytesPerDim, commonPrefixes[dim]); + // System.out.println(commonPrefixes[dim] + " of " + config.bytesPerDim()); + out.writeBytes(packedValue, dim * config.bytesPerDim(), commonPrefixes[dim]); } } @@ -1535,8 +1542,8 @@ protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentSp for (int numSplits : parentSplits) { maxNumSplits = Math.max(maxNumSplits, numSplits); } - for (int dim = 0; dim < config.numIndexDims; ++dim) { - final int offset = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numIndexDims(); ++dim) { + final int offset = dim * config.bytesPerDim(); if (parentSplits[dim] < maxNumSplits / 2 && comparator.compare(minPackedValue, offset, maxPackedValue, offset) != 0) { return dim; @@ -1545,10 +1552,10 @@ protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentSp // Find which dim has the largest span so we can split on it: int splitDim = -1; - for (int dim = 0; dim < config.numIndexDims; dim++) { - NumericUtils.subtract(config.bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff); + for (int dim = 0; dim < config.numIndexDims(); dim++) { + NumericUtils.subtract(config.bytesPerDim(), dim, maxPackedValue, minPackedValue, scratchDiff); if (splitDim == -1 || comparator.compare(scratchDiff, 0, scratch, 0) > 0) { - System.arraycopy(scratchDiff, 0, scratch, 0, config.bytesPerDim); + System.arraycopy(scratchDiff, 0, scratch, 0, config.bytesPerDim()); splitDim = dim; } } @@ -1595,15 +1602,15 @@ private void build( if (numLeaves == 1) { // leaf node final int count = to - from; - assert count <= config.maxPointsInLeafNode; + assert count <= config.maxPointsInLeafNode(); // Compute common prefixes - Arrays.fill(commonPrefixLengths, config.bytesPerDim); + 
Arrays.fill(commonPrefixLengths, config.bytesPerDim()); reader.getValue(from, scratchBytesRef1); for (int i = from + 1; i < to; ++i) { reader.getValue(i, scratchBytesRef2); - for (int dim = 0; dim < config.numDims; dim++) { - final int offset = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numDims(); dim++) { + final int offset = dim * config.bytesPerDim(); int dimensionPrefixLength = commonPrefixLengths[dim]; commonPrefixLengths[dim] = Math.min( @@ -1617,23 +1624,23 @@ private void build( } // Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim] - FixedBitSet[] usedBytes = new FixedBitSet[config.numDims]; - for (int dim = 0; dim < config.numDims; ++dim) { - if (commonPrefixLengths[dim] < config.bytesPerDim) { + FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()]; + for (int dim = 0; dim < config.numDims(); ++dim) { + if (commonPrefixLengths[dim] < config.bytesPerDim()) { usedBytes[dim] = new FixedBitSet(256); } } for (int i = from + 1; i < to; ++i) { - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { if (usedBytes[dim] != null) { - byte b = reader.getByteAt(i, dim * config.bytesPerDim + commonPrefixLengths[dim]); + byte b = reader.getByteAt(i, dim * config.bytesPerDim() + commonPrefixLengths[dim]); usedBytes[dim].set(Byte.toUnsignedInt(b)); } } } int sortedDim = 0; int sortedDimCardinality = Integer.MAX_VALUE; - for (int dim = 0; dim < config.numDims; ++dim) { + for (int dim = 0; dim < config.numDims(); ++dim) { if (usedBytes[dim] != null) { final int cardinality = usedBytes[dim].cardinality(); if (cardinality < sortedDimCardinality) { @@ -1660,8 +1667,8 @@ private void build( int leafCardinality = 1; for (int i = from + 1; i < to; ++i) { reader.getValue(i, collector); - for (int dim = 0; dim < config.numDims; dim++) { - final int start = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numDims(); dim++) { + final int start = dim * config.bytesPerDim(); if (equalsPredicate.test( collector.bytes, collector.offset + start, @@ -1690,7 +1697,7 @@ private void build( // Write the common prefixes: reader.getValue(from, scratchBytesRef1); System.arraycopy( - scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch, 0, config.packedBytesLength); + scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch, 0, config.packedBytesLength()); writeCommonPrefixes(out, commonPrefixLengths, scratch); // Write the full values: @@ -1708,7 +1715,7 @@ assert valuesInOrderAndBounds( final int splitDim; // compute the split dimension and partition around it - if (config.numIndexDims == 1) { + if (config.numIndexDims() == 1) { splitDim = 0; } else { // for dimensions > 2 we recompute the bounds for the current inner node to help the @@ -1717,7 +1724,7 @@ assert valuesInOrderAndBounds( // bounds is given // by SPLITS_BEFORE_EXACT_BOUNDS. 
if (numLeaves != leafBlockFPs.length - && config.numIndexDims > 2 + && config.numIndexDims() > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) { computePackedValueBounds( reader, from, to, minPackedValue, maxPackedValue, scratchBytesRef1); @@ -1728,14 +1735,14 @@ assert valuesInOrderAndBounds( // How many leaves will be in the left tree: int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves); // How many points will be in the left tree: - final int mid = from + numLeftLeafNodes * config.maxPointsInLeafNode; + final int mid = from + numLeftLeafNodes * config.maxPointsInLeafNode(); final int commonPrefixLen = commonPrefixComparator.compare( minPackedValue, - splitDim * config.bytesPerDim, + splitDim * config.bytesPerDim(), maxPackedValue, - splitDim * config.bytesPerDim); + splitDim * config.bytesPerDim()); MutablePointTreeReaderUtils.partition( config, @@ -1752,32 +1759,32 @@ assert valuesInOrderAndBounds( final int rightOffset = leavesOffset + numLeftLeafNodes; final int splitOffset = rightOffset - 1; // set the split value - final int address = splitOffset * config.bytesPerDim; + final int address = splitOffset * config.bytesPerDim(); splitDimensionValues[splitOffset] = (byte) splitDim; reader.getValue(mid, scratchBytesRef1); System.arraycopy( scratchBytesRef1.bytes, - scratchBytesRef1.offset + splitDim * config.bytesPerDim, + scratchBytesRef1.offset + splitDim * config.bytesPerDim(), splitPackedValues, address, - config.bytesPerDim); + config.bytesPerDim()); byte[] minSplitPackedValue = - ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength); + ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength()); byte[] maxSplitPackedValue = - ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength); + ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength()); System.arraycopy( scratchBytesRef1.bytes, - scratchBytesRef1.offset + splitDim * config.bytesPerDim, + scratchBytesRef1.offset + splitDim * config.bytesPerDim(), minSplitPackedValue, - splitDim * config.bytesPerDim, - config.bytesPerDim); + splitDim * config.bytesPerDim(), + config.bytesPerDim()); System.arraycopy( scratchBytesRef1.bytes, - scratchBytesRef1.offset + splitDim * config.bytesPerDim, + scratchBytesRef1.offset + splitDim * config.bytesPerDim(), maxSplitPackedValue, - splitDim * config.bytesPerDim, - config.bytesPerDim); + splitDim * config.bytesPerDim(), + config.bytesPerDim()); // recurse parentSplits[splitDim]++; @@ -1816,17 +1823,19 @@ assert valuesInOrderAndBounds( private void computePackedValueBounds( BKDRadixSelector.PathSlice slice, byte[] minPackedValue, byte[] maxPackedValue) throws IOException { - try (PointReader reader = slice.writer.getReader(slice.start, slice.count)) { + try (PointReader reader = slice.writer().getReader(slice.start(), slice.count())) { if (reader.next() == false) { return; } BytesRef value = reader.pointValue().packedValue(); - System.arraycopy(value.bytes, value.offset, minPackedValue, 0, config.packedIndexBytesLength); - System.arraycopy(value.bytes, value.offset, maxPackedValue, 0, config.packedIndexBytesLength); + System.arraycopy( + value.bytes, value.offset, minPackedValue, 0, config.packedIndexBytesLength()); + System.arraycopy( + value.bytes, value.offset, maxPackedValue, 0, config.packedIndexBytesLength()); while (reader.next()) { value = reader.pointValue().packedValue(); - for (int dim = 0; dim < config.numIndexDims; dim++) { - final int startOffset = dim * config.bytesPerDim; + for 
(int dim = 0; dim < config.numIndexDims(); dim++) { + final int startOffset = dim * config.bytesPerDim(); if (comparator.compare( value.bytes, value.offset + startOffset, minPackedValue, startOffset) < 0) { @@ -1835,7 +1844,7 @@ private void computePackedValueBounds( value.offset + startOffset, minPackedValue, startOffset, - config.bytesPerDim); + config.bytesPerDim()); } else if (comparator.compare( value.bytes, value.offset + startOffset, maxPackedValue, startOffset) > 0) { @@ -1844,7 +1853,7 @@ private void computePackedValueBounds( value.offset + startOffset, maxPackedValue, startOffset, - config.bytesPerDim); + config.bytesPerDim()); } } } @@ -1878,32 +1887,32 @@ private void build( // least number of unique bytes at commonPrefixLengths[dim], which makes compression more // efficient HeapPointWriter heapSource; - if (points.writer instanceof HeapPointWriter == false) { + if (points.writer() instanceof HeapPointWriter == false) { // Adversarial cases can cause this, e.g. merging big segments with most of the points // deleted - heapSource = switchToHeap(points.writer); + heapSource = switchToHeap(points.writer()); } else { - heapSource = (HeapPointWriter) points.writer; + heapSource = (HeapPointWriter) points.writer(); } - int from = Math.toIntExact(points.start); - int to = Math.toIntExact(points.start + points.count); + int from = Math.toIntExact(points.start()); + int to = Math.toIntExact(points.start() + points.count()); // we store common prefix on scratch computeCommonPrefixLength(heapSource, scratch, from, to); int sortedDim = 0; int sortedDimCardinality = Integer.MAX_VALUE; - FixedBitSet[] usedBytes = new FixedBitSet[config.numDims]; - for (int dim = 0; dim < config.numDims; ++dim) { - if (commonPrefixLengths[dim] < config.bytesPerDim) { + FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()]; + for (int dim = 0; dim < config.numDims(); ++dim) { + if (commonPrefixLengths[dim] < config.bytesPerDim()) { usedBytes[dim] = new FixedBitSet(256); } } // Find the dimension to compress - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { int prefix = commonPrefixLengths[dim]; - if (prefix < config.bytesPerDim) { - int offset = dim * config.bytesPerDim; + if (prefix < config.bytesPerDim()) { + int offset = dim * config.bytesPerDim(); for (int i = from; i < to; ++i) { PointValue value = heapSource.getPackedValueSlice(i); BytesRef packedValue = value.packedValue(); @@ -1958,7 +1967,7 @@ assert valuesInOrderAndBounds( // Inner node: partition/recurse final int splitDim; - if (config.numIndexDims == 1) { + if (config.numIndexDims() == 1) { splitDim = 0; } else { // for dimensions > 2 we recompute the bounds for the current inner node to help the @@ -1967,7 +1976,7 @@ assert valuesInOrderAndBounds( // bounds is given // by SPLITS_BEFORE_EXACT_BOUNDS. 
if (numLeaves != leafBlockFPs.length - && config.numIndexDims > 2 + && config.numIndexDims() > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) { computePackedValueBounds(points, minPackedValue, maxPackedValue); } @@ -1980,24 +1989,24 @@ assert valuesInOrderAndBounds( // How many leaves will be in the left tree: final int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves); // How many points will be in the left tree: - final long leftCount = numLeftLeafNodes * (long) config.maxPointsInLeafNode; + final long leftCount = numLeftLeafNodes * (long) config.maxPointsInLeafNode(); BKDRadixSelector.PathSlice[] slices = new BKDRadixSelector.PathSlice[2]; final int commonPrefixLen = commonPrefixComparator.compare( minPackedValue, - splitDim * config.bytesPerDim, + splitDim * config.bytesPerDim(), maxPackedValue, - splitDim * config.bytesPerDim); + splitDim * config.bytesPerDim()); byte[] splitValue = radixSelector.select( points, slices, - points.start, - points.start + points.count, - points.start + leftCount, + points.start(), + points.start() + points.count(), + points.start() + leftCount, splitDim, commonPrefixLen); @@ -2005,19 +2014,27 @@ assert valuesInOrderAndBounds( final int splitValueOffset = rightOffset - 1; splitDimensionValues[splitValueOffset] = (byte) splitDim; - int address = splitValueOffset * config.bytesPerDim; - System.arraycopy(splitValue, 0, splitPackedValues, address, config.bytesPerDim); + int address = splitValueOffset * config.bytesPerDim(); + System.arraycopy(splitValue, 0, splitPackedValues, address, config.bytesPerDim()); - byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength]; - System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength); + byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()]; + System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength()); - byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength]; - System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength); + byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()]; + System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength()); System.arraycopy( - splitValue, 0, minSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim); + splitValue, + 0, + minSplitPackedValue, + splitDim * config.bytesPerDim(), + config.bytesPerDim()); System.arraycopy( - splitValue, 0, maxSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim); + splitValue, + 0, + maxSplitPackedValue, + splitDim * config.bytesPerDim(), + config.bytesPerDim()); parentSplits[splitDim]++; // Recurse on left tree: @@ -2056,30 +2073,30 @@ assert valuesInOrderAndBounds( private void computeCommonPrefixLength( HeapPointWriter heapPointWriter, byte[] commonPrefix, int from, int to) { - Arrays.fill(commonPrefixLengths, config.bytesPerDim); + Arrays.fill(commonPrefixLengths, config.bytesPerDim()); PointValue value = heapPointWriter.getPackedValueSlice(from); BytesRef packedValue = value.packedValue(); - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { System.arraycopy( packedValue.bytes, - packedValue.offset + dim * config.bytesPerDim, + packedValue.offset + dim * config.bytesPerDim(), commonPrefix, - dim * config.bytesPerDim, - config.bytesPerDim); + dim * config.bytesPerDim(), + config.bytesPerDim()); } for (int i = from + 1; i < to; i++) { value = 
heapPointWriter.getPackedValueSlice(i); packedValue = value.packedValue(); - for (int dim = 0; dim < config.numDims; dim++) { + for (int dim = 0; dim < config.numDims(); dim++) { if (commonPrefixLengths[dim] != 0) { commonPrefixLengths[dim] = Math.min( commonPrefixLengths[dim], commonPrefixComparator.compare( commonPrefix, - dim * config.bytesPerDim, + dim * config.bytesPerDim(), packedValue.bytes, - packedValue.offset + dim * config.bytesPerDim)); + packedValue.offset + dim * config.bytesPerDim())); } } } @@ -2095,11 +2112,11 @@ private static boolean valuesInOrderAndBounds( IntFunction values, int[] docs, int docsOffset) { - byte[] lastPackedValue = new byte[config.packedBytesLength]; + byte[] lastPackedValue = new byte[config.packedBytesLength()]; int lastDoc = -1; for (int i = 0; i < count; i++) { BytesRef packedValue = values.apply(i); - assert packedValue.length == config.packedBytesLength; + assert packedValue.length == config.packedBytesLength(); assert valueInOrder( config, i, @@ -2127,40 +2144,40 @@ private static boolean valueInOrder( int packedValueOffset, int doc, int lastDoc) { - int dimOffset = sortedDim * config.bytesPerDim; + int dimOffset = sortedDim * config.bytesPerDim(); if (ord > 0) { int cmp = Arrays.compareUnsigned( lastPackedValue, dimOffset, - dimOffset + config.bytesPerDim, + dimOffset + config.bytesPerDim(), packedValue, packedValueOffset + dimOffset, - packedValueOffset + dimOffset + config.bytesPerDim); + packedValueOffset + dimOffset + config.bytesPerDim()); if (cmp > 0) { throw new AssertionError( "values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" - + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength) + + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength()) + " ord=" + ord); } - if (cmp == 0 && config.numDims > config.numIndexDims) { + if (cmp == 0 && config.numDims() > config.numIndexDims()) { cmp = Arrays.compareUnsigned( lastPackedValue, - config.packedIndexBytesLength, - config.packedBytesLength, + config.packedIndexBytesLength(), + config.packedBytesLength(), packedValue, - packedValueOffset + config.packedIndexBytesLength, - packedValueOffset + config.packedBytesLength); + packedValueOffset + config.packedIndexBytesLength(), + packedValueOffset + config.packedBytesLength()); if (cmp > 0) { throw new AssertionError( "data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" - + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength) + + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength()) + " ord=" + ord); } @@ -2170,32 +2187,33 @@ private static boolean valueInOrder( "docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord); } } - System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength); + System.arraycopy( + packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength()); return true; } // only called from assert private static boolean valueInBounds( BKDConfig config, BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) { - for (int dim = 0; dim < config.numIndexDims; dim++) { - int offset = config.bytesPerDim * dim; + for (int dim = 0; dim < config.numIndexDims(); dim++) { + int offset = config.bytesPerDim() * dim; if (Arrays.compareUnsigned( packedValue.bytes, packedValue.offset + offset, - packedValue.offset + offset + config.bytesPerDim, + packedValue.offset + offset + config.bytesPerDim(), minPackedValue, offset, 
- offset + config.bytesPerDim) + offset + config.bytesPerDim()) < 0) { return false; } if (Arrays.compareUnsigned( packedValue.bytes, packedValue.offset + offset, - packedValue.offset + offset + config.bytesPerDim, + packedValue.offset + offset + config.bytesPerDim(), maxPackedValue, offset, - offset + config.bytesPerDim) + offset + config.bytesPerDim()) > 0) { return false; } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java index 9f6a10b9ddcf..b9ea0d9aa08e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java @@ -17,13 +17,16 @@ package org.apache.lucene.util.bkd; import java.io.IOException; +import java.util.Arrays; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.DocBaseBitSetIterator; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LongsRef; final class DocIdsWriter { @@ -36,6 +39,7 @@ final class DocIdsWriter { private static final byte LEGACY_DELTA_VINT = (byte) 0; private final int[] scratch; + private final LongsRef scratchLongs = new LongsRef(); /** * IntsRef to be used to iterate over the scratch buffer. A single instance is reused to avoid @@ -205,12 +209,17 @@ void readInts(IndexInput in, int count, int[] docIDs) throws IOException { } } - private static DocIdSetIterator readBitSetIterator(IndexInput in, int count) throws IOException { + private DocIdSetIterator readBitSetIterator(IndexInput in, int count) throws IOException { int offsetWords = in.readVInt(); int longLen = in.readVInt(); - long[] bits = new long[longLen]; - in.readLongs(bits, 0, longLen); - FixedBitSet bitSet = new FixedBitSet(bits, longLen << 6); + scratchLongs.longs = ArrayUtil.growNoCopy(scratchLongs.longs, longLen); + in.readLongs(scratchLongs.longs, 0, longLen); + // make ghost bits clear for FixedBitSet. 
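For illustration only — this sketch is not part of the patch and uses made-up values: FixedBitSet's long[] constructor asserts that no bit at an index >= numBits is set, so when the scratch buffer is reused across blocks, stale words left over from an earlier, larger block have to be zeroed before the array is wrapped, which is what the guarded Arrays.fill below does.

    long[] words = new long[2];       // pretend this is the reused scratch buffer
    words[1] = 1L;                    // stale word from an earlier, larger block
    int longLen = 1;                  // the current block only filled one word
    Arrays.fill(words, longLen, words.length, 0L);             // clear the "ghost" words
    FixedBitSet bitSet = new FixedBitSet(words, longLen << 6); // assertion-checked constructor is now happy
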
+ if (longLen < scratchLongs.length) { + Arrays.fill(scratchLongs.longs, longLen, scratchLongs.longs.length, 0); + } + scratchLongs.length = longLen; + FixedBitSet bitSet = new FixedBitSet(scratchLongs.longs, longLen << 6); return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6); } @@ -230,7 +239,7 @@ private static void readLegacyDeltaVInts(IndexInput in, int count, int[] docIDs) } } - private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException { + private void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException { DocIdSetIterator iterator = readBitSetIterator(in, count); int docId, pos = 0; while ((docId = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { @@ -307,8 +316,7 @@ void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOExcep } } - private static void readBitSet(IndexInput in, int count, IntersectVisitor visitor) - throws IOException { + private void readBitSet(IndexInput in, int count, IntersectVisitor visitor) throws IOException { DocIdSetIterator bitSetIterator = readBitSetIterator(in, count); visitor.visit(bitSetIterator); } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java index 5a918d166d8e..687bd1649fb2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java @@ -40,11 +40,11 @@ public final class HeapPointWriter implements PointWriter { public HeapPointWriter(BKDConfig config, int size) { this.config = config; - this.block = new byte[config.bytesPerDoc * size]; + this.block = new byte[config.bytesPerDoc() * size]; this.size = size; - this.dimComparator = ArrayUtil.getUnsignedComparator(config.bytesPerDim); - this.dataDimsAndDocLength = config.bytesPerDoc - config.packedIndexBytesLength; - this.scratch = new byte[config.bytesPerDoc]; + this.dimComparator = ArrayUtil.getUnsignedComparator(config.bytesPerDim()); + this.dataDimsAndDocLength = config.bytesPerDoc() - config.packedIndexBytesLength(); + this.scratch = new byte[config.bytesPerDoc()]; if (size > 0) { pointValue = new HeapPointValue(config, block); } else { @@ -56,23 +56,23 @@ public HeapPointWriter(BKDConfig config, int size) { /** Returns a reference, in result, to the byte[] slice holding this value */ public PointValue getPackedValueSlice(int index) { assert index < nextWrite : "nextWrite=" + (nextWrite) + " vs index=" + index; - pointValue.setOffset(index * config.bytesPerDoc); + pointValue.setOffset(index * config.bytesPerDoc()); return pointValue; } @Override public void append(byte[] packedValue, int docID) { assert closed == false : "point writer is already closed"; - assert packedValue.length == config.packedBytesLength + assert packedValue.length == config.packedBytesLength() : "[packedValue] must have length [" - + config.packedBytesLength + + config.packedBytesLength() + "] but was [" + packedValue.length + "]"; assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size; - final int position = nextWrite * config.bytesPerDoc; - System.arraycopy(packedValue, 0, block, position, config.packedBytesLength); - BitUtil.VH_BE_INT.set(block, position + config.packedBytesLength, docID); + final int position = nextWrite * config.bytesPerDoc(); + System.arraycopy(packedValue, 0, block, position, config.packedBytesLength()); + BitUtil.VH_BE_INT.set(block, position + config.packedBytesLength(), docID); nextWrite++; 
} @@ -81,33 +81,33 @@ public void append(PointValue pointValue) { assert closed == false : "point writer is already closed"; assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size; final BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); - assert packedValueDocID.length == config.bytesPerDoc + assert packedValueDocID.length == config.bytesPerDoc() : "[packedValue] must have length [" - + (config.bytesPerDoc) + + (config.bytesPerDoc()) + "] but was [" + packedValueDocID.length + "]"; - final int position = nextWrite * config.bytesPerDoc; + final int position = nextWrite * config.bytesPerDoc(); System.arraycopy( - packedValueDocID.bytes, packedValueDocID.offset, block, position, config.bytesPerDoc); + packedValueDocID.bytes, packedValueDocID.offset, block, position, config.bytesPerDoc()); nextWrite++; } /** Swaps the point at point {@code i} with the point at position {@code j} */ void swap(int i, int j) { - final int indexI = i * config.bytesPerDoc; - final int indexJ = j * config.bytesPerDoc; + final int indexI = i * config.bytesPerDoc(); + final int indexJ = j * config.bytesPerDoc(); // scratch1 = values[i] - System.arraycopy(block, indexI, scratch, 0, config.bytesPerDoc); + System.arraycopy(block, indexI, scratch, 0, config.bytesPerDoc()); // values[i] = values[j] - System.arraycopy(block, indexJ, block, indexI, config.bytesPerDoc); + System.arraycopy(block, indexJ, block, indexI, config.bytesPerDoc()); // values[j] = scratch1 - System.arraycopy(scratch, 0, block, indexJ, config.bytesPerDoc); + System.arraycopy(scratch, 0, block, indexJ, config.bytesPerDoc()); } /** Return the byte at position {@code k} of the point at position {@code i} */ int byteAt(int i, int k) { - return block[i * config.bytesPerDoc + k] & 0xff; + return block[i * config.bytesPerDoc() + k] & 0xff; } /** @@ -115,7 +115,7 @@ int byteAt(int i, int k) { * at the given offset */ void copyDim(int i, int dim, byte[] bytes, int offset) { - System.arraycopy(block, i * config.bytesPerDoc + dim, bytes, offset, config.bytesPerDim); + System.arraycopy(block, i * config.bytesPerDoc() + dim, bytes, offset, config.bytesPerDim()); } /** @@ -125,7 +125,7 @@ void copyDim(int i, int dim, byte[] bytes, int offset) { void copyDataDimsAndDoc(int i, byte[] bytes, int offset) { System.arraycopy( block, - i * config.bytesPerDoc + config.packedIndexBytesLength, + i * config.bytesPerDoc() + config.packedIndexBytesLength(), bytes, offset, dataDimsAndDocLength); @@ -136,8 +136,8 @@ void copyDataDimsAndDoc(int i, byte[] bytes, int offset) { * position {@code j} */ int compareDim(int i, int j, int dim) { - final int iOffset = i * config.bytesPerDoc + dim; - final int jOffset = j * config.bytesPerDoc + dim; + final int iOffset = i * config.bytesPerDoc() + dim; + final int jOffset = j * config.bytesPerDoc() + dim; return compareDim(block, iOffset, block, jOffset); } @@ -146,7 +146,7 @@ int compareDim(int i, int j, int dim) { * value */ int compareDim(int j, byte[] dimValue, int offset, int dim) { - final int jOffset = j * config.bytesPerDoc + dim; + final int jOffset = j * config.bytesPerDoc() + dim; return compareDim(dimValue, offset, block, jOffset); } @@ -159,8 +159,8 @@ private int compareDim(byte[] blockI, int offsetI, byte[] blockJ, int offsetJ) { * at position {@code j} */ int compareDataDimsAndDoc(int i, int j) { - final int iOffset = i * config.bytesPerDoc + config.packedIndexBytesLength; - final int jOffset = j * config.bytesPerDoc + config.packedIndexBytesLength; + final int iOffset = i * 
config.bytesPerDoc() + config.packedIndexBytesLength(); + final int jOffset = j * config.bytesPerDoc() + config.packedIndexBytesLength(); return compareDataDimsAndDoc(block, iOffset, block, jOffset); } @@ -169,7 +169,7 @@ int compareDataDimsAndDoc(int i, int j) { * provided value */ int compareDataDimsAndDoc(int j, byte[] dataDimsAndDocs, int offset) { - final int jOffset = j * config.bytesPerDoc + config.packedIndexBytesLength; + final int jOffset = j * config.bytesPerDoc() + config.packedIndexBytesLength(); return compareDataDimsAndDoc(dataDimsAndDocs, offset, block, jOffset); } @@ -187,11 +187,11 @@ private int compareDataDimsAndDoc(byte[] blockI, int offsetI, byte[] blockJ, int public int computeCardinality(int from, int to, int[] commonPrefixLengths) { int leafCardinality = 1; for (int i = from + 1; i < to; i++) { - final int pointOffset = (i - 1) * config.bytesPerDoc; - final int nextPointOffset = pointOffset + config.bytesPerDoc; - for (int dim = 0; dim < config.numDims; dim++) { - final int start = dim * config.bytesPerDim + commonPrefixLengths[dim]; - final int end = dim * config.bytesPerDim + config.bytesPerDim; + final int pointOffset = (i - 1) * config.bytesPerDoc(); + final int nextPointOffset = pointOffset + config.bytesPerDoc(); + for (int dim = 0; dim < config.numDims(); dim++) { + final int start = dim * config.bytesPerDim() + commonPrefixLengths[dim]; + final int end = dim * config.bytesPerDim() + config.bytesPerDim(); if (Arrays.mismatch( block, nextPointOffset + start, @@ -245,9 +245,9 @@ private static class HeapPointValue implements PointValue { private final int packedValueLength; HeapPointValue(BKDConfig config, byte[] value) { - this.packedValueLength = config.packedBytesLength; + this.packedValueLength = config.packedBytesLength(); this.packedValue = new BytesRef(value, 0, packedValueLength); - this.packedValueDocID = new BytesRef(value, 0, config.bytesPerDoc); + this.packedValueDocID = new BytesRef(value, 0, config.bytesPerDoc()); } /** Sets a new value by changing the offset. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointTreeReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointTreeReaderUtils.java index 4a9a290eaea5..44b3e8c72672 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointTreeReaderUtils.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointTreeReaderUtils.java @@ -55,7 +55,7 @@ public static void sort(BKDConfig config, int maxDoc, MutablePointTree reader, i // This should be a common situation as IndexWriter accumulates data in doc ID order when // index sorting is not enabled. final int bitsPerDocId = sortedByDocID ? 
0 : PackedInts.bitsRequired(maxDoc - 1); - new StableMSBRadixSorter(config.packedBytesLength + (bitsPerDocId + 7) / 8) { + new StableMSBRadixSorter(config.packedBytesLength() + (bitsPerDocId + 7) / 8) { @Override protected void swap(int i, int j) { @@ -74,10 +74,10 @@ protected void restore(int i, int j) { @Override protected int byteAt(int i, int k) { - if (k < config.packedBytesLength) { + if (k < config.packedBytesLength()) { return Byte.toUnsignedInt(reader.getByteAt(i, k)); } else { - final int shift = bitsPerDocId - ((k - config.packedBytesLength + 1) << 3); + final int shift = bitsPerDocId - ((k - config.packedBytesLength() + 1) << 3); return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff; } } @@ -95,8 +95,8 @@ public static void sortByDim( BytesRef scratch1, BytesRef scratch2) { - final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(config.bytesPerDim); - final int start = sortedDim * config.bytesPerDim; + final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(config.bytesPerDim()); + final int start = sortedDim * config.bytesPerDim(); // No need for a fancy radix sort here, this is called on the leaves only so // there are not many values to sort new IntroSorter() { @@ -125,11 +125,11 @@ protected int comparePivot(int j) { cmp = Arrays.compareUnsigned( pivot.bytes, - pivot.offset + config.packedIndexBytesLength, - pivot.offset + config.packedBytesLength, + pivot.offset + config.packedIndexBytesLength(), + pivot.offset + config.packedBytesLength(), scratch2.bytes, - scratch2.offset + config.packedIndexBytesLength, - scratch2.offset + config.packedBytesLength); + scratch2.offset + config.packedIndexBytesLength(), + scratch2.offset + config.packedBytesLength()); if (cmp == 0) { cmp = pivotDoc - reader.getDocID(j); } @@ -154,23 +154,23 @@ public static void partition( int mid, BytesRef scratch1, BytesRef scratch2) { - final int dimOffset = splitDim * config.bytesPerDim + commonPrefixLen; - final int dimCmpBytes = config.bytesPerDim - commonPrefixLen; + final int dimOffset = splitDim * config.bytesPerDim() + commonPrefixLen; + final int dimCmpBytes = config.bytesPerDim() - commonPrefixLen; final int dataCmpBytes = - (config.numDims - config.numIndexDims) * config.bytesPerDim + dimCmpBytes; + (config.numDims() - config.numIndexDims()) * config.bytesPerDim() + dimCmpBytes; final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1); new RadixSelector(dataCmpBytes + (bitsPerDocId + 7) / 8) { @Override protected Selector getFallbackSelector(int k) { - final int dimStart = splitDim * config.bytesPerDim; + final int dimStart = splitDim * config.bytesPerDim(); final int dataStart = (k < dimCmpBytes) - ? config.packedIndexBytesLength - : config.packedIndexBytesLength + k - dimCmpBytes; - final int dataEnd = config.numDims * config.bytesPerDim; + ? 
config.packedIndexBytesLength() + : config.packedIndexBytesLength() + k - dimCmpBytes; + final int dataEnd = config.numDims() * config.bytesPerDim(); final ByteArrayComparator dimComparator = - ArrayUtil.getUnsignedComparator(config.bytesPerDim); + ArrayUtil.getUnsignedComparator(config.bytesPerDim()); return new IntroSelector() { final BytesRef pivot = scratch1; @@ -230,7 +230,7 @@ protected int byteAt(int i, int k) { return Byte.toUnsignedInt(reader.getByteAt(i, dimOffset + k)); } else if (k < dataCmpBytes) { return Byte.toUnsignedInt( - reader.getByteAt(i, config.packedIndexBytesLength + k - dimCmpBytes)); + reader.getByteAt(i, config.packedIndexBytesLength() + k - dimCmpBytes)); } else { final int shift = bitsPerDocId - ((k - dataCmpBytes + 1) << 3); return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff; diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java index dcdc9354528b..0b2306cb9d33 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java @@ -56,7 +56,7 @@ public OfflinePointReader( throws IOException { this.config = config; - if ((start + length) * config.bytesPerDoc + CodecUtil.footerLength() + if ((start + length) * config.bytesPerDoc() + CodecUtil.footerLength() > tempDir.fileLength(tempFileName)) { throw new IllegalArgumentException( "requested slice is beyond the length of this file: start=" @@ -64,7 +64,7 @@ public OfflinePointReader( + " length=" + length + " bytesPerDoc=" - + config.bytesPerDoc + + config.bytesPerDoc() + " fileLength=" + tempDir.fileLength(tempFileName) + " tempFileName=" @@ -73,15 +73,15 @@ public OfflinePointReader( if (reusableBuffer == null) { throw new IllegalArgumentException("[reusableBuffer] cannot be null"); } - if (reusableBuffer.length < config.bytesPerDoc) { + if (reusableBuffer.length < config.bytesPerDoc()) { throw new IllegalArgumentException( - "Length of [reusableBuffer] must be bigger than " + config.bytesPerDoc); + "Length of [reusableBuffer] must be bigger than " + config.bytesPerDoc()); } - this.maxPointOnHeap = reusableBuffer.length / config.bytesPerDoc; + this.maxPointOnHeap = reusableBuffer.length / config.bytesPerDoc(); // Best-effort checksumming: if (start == 0 - && length * config.bytesPerDoc + && length * config.bytesPerDoc() == tempDir.fileLength(tempFileName) - CodecUtil.footerLength()) { // If we are going to read the entire file, e.g. 
because BKDWriter is now // partitioning it, we open with checksums: @@ -96,7 +96,7 @@ public OfflinePointReader( name = tempFileName; - long seekFP = start * config.bytesPerDoc; + long seekFP = start * config.bytesPerDoc(); in.seek(seekFP); countLeft = length; this.onHeapBuffer = reusableBuffer; @@ -113,11 +113,11 @@ public boolean next() throws IOException { } try { if (countLeft > maxPointOnHeap) { - in.readBytes(onHeapBuffer, 0, maxPointOnHeap * config.bytesPerDoc); + in.readBytes(onHeapBuffer, 0, maxPointOnHeap * config.bytesPerDoc()); pointsInBuffer = maxPointOnHeap - 1; countLeft -= maxPointOnHeap; } else { - in.readBytes(onHeapBuffer, 0, (int) countLeft * config.bytesPerDoc); + in.readBytes(onHeapBuffer, 0, (int) countLeft * config.bytesPerDoc()); pointsInBuffer = Math.toIntExact(countLeft - 1); countLeft = 0; } @@ -130,7 +130,7 @@ public boolean next() throws IOException { } } else { this.pointsInBuffer--; - this.offset += config.bytesPerDoc; + this.offset += config.bytesPerDoc(); } return true; } @@ -162,9 +162,9 @@ static class OfflinePointValue implements PointValue { final int packedValueLength; OfflinePointValue(BKDConfig config, byte[] value) { - this.packedValueLength = config.packedBytesLength; + this.packedValueLength = config.packedBytesLength(); this.packedValue = new BytesRef(value, 0, packedValueLength); - this.packedValueDocID = new BytesRef(value, 0, config.bytesPerDoc); + this.packedValueDocID = new BytesRef(value, 0, config.bytesPerDoc()); } /** Sets a new value by changing the offset. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java index 84dc99c79a62..bc564f90b23c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java @@ -56,9 +56,9 @@ public OfflinePointWriter( @Override public void append(byte[] packedValue, int docID) throws IOException { assert closed == false : "Point writer is already closed"; - assert packedValue.length == config.packedBytesLength + assert packedValue.length == config.packedBytesLength() : "[packedValue] must have length [" - + config.packedBytesLength + + config.packedBytesLength() + "] but was [" + packedValue.length + "]"; @@ -75,9 +75,9 @@ public void append(byte[] packedValue, int docID) throws IOException { public void append(PointValue pointValue) throws IOException { assert closed == false : "Point writer is already closed"; BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); - assert packedValueDocID.length == config.bytesPerDoc + assert packedValueDocID.length == config.bytesPerDoc() : "[packedValue and docID] must have length [" - + (config.bytesPerDoc) + + (config.bytesPerDoc()) + "] but was [" + packedValueDocID.length + "]"; @@ -89,7 +89,7 @@ public void append(PointValue pointValue) throws IOException { @Override public PointReader getReader(long start, long length) throws IOException { - byte[] buffer = new byte[config.bytesPerDoc]; + byte[] buffer = new byte[config.bytesPerDoc()]; return getReader(start, length, buffer); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index ead90dca24c6..eb780acc245d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -481,19 +481,13 @@ public static FSTMetadata readMetadata(DataInput metaIn, 
Outputs outpu } INPUT_TYPE inputType; final byte t = metaIn.readByte(); - switch (t) { - case 0: - inputType = INPUT_TYPE.BYTE1; - break; - case 1: - inputType = INPUT_TYPE.BYTE2; - break; - case 2: - inputType = INPUT_TYPE.BYTE4; - break; - default: - throw new CorruptIndexException("invalid input type " + t, metaIn); - } + inputType = + switch (t) { + case 0 -> INPUT_TYPE.BYTE1; + case 1 -> INPUT_TYPE.BYTE2; + case 2 -> INPUT_TYPE.BYTE4; + default -> throw new CorruptIndexException("invalid input type " + t, metaIn); + }; long startNode = metaIn.readVLong(); long numBytes = metaIn.readVLong(); return new FSTMetadata<>(inputType, outputs, emptyOutput, startNode, version, numBytes); @@ -630,7 +624,6 @@ Arc readLastTargetArc(Arc follow, Arc arc, BytesReader in) throws IOExc arc.output = follow.nextFinalOutput(); arc.flags = BIT_LAST_ARC; arc.nodeFlags = arc.flags; - return arc; } else { in.setPosition(follow.target()); byte flags = arc.nodeFlags = in.readByte(); @@ -683,8 +676,8 @@ Arc readLastTargetArc(Arc follow, Arc arc, BytesReader in) throws IOExc readNextRealArc(arc, in); } assert arc.isLast(); - return arc; } + return arc; } private long readUnpackedNodeTarget(BytesReader in) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index e837b7775e0c..b83024930eea 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -98,8 +98,8 @@ public class FSTCompiler { // it will throw exceptions if attempt to call getReverseBytesReader() or writeTo(DataOutput) private static final FSTReader NULL_FST_READER = new NullFSTReader(); - private final NodeHash dedupHash; - // a temporary FST used during building for NodeHash cache + private final FSTSuffixNodeCache suffixDedupCache; + // a temporary FST used during building for FSTSuffixNodeCache cache final FST fst; private final T NO_OUTPUT; @@ -178,9 +178,9 @@ private FSTCompiler( if (suffixRAMLimitMB < 0) { throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB); } else if (suffixRAMLimitMB > 0) { - dedupHash = new NodeHash<>(this, suffixRAMLimitMB); + suffixDedupCache = new FSTSuffixNodeCache<>(this, suffixRAMLimitMB); } else { - dedupHash = null; + suffixDedupCache = null; } NO_OUTPUT = outputs.getNoOutput(); @@ -379,12 +379,12 @@ public long getArcCount() { private CompiledNode compileNode(UnCompiledNode nodeIn) throws IOException { final long node; long bytesPosStart = numBytesWritten; - if (dedupHash != null) { + if (suffixDedupCache != null) { if (nodeIn.numArcs == 0) { node = addNode(nodeIn); lastFrozenNode = node; } else { - node = dedupHash.add(nodeIn); + node = suffixDedupCache.add(nodeIn); } } else { node = addNode(nodeIn); @@ -817,7 +817,8 @@ private void freezeTail(int prefixLenPlus1) throws IOException { for (int idx = lastInput.length(); idx >= downTo; idx--) { final UnCompiledNode node = frontier[idx]; - final UnCompiledNode parent = frontier[idx - 1]; + final int prevIdx = idx - 1; + final UnCompiledNode parent = frontier[prevIdx]; final T nextFinalOutput = node.output; @@ -833,7 +834,7 @@ private void freezeTail(int prefixLenPlus1) throws IOException { // this node makes it and we now compile it. 
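The repeated switch from field reads such as config.bytesPerDoc to accessor calls such as config.bytesPerDoc() in the BKD hunks above is consistent with BKDConfig now exposing its settings through methods (for example as a record) rather than public final fields. A hedged sketch of that shape, restricted to names visible in this diff plus an assumed numIndexDims component:

record BKDConfigSketch(int numDims, int numIndexDims, int bytesPerDim) {
  // derived sizes become ordinary methods, so callers write config.packedBytesLength() etc.
  int packedBytesLength() {
    return numDims * bytesPerDim; // full packed point value
  }

  int packedIndexBytesLength() {
    return numIndexDims * bytesPerDim; // index dimensions only
  }

  int bytesPerDoc() {
    return packedBytesLength() + Integer.BYTES; // packed value followed by its docID
  }
}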
first, // compile any targets that were previously // undecided: - parent.replaceLast(lastInput.intAt(idx - 1), compileNode(node), nextFinalOutput, isFinal); + parent.replaceLast(lastInput.intAt(prevIdx), compileNode(node), nextFinalOutput, isFinal); } } @@ -871,10 +872,7 @@ public void add(IntsRef input, T output) throws IOException { int pos1 = 0; int pos2 = input.offset; final int pos1Stop = Math.min(lastInput.length(), input.length); - while (true) { - if (pos1 >= pos1Stop || lastInput.intAt(pos1) != input.ints[pos2]) { - break; - } + while (pos1 < pos1Stop && lastInput.intAt(pos1) == input.ints[pos2]) { pos1++; pos2++; } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTSuffixNodeCache.java similarity index 93% rename from lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java rename to lucene/core/src/java/org/apache/lucene/util/fst/FSTSuffixNodeCache.java index 7326fd77f73b..f33f09e90723 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTSuffixNodeCache.java @@ -31,8 +31,24 @@ // TODO: couldn't we prune naturally back until we see a transition with an output? it's highly // unlikely (mostly impossible) such suffixes can be shared? -// Used to dedup states (lookup already-frozen states) -final class NodeHash { +/** + * This is essentially a LRU cache to maintain and lookup node suffix. Un-compiled node can be added + * into the cache and if a similar node exists we will return its address in the FST. A node is + * defined as similar if it has the same label, arcs, outputs & other properties that identify a + * node. + * + *
<p>
    The total size of the cache is controlled through the constructor parameter ramLimitMB + * Implementation-wise, we maintain two lookup tables, a primary table where node can be + * looked up from, and a fallback lookup table in case the lookup in the primary table fails. Nodes + * from the fallback table can also be promoted to the primary table when that happens. When the + * primary table is full, we swap it with the fallback table and clear out the primary table. + * + *
<p>
    To lookup the node address, we build a special hash table which maps from the Node hash value + * to the Node address in the FST, called PagedGrowableHash. Internally it uses {@link + * PagedGrowableWriter} to store the mapping, which allows efficient packing the hash & address long + * values, and uses {@link ByteBlockPool} to store the actual node content (arcs & outputs). + */ +final class FSTSuffixNodeCache { // primary table -- we add nodes into this until it reaches the requested tableSizeLimit/2, then // we move it to fallback @@ -60,7 +76,7 @@ final class NodeHash { * recently used suffixes are discarded, and the FST is no longer minimalI. Still, larger * ramLimitMB will make the FST smaller (closer to minimal). */ - public NodeHash(FSTCompiler fstCompiler, double ramLimitMB) { + public FSTSuffixNodeCache(FSTCompiler fstCompiler, double ramLimitMB) { if (ramLimitMB <= 0) { throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index 15f763f62b29..c0f4bfeb5726 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -149,12 +149,8 @@ public String toString() { } /** Compares first by the provided comparator, and then tie breaks by path.input. */ - private static class TieBreakByInputComparator implements Comparator> { - private final Comparator comparator; - - TieBreakByInputComparator(Comparator comparator) { - this.comparator = comparator; - } + private record TieBreakByInputComparator(Comparator comparator) + implements Comparator> { @Override public int compare(FSTPath a, FSTPath b) { @@ -430,15 +426,7 @@ protected boolean acceptResult(IntsRef input, T output) { /** * Holds a single input (IntsRef) + output, returned by {@link #shortestPaths shortestPaths()}. */ - public static final class Result { - public final IntsRef input; - public final T output; - - public Result(IntsRef input, T output) { - this.input = input; - this.output = output; - } - } + public record Result(IntsRef input, T output) {} /** Holds the results for a top N search using {@link TopNSearcher} */ public static final class TopResults implements Iterable> { @@ -787,10 +775,11 @@ public static IntsRef toUTF32(char[] s, int offset, int length, IntsRefBuilder s /** Just takes unsigned byte values from the BytesRef and converts into an IntsRef. 
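A self-contained sketch of the primary/fallback lookup policy that the FSTSuffixNodeCache javadoc above describes, using plain java.util maps purely for illustration; the real class stores node hashes and FST addresses in a PagedGrowableHash backed by PagedGrowableWriter and ByteBlockPool, not a HashMap:

import java.util.HashMap;
import java.util.Map;

final class TwoGenerationCacheSketch<K, V> {
  private final int primaryLimit; // stands in for the table size budget derived from ramLimitMB
  private Map<K, V> primary = new HashMap<>();
  private Map<K, V> fallback = new HashMap<>();

  TwoGenerationCacheSketch(int primaryLimit) {
    this.primaryLimit = primaryLimit;
  }

  V get(K key) {
    V value = primary.get(key);
    if (value == null) {
      value = fallback.get(key);
      if (value != null) {
        put(key, value); // promote: a fallback hit moves back into the primary table
      }
    }
    return value;
  }

  void put(K key, V value) {
    if (primary.size() >= primaryLimit) {
      // primary is full: it becomes the new fallback and the old fallback is dropped,
      // which bounds memory and lets rarely used suffixes age out
      fallback = primary;
      primary = new HashMap<>();
    }
    primary.put(key, value);
  }
}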
*/ public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch) { - scratch.clear(); + scratch.growNoCopy(input.length); for (int i = 0; i < input.length; i++) { - scratch.append(input.bytes[i + input.offset] & 0xFF); + scratch.setIntAt(i, input.bytes[i + input.offset] & 0xFF); } + scratch.setLength(input.length); return scratch.get(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/ConcurrentHnswMerger.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/ConcurrentHnswMerger.java index 392d83fa262c..b4688d097302 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/ConcurrentHnswMerger.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/ConcurrentHnswMerger.java @@ -19,7 +19,7 @@ import java.io.IOException; import org.apache.lucene.codecs.hnsw.HnswGraphProvider; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.FixedBitSet; @@ -46,30 +46,25 @@ public ConcurrentHnswMerger( } @Override - protected HnswBuilder createBuilder(DocIdSetIterator mergedVectorIterator, int maxOrd) + protected HnswBuilder createBuilder(KnnVectorValues mergedVectorValues, int maxOrd) throws IOException { + OnHeapHnswGraph graph; + BitSet initializedNodes = null; + if (initReader == null) { - return new HnswConcurrentMergeBuilder( - taskExecutor, - numWorker, - scorerSupplier, - M, - beamWidth, - new OnHeapHnswGraph(M, maxOrd), - null); + graph = new OnHeapHnswGraph(M, maxOrd); + } else { + HnswGraph initializerGraph = ((HnswGraphProvider) initReader).getGraph(fieldInfo.name); + if (initializerGraph.size() == 0) { + graph = new OnHeapHnswGraph(M, maxOrd); + } else { + initializedNodes = new FixedBitSet(maxOrd); + int[] oldToNewOrdinalMap = getNewOrdMapping(mergedVectorValues, initializedNodes); + graph = + InitializedHnswGraphBuilder.initGraph(M, initializerGraph, oldToNewOrdinalMap, maxOrd); + } } - - HnswGraph initializerGraph = ((HnswGraphProvider) initReader).getGraph(fieldInfo.name); - BitSet initializedNodes = new FixedBitSet(maxOrd); - int[] oldToNewOrdinalMap = getNewOrdMapping(mergedVectorIterator, initializedNodes); - return new HnswConcurrentMergeBuilder( - taskExecutor, - numWorker, - scorerSupplier, - M, - beamWidth, - InitializedHnswGraphBuilder.initGraph(M, initializerGraph, oldToNewOrdinalMap, maxOrd), - initializedNodes); + taskExecutor, numWorker, scorerSupplier, M, beamWidth, graph, initializedNodes); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswBuilder.java index 672bf5c60105..aa27525b7f10 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswBuilder.java @@ -48,5 +48,5 @@ public interface HnswBuilder { * components, re-ordering node ids for better delta compression) may be triggered, so callers * should expect this call to take some time. 
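The Util.toIntsRef change above pre-sizes the builder once with growNoCopy and writes with setIntAt instead of appending byte by byte; the result is unchanged. A small usage sketch with invented input values:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Util;

public class ToIntsRefDemo {
  public static void main(String[] args) {
    BytesRef bytes = new BytesRef(new byte[] {0x41, (byte) 0xFF});
    IntsRefBuilder scratch = new IntsRefBuilder();
    IntsRef ints = Util.toIntsRef(bytes, scratch);
    // prints "2 65 255": each byte is widened as an unsigned value
    System.out.println(ints.length + " " + ints.ints[0] + " " + ints.ints[1]);
  }
}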
*/ - OnHeapHnswGraph getCompletedGraph(); + OnHeapHnswGraph getCompletedGraph() throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java index fd1a98d08c0e..aebed5642383 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java @@ -91,6 +91,7 @@ public OnHeapHnswGraph build(int maxOrd) throws IOException { }); } taskExecutor.invokeAll(futures); + finish(); frozen = true; return workers[0].getCompletedGraph(); } @@ -109,11 +110,19 @@ public void setInfoStream(InfoStream infoStream) { } @Override - public OnHeapHnswGraph getCompletedGraph() { - frozen = true; + public OnHeapHnswGraph getCompletedGraph() throws IOException { + if (frozen == false) { + // should already have been called in build(), but just in case + finish(); + frozen = true; + } return getGraph(); } + private void finish() throws IOException { + workers[0].finish(); + } + @Override public OnHeapHnswGraph getGraph() { return workers[0].getGraph(); @@ -213,7 +222,7 @@ private MergeSearcher(NeighborQueue candidates, HnswLock hnswLock, BitSet visite @Override void graphSeek(HnswGraph graph, int level, int targetNode) { try (HnswLock.LockedRow rowLock = hnswLock.read(level, targetNode)) { - NeighborArray neighborArray = rowLock.row; + NeighborArray neighborArray = rowLock.row(); if (nodeBuffer == null || nodeBuffer.length < neighborArray.size()) { nodeBuffer = new int[neighborArray.size()]; } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraph.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraph.java index ba0b714fdd7c..0c38c4e2ff78 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraph.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraph.java @@ -68,7 +68,7 @@ protected HnswGraph() {} /** Returns the number of nodes in the graph */ public abstract int size(); - /** Returns max node id, inclusive, normally this value will be size - 1 */ + /** Returns max node id, inclusive. Normally this value will be size - 1. */ public int maxNodeId() { return size() - 1; } @@ -130,7 +130,7 @@ public NodesIterator getNodesOnLevel(int level) { }; /** - * Iterator over the graph nodes on a certain level, Iterator also provides the size – the total + * Iterator over the graph nodes on a certain level. Iterator also provides the size – the total * number of nodes to be iterated over. The nodes are NOT guaranteed to be presented in any * particular order. 
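Since getCompletedGraph() is now declared to throw IOException and may do real finishing work (the concurrent builder calls finish() before freezing), callers that treated it as a cheap getter need to account for both. A hedged caller-side sketch, where the builder argument stands for any HnswBuilder:

import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.lucene.util.hnsw.HnswBuilder;
import org.apache.lucene.util.hnsw.OnHeapHnswGraph;

final class GraphCompletion {
  static OnHeapHnswGraph complete(HnswBuilder builder) {
    try {
      // may connect stray components and finalize the graph, so it can take noticeable time
      return builder.getCompletedGraph();
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }
}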
*/ diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java index 6d06c1298a9c..bed1480e9262 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java @@ -18,8 +18,11 @@ package org.apache.lucene.util.hnsw; import static java.lang.Math.log; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; +import java.util.Comparator; +import java.util.List; import java.util.Locale; import java.util.Objects; import java.util.SplittableRandom; @@ -28,6 +31,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.InfoStream; +import org.apache.lucene.util.hnsw.HnswUtil.Component; /** * Builder for HNSW graph. See {@link HnswGraph} for a gloss on the algorithm and the meaning of the @@ -137,7 +141,7 @@ protected HnswGraphBuilder( HnswGraphSearcher graphSearcher) throws IOException { if (M <= 0) { - throw new IllegalArgumentException("maxConn must be positive"); + throw new IllegalArgumentException("M (max connections) must be positive"); } if (beamWidth <= 0) { throw new IllegalArgumentException("beamWidth must be positive"); @@ -173,8 +177,10 @@ public void setInfoStream(InfoStream infoStream) { } @Override - public OnHeapHnswGraph getCompletedGraph() { - frozen = true; + public OnHeapHnswGraph getCompletedGraph() throws IOException { + if (!frozen) { + finish(); + } return getGraph(); } @@ -333,7 +339,7 @@ private void addDiverseNeighbors(int level, int node, NeighborArray candidates) int nbr = candidates.nodes()[i]; if (hnswLock != null) { try (HnswLock.LockedRow rowLock = hnswLock.write(level, nbr)) { - NeighborArray nbrsOfNbr = rowLock.row; + NeighborArray nbrsOfNbr = rowLock.row(); nbrsOfNbr.addAndEnsureDiversity(node, candidates.scores()[i], nbr, scorerSupplier); } } else { @@ -405,6 +411,117 @@ private static int getRandomGraphLevel(double ml, SplittableRandom random) { return ((int) (-log(randDouble) * ml)); } + void finish() throws IOException { + // System.out.println("finish " + frozen); + connectComponents(); + frozen = true; + } + + private void connectComponents() throws IOException { + long start = System.nanoTime(); + for (int level = 0; level < hnsw.numLevels(); level++) { + if (connectComponents(level) == false) { + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message(HNSW_COMPONENT, "connectComponents failed on level " + level); + } + } + } + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message( + HNSW_COMPONENT, "connectComponents " + (System.nanoTime() - start) / 1_000_000 + " ms"); + } + } + + private boolean connectComponents(int level) throws IOException { + FixedBitSet notFullyConnected = new FixedBitSet(hnsw.size()); + int maxConn = M; + if (level == 0) { + maxConn *= 2; + } + List components = HnswUtil.components(hnsw, level, notFullyConnected, maxConn); + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message( + HNSW_COMPONENT, "connect " + components.size() + " components on level=" + level); + } + // System.out.println("HnswGraphBuilder. 
level=" + level + ": " + components); + boolean result = true; + if (components.size() > 1) { + // connect other components to the largest one + Component c0 = components.stream().max(Comparator.comparingInt(Component::size)).get(); + if (c0.start() == NO_MORE_DOCS) { + // the component is already fully connected - no room for new connections + return false; + } + // try for more connections? We only do one since otherwise they may become full + // while linking + GraphBuilderKnnCollector beam = new GraphBuilderKnnCollector(2); + int[] eps = new int[1]; + for (Component c : components) { + if (c != c0) { + if (c.start() == NO_MORE_DOCS) { + continue; + } + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message(HNSW_COMPONENT, "connect component " + c + " to " + c0); + } + + beam.clear(); + eps[0] = c0.start(); + RandomVectorScorer scorer = scorerSupplier.scorer(c.start()); + // find the closest node in the largest component to the lowest-numbered node in this + // component that has room to make a connection + graphSearcher.searchLevel(beam, scorer, level, eps, hnsw, notFullyConnected); + boolean linked = false; + while (beam.size() > 0) { + int c0node = beam.popNode(); + if (c0node == c.start() || notFullyConnected.get(c0node) == false) { + continue; + } + float score = beam.minimumScore(); + assert notFullyConnected.get(c0node); + // link the nodes + // System.out.println("link " + c0 + "." + c0node + " to " + c + "." + c.start()); + link(level, c0node, c.start(), score, notFullyConnected); + linked = true; + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message(HNSW_COMPONENT, "connected ok " + c0node + " -> " + c.start()); + } + } + if (!linked) { + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message(HNSW_COMPONENT, "not connected; no free nodes found"); + } + result = false; + } + } + } + } + return result; + } + + // Try to link two nodes bidirectionally; the forward connection will always be made. + // Update notFullyConnected. + private void link(int level, int n0, int n1, float score, FixedBitSet notFullyConnected) { + NeighborArray nbr0 = hnsw.getNeighbors(level, n0); + NeighborArray nbr1 = hnsw.getNeighbors(level, n1); + // must subtract 1 here since the nodes array is one larger than the configured + // max neighbors (M / 2M). + // We should have taken care of this check by searching for not-full nodes + int maxConn = nbr0.nodes().length - 1; + assert notFullyConnected.get(n0); + assert nbr0.size() < maxConn : "node " + n0 + " is full, has " + nbr0.size() + " friends"; + nbr0.addOutOfOrder(n1, score); + if (nbr0.size() == maxConn) { + notFullyConnected.clear(n0); + } + if (nbr1.size() < maxConn) { + nbr1.addOutOfOrder(n0, score); + if (nbr1.size() == maxConn) { + notFullyConnected.clear(n1); + } + } + } + /** * A restricted, specialized knnCollector that can be used when building a graph. 
* @@ -438,7 +555,7 @@ public int[] popUntilNearestKNodes() { return queue.nodes(); } - float minimumScore() { + public float minimumScore() { return queue.topScore(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphMerger.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphMerger.java index 7ed5dd142de5..31e9c768dc03 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphMerger.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphMerger.java @@ -18,8 +18,8 @@ import java.io.IOException; import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.InfoStream; @@ -45,12 +45,12 @@ HnswGraphMerger addReader(KnnVectorsReader reader, MergeState.DocMap docMap, Bit /** * Merge and produce the on heap graph * - * @param mergedVectorIterator iterator over the vectors in the merged segment + * @param mergedVectorValues view of the vectors in the merged segment * @param infoStream optional info stream to set to builder * @param maxOrd max number of vectors that will be added to the graph * @return merged graph * @throws IOException during merge */ - OnHeapHnswGraph merge(DocIdSetIterator mergedVectorIterator, InfoStream infoStream, int maxOrd) + OnHeapHnswGraph merge(KnnVectorValues mergedVectorValues, InfoStream infoStream, int maxOrd) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswLock.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswLock.java index d7947c7fab1a..d8b12f67f4a6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswLock.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswLock.java @@ -18,7 +18,6 @@ package org.apache.lucene.util.hnsw; import java.io.Closeable; -import java.util.Objects; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -26,7 +25,7 @@ * Provide (read-and-write) locked access to rows of an OnHeapHnswGraph. For use by * HnswConcurrentMerger and its HnswGraphBuilders. 
*/ -class HnswLock { +final class HnswLock { private static final int NUM_LOCKS = 512; private final ReentrantReadWriteLock[] locks; private final OnHeapHnswGraph graph; @@ -40,31 +39,27 @@ class HnswLock { } LockedRow read(int level, int node) { - int lockid = Objects.hash(level, node) % NUM_LOCKS; + int lockid = hash(level, node) % NUM_LOCKS; Lock lock = locks[lockid].readLock(); lock.lock(); return new LockedRow(graph.getNeighbors(level, node), lock); } LockedRow write(int level, int node) { - int lockid = Objects.hash(level, node) % NUM_LOCKS; + int lockid = hash(level, node) % NUM_LOCKS; Lock lock = locks[lockid].writeLock(); lock.lock(); return new LockedRow(graph.getNeighbors(level, node), lock); } - static class LockedRow implements Closeable { - final Lock lock; - final NeighborArray row; - - LockedRow(NeighborArray row, Lock lock) { - this.lock = lock; - this.row = row; - } - + record LockedRow(NeighborArray row, Lock lock) implements Closeable { @Override public void close() { lock.unlock(); } } + + static int hash(int v1, int v2) { + return v1 * 31 + v2; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java new file mode 100644 index 000000000000..a4b1d0c7c536 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.hnsw; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Deque; +import java.util.List; +import org.apache.lucene.codecs.hnsw.HnswGraphProvider; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.index.CodecReader; +import org.apache.lucene.index.FilterLeafReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.util.FixedBitSet; + +/** Utilities for use in tests involving HNSW graphs */ +public class HnswUtil { + + // utility class; only has static methods + private HnswUtil() {} + + /* + For each level, check rooted components from previous level nodes, which are entry + points with the goal that each node should be reachable from *some* entry point. For each entry + point, compute a spanning tree, recording the nodes in a single shared bitset. + + Also record a bitset marking nodes that are not full to be used when reconnecting in order to + limit the search to include non-full nodes only. + */ + + /** Returns true if every node on every level is reachable from node 0. 
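The HnswLock changes above keep the striped read/write locking but swap Objects.hash(level, node), which boxes both ints and allocates a varargs array on a hot path, for a plain two-int mix; only the spread across the NUM_LOCKS stripes matters, not the exact value. A tiny sketch of the same shape:

final class StripeHashSketch {
  // mirrors the new HnswLock.hash(v1, v2): a 31-based mix of two small non-negative ints,
  // with no boxing or Object[] allocation
  static int stripeHash(int level, int node) {
    return level * 31 + node;
  }
}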
*/ + static boolean isRooted(HnswGraph knnValues) throws IOException { + for (int level = 0; level < knnValues.numLevels(); level++) { + if (components(knnValues, level, null, 0).size() > 1) { + return false; + } + } + return true; + } + + /** + * Returns the sizes of the distinct graph components on level 0. If the graph is fully-rooted the + * list will have one entry. If it is empty, the returned list will be empty. + */ + static List componentSizes(HnswGraph hnsw) throws IOException { + return componentSizes(hnsw, 0); + } + + /** + * Returns the sizes of the distinct graph components on the given level. The forest starting at + * the entry points (nodes in the next highest level) is considered as a single component. If the + * entire graph is rooted in the entry points--that is, every node is reachable from at least one + * entry point--the returned list will have a single entry. If the graph is empty, the returned + * list will be empty. + */ + static List componentSizes(HnswGraph hnsw, int level) throws IOException { + return components(hnsw, level, null, 0).stream().map(Component::size).toList(); + } + + // Finds orphaned components on the graph level. + static List components( + HnswGraph hnsw, int level, FixedBitSet notFullyConnected, int maxConn) throws IOException { + List components = new ArrayList<>(); + FixedBitSet connectedNodes = new FixedBitSet(hnsw.size()); + assert hnsw.size() == hnsw.getNodesOnLevel(0).size(); + int total = 0; + if (level >= hnsw.numLevels()) { + throw new IllegalArgumentException( + "Level " + level + " too large for graph with " + hnsw.numLevels() + " levels"); + } + HnswGraph.NodesIterator entryPoints; + // System.out.println("components level=" + level); + if (level == hnsw.numLevels() - 1) { + entryPoints = new HnswGraph.ArrayNodesIterator(new int[] {hnsw.entryNode()}, 1); + } else { + entryPoints = hnsw.getNodesOnLevel(level + 1); + } + while (entryPoints.hasNext()) { + int entryPoint = entryPoints.nextInt(); + Component component = + markRooted(hnsw, level, connectedNodes, notFullyConnected, maxConn, entryPoint); + total += component.size(); + } + int entryPoint; + if (notFullyConnected != null) { + entryPoint = notFullyConnected.nextSetBit(0); + } else { + entryPoint = connectedNodes.nextSetBit(0); + } + components.add(new Component(entryPoint, total)); + if (level == 0) { + int nextClear = nextClearBit(connectedNodes, 0); + while (nextClear != NO_MORE_DOCS) { + Component component = + markRooted(hnsw, level, connectedNodes, notFullyConnected, maxConn, nextClear); + assert component.size() > 0; + components.add(component); + total += component.size(); + nextClear = nextClearBit(connectedNodes, component.start()); + } + } else { + HnswGraph.NodesIterator nodes = hnsw.getNodesOnLevel(level); + while (nodes.hasNext()) { + int nextClear = nodes.nextInt(); + if (connectedNodes.get(nextClear)) { + continue; + } + Component component = + markRooted(hnsw, level, connectedNodes, notFullyConnected, maxConn, nextClear); + assert component.size() > 0; + components.add(component); + total += component.size(); + } + } + assert total == hnsw.getNodesOnLevel(level).size() + : "total=" + + total + + " level nodes on level " + + level + + " = " + + hnsw.getNodesOnLevel(level).size(); + return components; + } + + /** + * Count the nodes in a rooted component of the graph and set the bits of its nodes in + * connectedNodes bitset. Rooted means nodes that can be reached from a root node. 
+ * + * @param hnswGraph the graph to check + * @param level the level of the graph to check + * @param connectedNodes a bitset the size of the entire graph with 1's indicating nodes that have + * been marked as connected. This method updates the bitset. + * @param notFullyConnected a bitset the size of the entire graph. On output, we mark nodes + * visited having fewer than maxConn connections. May be null. + * @param maxConn the maximum number of connections for any node (aka M). + * @param entryPoint a node id to start at + */ + private static Component markRooted( + HnswGraph hnswGraph, + int level, + FixedBitSet connectedNodes, + FixedBitSet notFullyConnected, + int maxConn, + int entryPoint) + throws IOException { + // Start at entry point and search all nodes on this level + // System.out.println("markRooted level=" + level + " entryPoint=" + entryPoint); + Deque stack = new ArrayDeque<>(); + stack.push(entryPoint); + int count = 0; + while (!stack.isEmpty()) { + int node = stack.pop(); + if (connectedNodes.get(node)) { + continue; + } + count++; + connectedNodes.set(node); + hnswGraph.seek(level, node); + int friendOrd; + int friendCount = 0; + while ((friendOrd = hnswGraph.nextNeighbor()) != NO_MORE_DOCS) { + ++friendCount; + stack.push(friendOrd); + } + if (friendCount < maxConn && notFullyConnected != null) { + notFullyConnected.set(node); + } + } + return new Component(entryPoint, count); + } + + private static int nextClearBit(FixedBitSet bits, int index) { + // Does not depend on the ghost bits being clear! + long[] barray = bits.getBits(); + assert index >= 0 && index < bits.length() : "index=" + index + ", numBits=" + bits.length(); + int i = index >> 6; + long word = ~(barray[i] >> index); // skip all the bits to the right of index + + int next = NO_MORE_DOCS; + if (word != 0) { + next = index + Long.numberOfTrailingZeros(word); + } else { + while (++i < barray.length) { + word = ~barray[i]; + if (word != 0) { + next = (i << 6) + Long.numberOfTrailingZeros(word); + break; + } + } + } + if (next >= bits.length()) { + return NO_MORE_DOCS; + } else { + return next; + } + } + + /** + * In graph theory, "connected components" are really defined only for undirected (ie + * bidirectional) graphs. Our graphs are directed, because of pruning, but they are *mostly* + * undirected. In this case we compute components starting from a single node so what we are + * really measuring is whether the graph is a "rooted graph". TODO: measure whether the graph is + * "strongly connected" ie there is a path from every node to every other node. + */ + public static boolean graphIsRooted(IndexReader reader, String vectorField) throws IOException { + for (LeafReaderContext ctx : reader.leaves()) { + CodecReader codecReader = (CodecReader) FilterLeafReader.unwrap(ctx.reader()); + HnswGraph graph = + ((HnswGraphProvider) + ((PerFieldKnnVectorsFormat.FieldsReader) codecReader.getVectorReader()) + .getFieldReader(vectorField)) + .getGraph(vectorField); + if (isRooted(graph) == false) { + return false; + } + } + return true; + } + + /** + * A component (also "connected component") of an undirected graph is a collection of nodes that + * are connected by neighbor links: every node in a connected component is reachable from every + * other node in the component. See https://en.wikipedia.org/wiki/Component_(graph_theory). Such a + * graph is said to be "fully connected" iff it has a single component, or it is empty. 
+ * + * @param start the lowest-numbered node in the component + * @param size the number of nodes in the component + */ + record Component(int start, int size) {} +} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java index 7331111d45a9..c480d53360cb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java @@ -25,9 +25,9 @@ import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.internal.hppc.IntIntHashMap; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -108,12 +108,12 @@ public IncrementalHnswGraphMerger addReader( * Builds a new HnswGraphBuilder using the biggest graph from the merge state as a starting point. * If no valid readers were added to the merge state, a new graph is created. * - * @param mergedVectorIterator iterator over the vectors in the merged segment + * @param mergedVectorValues vector values in the merged segment * @param maxOrd max num of vectors that will be merged into the graph * @return HnswGraphBuilder * @throws IOException If an error occurs while reading from the merge state */ - protected HnswBuilder createBuilder(DocIdSetIterator mergedVectorIterator, int maxOrd) + protected HnswBuilder createBuilder(KnnVectorValues mergedVectorValues, int maxOrd) throws IOException { if (initReader == null) { return HnswGraphBuilder.create( @@ -121,9 +121,13 @@ protected HnswBuilder createBuilder(DocIdSetIterator mergedVectorIterator, int m } HnswGraph initializerGraph = ((HnswGraphProvider) initReader).getGraph(fieldInfo.name); + if (initializerGraph.size() == 0) { + return HnswGraphBuilder.create( + scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed, maxOrd); + } BitSet initializedNodes = new FixedBitSet(maxOrd); - int[] oldToNewOrdinalMap = getNewOrdMapping(mergedVectorIterator, initializedNodes); + int[] oldToNewOrdinalMap = getNewOrdMapping(mergedVectorValues, initializedNodes); return InitializedHnswGraphBuilder.fromGraph( scorerSupplier, M, @@ -137,8 +141,8 @@ protected HnswBuilder createBuilder(DocIdSetIterator mergedVectorIterator, int m @Override public OnHeapHnswGraph merge( - DocIdSetIterator mergedVectorIterator, InfoStream infoStream, int maxOrd) throws IOException { - HnswBuilder builder = createBuilder(mergedVectorIterator, maxOrd); + KnnVectorValues mergedVectorValues, InfoStream infoStream, int maxOrd) throws IOException { + HnswBuilder builder = createBuilder(mergedVectorValues, maxOrd); builder.setInfoStream(infoStream); return builder.build(maxOrd); } @@ -147,46 +151,45 @@ public OnHeapHnswGraph merge( * Creates a new mapping from old ordinals to new ordinals and returns the total number of vectors * in the newly merged segment. 
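A hedged sketch of how the HnswUtil helpers above can be used from a test to verify that an index's HNSW graph has no unreachable nodes; the directory and field name are placeholders:

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.hnsw.HnswUtil;

final class GraphConnectivityCheck {
  static void assertRooted(Directory dir, String vectorField) throws IOException {
    try (IndexReader reader = DirectoryReader.open(dir)) {
      if (HnswUtil.graphIsRooted(reader, vectorField) == false) {
        throw new AssertionError("HNSW graph for field " + vectorField + " has unreachable nodes");
      }
    }
  }
}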
* - * @param mergedVectorIterator iterator over the vectors in the merged segment + * @param mergedVectorValues vector values in the merged segment * @param initializedNodes track what nodes have been initialized * @return the mapping from old ordinals to new ordinals * @throws IOException If an error occurs while reading from the merge state */ protected final int[] getNewOrdMapping( - DocIdSetIterator mergedVectorIterator, BitSet initializedNodes) throws IOException { - DocIdSetIterator initializerIterator = null; + KnnVectorValues mergedVectorValues, BitSet initializedNodes) throws IOException { + KnnVectorValues.DocIndexIterator initializerIterator = null; switch (fieldInfo.getVectorEncoding()) { - case BYTE -> initializerIterator = initReader.getByteVectorValues(fieldInfo.name); - case FLOAT32 -> initializerIterator = initReader.getFloatVectorValues(fieldInfo.name); + case BYTE -> initializerIterator = initReader.getByteVectorValues(fieldInfo.name).iterator(); + case FLOAT32 -> + initializerIterator = initReader.getFloatVectorValues(fieldInfo.name).iterator(); } IntIntHashMap newIdToOldOrdinal = new IntIntHashMap(initGraphSize); - int oldOrd = 0; int maxNewDocID = -1; - for (int oldId = initializerIterator.nextDoc(); - oldId != NO_MORE_DOCS; - oldId = initializerIterator.nextDoc()) { - int newId = initDocMap.get(oldId); + for (int docId = initializerIterator.nextDoc(); + docId != NO_MORE_DOCS; + docId = initializerIterator.nextDoc()) { + int newId = initDocMap.get(docId); maxNewDocID = Math.max(newId, maxNewDocID); - newIdToOldOrdinal.put(newId, oldOrd); - oldOrd++; + newIdToOldOrdinal.put(newId, initializerIterator.index()); } if (maxNewDocID == -1) { return new int[0]; } final int[] oldToNewOrdinalMap = new int[initGraphSize]; - int newOrd = 0; + KnnVectorValues.DocIndexIterator mergedVectorIterator = mergedVectorValues.iterator(); for (int newDocId = mergedVectorIterator.nextDoc(); newDocId <= maxNewDocID; newDocId = mergedVectorIterator.nextDoc()) { int hashDocIndex = newIdToOldOrdinal.indexOf(newDocId); if (newIdToOldOrdinal.indexExists(hashDocIndex)) { + int newOrd = mergedVectorIterator.index(); initializedNodes.set(newOrd); oldToNewOrdinalMap[newIdToOldOrdinal.indexGet(hashDocIndex)] = newOrd; } - newOrd++; } return oldToNewOrdinalMap; } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OnHeapHnswGraph.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OnHeapHnswGraph.java index d50c96c48720..a79bcd17d91f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OnHeapHnswGraph.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OnHeapHnswGraph.java @@ -90,7 +90,10 @@ public final class OnHeapHnswGraph extends HnswGraph implements Accountable { * @param node the node whose neighbors are returned, represented as an ordinal on the level 0. 
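The merger changes above (and the ScalarQuantizer changes further down) converge on one iteration idiom: a KnnVectorValues exposes a DocIndexIterator whose index() reports the dense vector ordinal for the current document, so callers no longer keep a separate ordinal counter. A minimal sketch of that loop over float vectors:

import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.search.DocIdSetIterator;

final class VectorIterationSketch {
  static void consume(FloatVectorValues values) throws IOException {
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      int ord = it.index(); // dense ordinal of the vector for this doc
      float[] vector = values.vectorValue(ord);
      // ... consume (doc, ord, vector)
    }
  }
}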
*/ public NeighborArray getNeighbors(int level, int node) { - assert graph[node][level] != null; + assert node < graph.length; + assert level < graph[node].length + : "level=" + level + ", node has only " + graph[node].length + " levels"; + assert graph[node][level] != null : "node=" + node + ", level=" + level; return graph[node][level]; } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomAccessVectorValues.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomAccessVectorValues.java deleted file mode 100644 index e2c7372b667a..000000000000 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomAccessVectorValues.java +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.util.hnsw; - -import java.io.IOException; -import java.util.List; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Bits; - -/** - * Provides random access to vectors by dense ordinal. This interface is used by HNSW-based - * implementations of KNN search. - * - * @lucene.experimental - */ -public interface RandomAccessVectorValues { - - /** Return the number of vector values */ - int size(); - - /** Return the dimension of the returned vector values */ - int dimension(); - - /** - * Creates a new copy of this {@link RandomAccessVectorValues}. This is helpful when you need to - * access different values at once, to avoid overwriting the underlying vector returned. - */ - RandomAccessVectorValues copy() throws IOException; - - /** - * Returns a slice of the underlying {@link IndexInput} that contains the vector values if - * available - */ - default IndexInput getSlice() { - return null; - } - - /** Returns the byte length of the vector values. */ - int getVectorByteLength(); - - /** - * Translates vector ordinal to the correct document ID. By default, this is an identity function. - * - * @param ord the vector ordinal - * @return the document Id for that vector ordinal - */ - default int ordToDoc(int ord) { - return ord; - } - - /** - * Returns the {@link Bits} representing live documents. By default, this is an identity function. - * - * @param acceptDocs the accept docs - * @return the accept docs - */ - default Bits getAcceptOrds(Bits acceptDocs) { - return acceptDocs; - } - - /** Float vector values. */ - interface Floats extends RandomAccessVectorValues { - @Override - RandomAccessVectorValues.Floats copy() throws IOException; - - /** - * Return the vector value indexed at the given ordinal. - * - * @param targetOrd a valid ordinal, ≥ 0 and < {@link #size()}. 
- */ - float[] vectorValue(int targetOrd) throws IOException; - - /** Returns the vector byte length, defaults to dimension multiplied by float byte size */ - @Override - default int getVectorByteLength() { - return dimension() * Float.BYTES; - } - } - - /** Byte vector values. */ - interface Bytes extends RandomAccessVectorValues { - @Override - RandomAccessVectorValues.Bytes copy() throws IOException; - - /** - * Return the vector value indexed at the given ordinal. - * - * @param targetOrd a valid ordinal, ≥ 0 and < {@link #size()}. - */ - byte[] vectorValue(int targetOrd) throws IOException; - - /** Returns the vector byte length, defaults to dimension multiplied by byte size */ - @Override - default int getVectorByteLength() { - return dimension() * Byte.BYTES; - } - } - - /** - * Creates a {@link RandomAccessVectorValues.Floats} from a list of float arrays. - * - * @param vectors the list of float arrays - * @param dim the dimension of the vectors - * @return a {@link RandomAccessVectorValues.Floats} instance - */ - static RandomAccessVectorValues.Floats fromFloats(List vectors, int dim) { - return new RandomAccessVectorValues.Floats() { - @Override - public int size() { - return vectors.size(); - } - - @Override - public int dimension() { - return dim; - } - - @Override - public float[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); - } - - @Override - public RandomAccessVectorValues.Floats copy() { - return this; - } - }; - } - - /** - * Creates a {@link RandomAccessVectorValues.Bytes} from a list of byte arrays. - * - * @param vectors the list of byte arrays - * @param dim the dimension of the vectors - * @return a {@link RandomAccessVectorValues.Bytes} instance - */ - static RandomAccessVectorValues.Bytes fromBytes(List vectors, int dim) { - return new RandomAccessVectorValues.Bytes() { - @Override - public int size() { - return vectors.size(); - } - - @Override - public int dimension() { - return dim; - } - - @Override - public byte[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); - } - - @Override - public RandomAccessVectorValues.Bytes copy() { - return this; - } - }; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java index fc8ed3d004a1..a135df436991 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java @@ -18,6 +18,7 @@ package org.apache.lucene.util.hnsw; import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.util.Bits; /** A {@link RandomVectorScorer} for scoring random nodes in batches against an abstract query. */ @@ -57,14 +58,14 @@ default Bits getAcceptOrds(Bits acceptDocs) { /** Creates a default scorer for random access vectors. */ abstract class AbstractRandomVectorScorer implements RandomVectorScorer { - private final RandomAccessVectorValues values; + private final KnnVectorValues values; /** * Creates a new scorer for the given vector values. 
* * @param values the vector values */ - public AbstractRandomVectorScorer(RandomAccessVectorValues values) { + public AbstractRandomVectorScorer(KnnVectorValues values) { this.values = values; } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java b/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java index dc9f22531dc0..d5e112e04700 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java @@ -171,20 +171,7 @@ public final float overheadRatio(int bitsPerValue) { } /** Simple class that holds a format and a number of bits per value. */ - public static class FormatAndBits { - public final Format format; - public final int bitsPerValue; - - public FormatAndBits(Format format, int bitsPerValue) { - this.format = format; - this.bitsPerValue = bitsPerValue; - } - - @Override - public String toString() { - return "FormatAndBits(format=" + format + " bitsPerValue=" + bitsPerValue + ")"; - } - } + public record FormatAndBits(Format format, int bitsPerValue) {} /** * Try to find the {@link Format} and number of bits per value that would restore from disk the diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/PagedMutable.java b/lucene/core/src/java/org/apache/lucene/util/packed/PagedMutable.java index 16a91de4e325..0653b025fe84 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/PagedMutable.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/PagedMutable.java @@ -47,7 +47,7 @@ public PagedMutable(long size, int pageSize, int bitsPerValue, float acceptableO } PagedMutable(long size, int pageSize, PackedInts.FormatAndBits formatAndBits) { - this(size, pageSize, formatAndBits.bitsPerValue, formatAndBits.format); + this(size, pageSize, formatAndBits.bitsPerValue(), formatAndBits.format()); } PagedMutable(long size, int pageSize, int bitsPerValue, PackedInts.Format format) { diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java index a0fe957fecb4..b90ab8276dd1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java @@ -17,9 +17,10 @@ package org.apache.lucene.util.quantization; import java.io.IOException; +import org.apache.lucene.codecs.lucene95.HasIndexSlice; import org.apache.lucene.index.ByteVectorValues; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.store.IndexInput; /** * A version of {@link ByteVectorValues}, but additionally retrieving score correction offset for @@ -27,31 +28,31 @@ * * @lucene.experimental */ -public abstract class QuantizedByteVectorValues extends DocIdSetIterator { - public abstract float getScoreCorrectionConstant() throws IOException; +public abstract class QuantizedByteVectorValues extends ByteVectorValues implements HasIndexSlice { - public abstract byte[] vectorValue() throws IOException; + public ScalarQuantizer getScalarQuantizer() { + throw new UnsupportedOperationException(); + } - /** Return the dimension of the vectors */ - public abstract int dimension(); + public abstract float getScoreCorrectionConstant(int ord) throws IOException; /** - * Return the number of vectors for this field. + * Return a {@link VectorScorer} for the given query vector. 
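Several hunks in this patch follow the same holder-class-to-record conversion (FormatAndBits here, Util.Result, HnswLock.LockedRow, ScoreDocsAndScoreVariance elsewhere): the record supplies the constructor, accessors, equals/hashCode and toString, and call sites move from field reads to accessor calls, as in the PagedMutable hunk above. A minimal before/after sketch on an invented pair type:

// before: a hand-written holder, read as holder.format / holder.bitsPerValue
final class FormatAndBitsBefore {
  final String format;
  final int bitsPerValue;

  FormatAndBitsBefore(String format, int bitsPerValue) {
    this.format = format;
    this.bitsPerValue = bitsPerValue;
  }
}

// after: a record, read as holder.format() / holder.bitsPerValue()
record FormatAndBitsAfter(String format, int bitsPerValue) {}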
* - * @return the number of vectors returned by this iterator + * @param query the query vector + * @return a {@link VectorScorer} instance or null */ - public abstract int size(); + public VectorScorer scorer(float[] query) throws IOException { + throw new UnsupportedOperationException(); + } @Override - public final long cost() { - return size(); + public QuantizedByteVectorValues copy() throws IOException { + return this; } - /** - * Return a {@link VectorScorer} for the given query vector. - * - * @param query the query vector - * @return a {@link VectorScorer} instance or null - */ - public abstract VectorScorer scorer(float[] query) throws IOException; + @Override + public IndexInput getSlice() { + return null; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizedVectorSimilarity.java b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizedVectorSimilarity.java index 6c11ef7a2130..1c22d191f51b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizedVectorSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizedVectorSimilarity.java @@ -40,10 +40,12 @@ static ScalarQuantizedVectorSimilarity fromVectorSimilarity( VectorSimilarityFunction sim, float constMultiplier, byte bits) { return switch (sim) { case EUCLIDEAN -> new Euclidean(constMultiplier); - case COSINE, DOT_PRODUCT -> new DotProduct( - constMultiplier, bits <= 4 ? VectorUtil::int4DotProduct : VectorUtil::dotProduct); - case MAXIMUM_INNER_PRODUCT -> new MaximumInnerProduct( - constMultiplier, bits <= 4 ? VectorUtil::int4DotProduct : VectorUtil::dotProduct); + case COSINE, DOT_PRODUCT -> + new DotProduct( + constMultiplier, bits <= 4 ? VectorUtil::int4DotProduct : VectorUtil::dotProduct); + case MAXIMUM_INNER_PRODUCT -> + new MaximumInnerProduct( + constMultiplier, bits <= 4 ? 
VectorUtil::int4DotProduct : VectorUtil::dotProduct); }; } diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java index 44c0ac5aca42..3f7bcf6c5c45 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java @@ -25,6 +25,7 @@ import java.util.Random; import java.util.stream.IntStream; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.HitQueue; import org.apache.lucene.search.ScoreDoc; @@ -269,11 +270,12 @@ static ScalarQuantizer fromVectors( if (totalVectorCount == 0) { return new ScalarQuantizer(0f, 0f, bits); } + KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); if (confidenceInterval == 1f) { float min = Float.POSITIVE_INFINITY; float max = Float.NEGATIVE_INFINITY; - while (floatVectorValues.nextDoc() != NO_MORE_DOCS) { - for (float v : floatVectorValues.vectorValue()) { + while (iterator.nextDoc() != NO_MORE_DOCS) { + for (float v : floatVectorValues.vectorValue(iterator.index())) { min = Math.min(min, v); max = Math.max(max, v); } @@ -289,8 +291,8 @@ static ScalarQuantizer fromVectors( if (totalVectorCount <= quantizationSampleSize) { int scratchSize = Math.min(SCRATCH_SIZE, totalVectorCount); int i = 0; - while (floatVectorValues.nextDoc() != NO_MORE_DOCS) { - float[] vectorValue = floatVectorValues.vectorValue(); + while (iterator.nextDoc() != NO_MORE_DOCS) { + float[] vectorValue = floatVectorValues.vectorValue(iterator.index()); System.arraycopy( vectorValue, 0, quantileGatheringScratch, i * vectorValue.length, vectorValue.length); i++; @@ -311,11 +313,11 @@ static ScalarQuantizer fromVectors( for (int i : vectorsToTake) { while (index <= i) { // We cannot use `advance(docId)` as MergedVectorValues does not support it - floatVectorValues.nextDoc(); + iterator.nextDoc(); index++; } - assert floatVectorValues.docID() != NO_MORE_DOCS; - float[] vectorValue = floatVectorValues.vectorValue(); + assert iterator.docID() != NO_MORE_DOCS; + float[] vectorValue = floatVectorValues.vectorValue(iterator.index()); System.arraycopy( vectorValue, 0, quantileGatheringScratch, idx * vectorValue.length, vectorValue.length); idx++; @@ -353,11 +355,16 @@ public static ScalarQuantizer fromVectorsAutoInterval( / (floatVectorValues.dimension() + 1), 1 - 1f / (floatVectorValues.dimension() + 1) }; + KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); if (totalVectorCount <= sampleSize) { int scratchSize = Math.min(SCRATCH_SIZE, totalVectorCount); int i = 0; - while (floatVectorValues.nextDoc() != NO_MORE_DOCS) { - gatherSample(floatVectorValues, quantileGatheringScratch, sampledDocs, i); + while (iterator.nextDoc() != NO_MORE_DOCS) { + gatherSample( + floatVectorValues.vectorValue(iterator.index()), + quantileGatheringScratch, + sampledDocs, + i); i++; if (i == scratchSize) { extractQuantiles(confidenceIntervals, quantileGatheringScratch, upperSum, lowerSum); @@ -374,11 +381,15 @@ public static ScalarQuantizer fromVectorsAutoInterval( for (int i : vectorsToTake) { while (index <= i) { // We cannot use `advance(docId)` as MergedVectorValues does not support it - floatVectorValues.nextDoc(); + iterator.nextDoc(); index++; } - assert floatVectorValues.docID() != NO_MORE_DOCS; - 
gatherSample(floatVectorValues, quantileGatheringScratch, sampledDocs, idx); + assert iterator.docID() != NO_MORE_DOCS; + gatherSample( + floatVectorValues.vectorValue(iterator.index()), + quantileGatheringScratch, + sampledDocs, + idx); idx++; if (idx == SCRATCH_SIZE) { extractQuantiles(confidenceIntervals, quantileGatheringScratch, upperSum, lowerSum); @@ -437,12 +448,7 @@ private static void extractQuantiles( } private static void gatherSample( - FloatVectorValues floatVectorValues, - float[] quantileGatheringScratch, - List sampledDocs, - int i) - throws IOException { - float[] vectorValue = floatVectorValues.vectorValue(); + float[] vectorValue, float[] quantileGatheringScratch, List sampledDocs, int i) { float[] copy = new float[vectorValue.length]; System.arraycopy(vectorValue, 0, copy, 0, vectorValue.length); sampledDocs.add(copy); @@ -614,19 +620,7 @@ protected void swap(int i, int j) { } } - private static class ScoreDocsAndScoreVariance { - private final ScoreDoc[] scoreDocs; - private final float scoreVariance; - - public ScoreDocsAndScoreVariance(ScoreDoc[] scoreDocs, float scoreVariance) { - this.scoreDocs = scoreDocs; - this.scoreVariance = scoreVariance; - } - - public ScoreDoc[] getScoreDocs() { - return scoreDocs; - } - } + private record ScoreDocsAndScoreVariance(ScoreDoc[] scoreDocs, float scoreVariance) {} private static class OnlineMeanAndVar { private double mean = 0.0; @@ -687,7 +681,7 @@ public ScoreErrorCorrelator( for (int i = 0; i < nearestNeighbors.size(); i++) { float queryCorrection = quantizer.quantize(vectors.get(i), query, function); ScoreDocsAndScoreVariance scoreDocsAndScoreVariance = nearestNeighbors.get(i); - ScoreDoc[] scoreDocs = scoreDocsAndScoreVariance.getScoreDocs(); + ScoreDoc[] scoreDocs = scoreDocsAndScoreVariance.scoreDocs(); float scoreVariance = scoreDocsAndScoreVariance.scoreVariance; // calculate the score for the vector against its nearest neighbors but with quantized // scores now diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java index aae362042404..b65f1e570921 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java @@ -19,11 +19,12 @@ import java.io.IOException; import java.lang.foreign.MemorySegment; import java.util.Optional; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; abstract sealed class Lucene99MemorySegmentByteVectorScorer @@ -39,10 +40,8 @@ abstract sealed class Lucene99MemorySegmentByteVectorScorer * returned. 
*/ public static Optional create( - VectorSimilarityFunction type, - IndexInput input, - RandomAccessVectorValues values, - byte[] queryVector) { + VectorSimilarityFunction type, IndexInput input, KnnVectorValues values, byte[] queryVector) { + assert values instanceof ByteVectorValues; input = FilterIndexInput.unwrapOnlyTest(input); if (!(input instanceof MemorySegmentAccessInput msInput)) { return Optional.empty(); @@ -52,13 +51,13 @@ public static Optional create( case COSINE -> Optional.of(new CosineScorer(msInput, values, queryVector)); case DOT_PRODUCT -> Optional.of(new DotProductScorer(msInput, values, queryVector)); case EUCLIDEAN -> Optional.of(new EuclideanScorer(msInput, values, queryVector)); - case MAXIMUM_INNER_PRODUCT -> Optional.of( - new MaxInnerProductScorer(msInput, values, queryVector)); + case MAXIMUM_INNER_PRODUCT -> + Optional.of(new MaxInnerProductScorer(msInput, values, queryVector)); }; } Lucene99MemorySegmentByteVectorScorer( - MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] queryVector) { + MemorySegmentAccessInput input, KnnVectorValues values, byte[] queryVector) { super(values); this.input = input; this.vectorByteSize = values.getVectorByteLength(); @@ -92,7 +91,7 @@ final void checkOrdinal(int ord) { } static final class CosineScorer extends Lucene99MemorySegmentByteVectorScorer { - CosineScorer(MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + CosineScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); } @@ -105,8 +104,7 @@ public float score(int node) throws IOException { } static final class DotProductScorer extends Lucene99MemorySegmentByteVectorScorer { - DotProductScorer( - MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + DotProductScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); } @@ -120,7 +118,7 @@ public float score(int node) throws IOException { } static final class EuclideanScorer extends Lucene99MemorySegmentByteVectorScorer { - EuclideanScorer(MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + EuclideanScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); } @@ -133,8 +131,7 @@ public float score(int node) throws IOException { } static final class MaxInnerProductScorer extends Lucene99MemorySegmentByteVectorScorer { - MaxInnerProductScorer( - MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + MaxInnerProductScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java index 90b3bfb014c3..02c71561122d 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java @@ -19,11 +19,12 @@ import java.io.IOException; import java.lang.foreign.MemorySegment; import java.util.Optional; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import 
org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -33,7 +34,7 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier final int vectorByteSize; final int maxOrd; final MemorySegmentAccessInput input; - final RandomAccessVectorValues values; // to support ordToDoc/getAcceptOrds + final KnnVectorValues values; // to support ordToDoc/getAcceptOrds byte[] scratch1, scratch2; /** @@ -41,7 +42,8 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier * optional is returned. */ static Optional create( - VectorSimilarityFunction type, IndexInput input, RandomAccessVectorValues values) { + VectorSimilarityFunction type, IndexInput input, KnnVectorValues values) { + assert values instanceof ByteVectorValues; input = FilterIndexInput.unwrapOnlyTest(input); if (!(input instanceof MemorySegmentAccessInput msInput)) { return Optional.empty(); @@ -56,7 +58,7 @@ static Optional create( } Lucene99MemorySegmentByteVectorScorerSupplier( - MemorySegmentAccessInput input, RandomAccessVectorValues values) { + MemorySegmentAccessInput input, KnnVectorValues values) { this.input = input; this.values = values; this.vectorByteSize = values.getVectorByteLength(); @@ -103,7 +105,7 @@ final MemorySegment getSecondSegment(int ord) throws IOException { static final class CosineSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { - CosineSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + CosineSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { super(input, values); } @@ -128,7 +130,7 @@ public CosineSupplier copy() throws IOException { static final class DotProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { - DotProductSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + DotProductSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { super(input, values); } @@ -155,7 +157,7 @@ public DotProductSupplier copy() throws IOException { static final class EuclideanSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { - EuclideanSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + EuclideanSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { super(input, values); } @@ -181,7 +183,7 @@ public EuclideanSupplier copy() throws IOException { static final class MaxInnerProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { - MaxInnerProductSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + MaxInnerProductSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { super(input, values); } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java index b085185fb113..bd8cbb2c388a 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java @@ -19,11 +19,13 @@ import java.io.IOException; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import 
org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.codecs.lucene95.HasIndexSlice; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; public class Lucene99MemorySegmentFlatVectorsScorer implements FlatVectorsScorer { @@ -38,15 +40,16 @@ private Lucene99MemorySegmentFlatVectorsScorer(FlatVectorsScorer delegate) { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityType, RandomAccessVectorValues vectorValues) - throws IOException { + VectorSimilarityFunction similarityType, KnnVectorValues vectorValues) throws IOException { // a quantized values here is a wrapping or delegation issue - assert !(vectorValues instanceof RandomAccessQuantizedByteVectorValues); + assert !(vectorValues instanceof QuantizedByteVectorValues); // currently only supports binary vectors - if (vectorValues instanceof RandomAccessVectorValues.Bytes && vectorValues.getSlice() != null) { + if (vectorValues instanceof ByteVectorValues bvv + && bvv instanceof HasIndexSlice byteVectorValues + && byteVectorValues.getSlice() != null) { var scorer = Lucene99MemorySegmentByteVectorScorerSupplier.create( - similarityType, vectorValues.getSlice(), vectorValues); + similarityType, byteVectorValues.getSlice(), vectorValues); if (scorer.isPresent()) { return scorer.get(); } @@ -56,9 +59,7 @@ public RandomVectorScorerSupplier getRandomVectorScorerSupplier( @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityType, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityType, KnnVectorValues vectorValues, float[] target) throws IOException { // currently only supports binary vectors, so always delegate return delegate.getRandomVectorScorer(similarityType, vectorValues, target); @@ -66,17 +67,17 @@ public RandomVectorScorer getRandomVectorScorer( @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityType, - RandomAccessVectorValues vectorValues, - byte[] queryVector) + VectorSimilarityFunction similarityType, KnnVectorValues vectorValues, byte[] queryVector) throws IOException { checkDimensions(queryVector.length, vectorValues.dimension()); // a quantized values here is a wrapping or delegation issue - assert !(vectorValues instanceof RandomAccessQuantizedByteVectorValues); - if (vectorValues instanceof RandomAccessVectorValues.Bytes && vectorValues.getSlice() != null) { + assert !(vectorValues instanceof QuantizedByteVectorValues); + if (vectorValues instanceof ByteVectorValues bvv + && bvv instanceof HasIndexSlice byteVectorValues + && byteVectorValues.getSlice() != null) { var scorer = Lucene99MemorySegmentByteVectorScorer.create( - similarityType, vectorValues.getSlice(), vectorValues, queryVector); + similarityType, byteVectorValues.getSlice(), vectorValues, queryVector); if (scorer.isPresent()) { return scorer.get(); } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentPostingDecodingUtil.java 
b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentPostingDecodingUtil.java new file mode 100644 index 000000000000..c4a3f8232704 --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentPostingDecodingUtil.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import java.io.IOException; +import java.lang.foreign.MemorySegment; +import java.nio.ByteOrder; +import jdk.incubator.vector.IntVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; +import org.apache.lucene.store.IndexInput; + +final class MemorySegmentPostingDecodingUtil extends PostingDecodingUtil { + + private static final VectorSpecies INT_SPECIES = + PanamaVectorConstants.PRERERRED_INT_SPECIES; + + private final MemorySegment memorySegment; + + MemorySegmentPostingDecodingUtil(IndexInput in, MemorySegment memorySegment) { + super(in); + this.memorySegment = memorySegment; + } + + private static void shift( + IntVector vector, int bShift, int dec, int maxIter, int bMask, int[] b, int count, int i) { + for (int j = 0; j <= maxIter; ++j) { + vector + .lanewise(VectorOperators.LSHR, bShift - j * dec) + .lanewise(VectorOperators.AND, bMask) + .intoArray(b, count * j + i); + } + } + + @Override + public void splitInts( + int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask) + throws IOException { + if (count < INT_SPECIES.length()) { + // Not enough data to vectorize without going out-of-bounds. In practice, this branch is never + // used if the bit width is 256, and is used for 2 and 3 bits per value if the bit width is + // 512. + super.splitInts(count, b, bShift, dec, bMask, c, cIndex, cMask); + return; + } + + int maxIter = (bShift - 1) / dec; + long offset = in.getFilePointer(); + long endOffset = offset + count * Integer.BYTES; + int loopBound = INT_SPECIES.loopBound(count - 1); + for (int i = 0; + i < loopBound; + i += INT_SPECIES.length(), offset += INT_SPECIES.length() * Integer.BYTES) { + IntVector vector = + IntVector.fromMemorySegment(INT_SPECIES, memorySegment, offset, ByteOrder.LITTLE_ENDIAN); + shift(vector, bShift, dec, maxIter, bMask, b, count, i); + vector.lanewise(VectorOperators.AND, cMask).intoArray(c, cIndex + i); + } + + // Handle the tail by reading a vector that is aligned with `count` on the right side. 
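The vectorized `splitInts` above shifts and masks each input value several times into `b` and keeps a masked remainder in `c`. A scalar sketch of the same computation is shown below, derived from the shifts and masks in the vector loop; the real scalar fallback lives in `PostingDecodingUtil` and reads the ints from the underlying `IndexInput` rather than taking an array, so treat this as an assumption about its behavior, not a copy of it.

```java
// Sketch: scalar equivalent of splitInts, derived from the vectorized loop above.
// For each of `count` values: store (value >>> (bShift - j*dec)) & bMask into b for
// every shift step j, and keep value & cMask in c.
static void splitIntsScalar(int[] src, int count, int[] b, int bShift, int dec,
    int bMask, int[] c, int cIndex, int cMask) {
  int maxIter = (bShift - 1) / dec;
  for (int i = 0; i < count; ++i) {
    for (int j = 0; j <= maxIter; ++j) {
      b[count * j + i] = (src[i] >>> (bShift - j * dec)) & bMask;
    }
    c[cIndex + i] = src[i] & cMask;
  }
}
```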
+ int i = count - INT_SPECIES.length(); + offset = endOffset - INT_SPECIES.length() * Integer.BYTES; + IntVector vector = + IntVector.fromMemorySegment(INT_SPECIES, memorySegment, offset, ByteOrder.LITTLE_ENDIAN); + shift(vector, bShift, dec, maxIter, bMask, b, count, i); + vector.lanewise(VectorOperators.AND, cMask).intoArray(c, cIndex + i); + + in.seek(endOffset); + } +} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorConstants.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorConstants.java new file mode 100644 index 000000000000..e0c5bbca38ee --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorConstants.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import jdk.incubator.vector.VectorShape; +import jdk.incubator.vector.VectorSpecies; +import org.apache.lucene.util.Constants; + +/** Shared constants for implementations that take advantage of the Panama Vector API. */ +final class PanamaVectorConstants { + + /** Preferred width in bits for vectors. */ + static final int PREFERRED_VECTOR_BITSIZE; + + /** Whether integer vectors can be trusted to actually be fast. 
*/ + static final boolean HAS_FAST_INTEGER_VECTORS; + + static final VectorSpecies PRERERRED_LONG_SPECIES; + static final VectorSpecies PRERERRED_INT_SPECIES; + + static { + // default to platform supported bitsize + int vectorBitSize = VectorShape.preferredShape().vectorBitSize(); + // but allow easy overriding for testing + PREFERRED_VECTOR_BITSIZE = VectorizationProvider.TESTS_VECTOR_SIZE.orElse(vectorBitSize); + + // hotspot misses some SSE intrinsics, workaround it + // to be fair, they do document this thing only works well with AVX2/AVX3 and Neon + boolean isAMD64withoutAVX2 = + Constants.OS_ARCH.equals("amd64") && PREFERRED_VECTOR_BITSIZE < 256; + HAS_FAST_INTEGER_VECTORS = + VectorizationProvider.TESTS_FORCE_INTEGER_VECTORS || (isAMD64withoutAVX2 == false); + + PRERERRED_LONG_SPECIES = + VectorSpecies.of(long.class, VectorShape.forBitSize(PREFERRED_VECTOR_BITSIZE)); + PRERERRED_INT_SPECIES = + VectorSpecies.of(int.class, VectorShape.forBitSize(PREFERRED_VECTOR_BITSIZE)); + } + + private PanamaVectorConstants() {} +} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index 867d0c684cbe..9273f7c5a813 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -31,6 +31,8 @@ import jdk.incubator.vector.IntVector; import jdk.incubator.vector.ShortVector; import jdk.incubator.vector.Vector; +import jdk.incubator.vector.VectorMask; +import jdk.incubator.vector.VectorOperators; import jdk.incubator.vector.VectorShape; import jdk.incubator.vector.VectorSpecies; import org.apache.lucene.util.Constants; @@ -52,20 +54,15 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { // preferred vector sizes, which can be altered for testing private static final VectorSpecies FLOAT_SPECIES; - private static final VectorSpecies INT_SPECIES; + private static final VectorSpecies INT_SPECIES = + PanamaVectorConstants.PRERERRED_INT_SPECIES; private static final VectorSpecies BYTE_SPECIES; private static final VectorSpecies SHORT_SPECIES; static final int VECTOR_BITSIZE; - static final boolean HAS_FAST_INTEGER_VECTORS; static { - // default to platform supported bitsize - int vectorBitSize = VectorShape.preferredShape().vectorBitSize(); - // but allow easy overriding for testing - vectorBitSize = VectorizationProvider.TESTS_VECTOR_SIZE.orElse(vectorBitSize); - INT_SPECIES = VectorSpecies.of(int.class, VectorShape.forBitSize(vectorBitSize)); - VECTOR_BITSIZE = INT_SPECIES.vectorBitSize(); + VECTOR_BITSIZE = PanamaVectorConstants.PREFERRED_VECTOR_BITSIZE; FLOAT_SPECIES = INT_SPECIES.withLanes(float.class); // compute BYTE/SHORT sizes relative to preferred integer vector size if (VECTOR_BITSIZE >= 256) { @@ -76,11 +73,6 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { BYTE_SPECIES = null; SHORT_SPECIES = null; } - // hotspot misses some SSE intrinsics, workaround it - // to be fair, they do document this thing only works well with AVX2/AVX3 and Neon - boolean isAMD64withoutAVX2 = Constants.OS_ARCH.equals("amd64") && VECTOR_BITSIZE < 256; - HAS_FAST_INTEGER_VECTORS = - VectorizationProvider.TESTS_FORCE_INTEGER_VECTORS || (isAMD64withoutAVX2 == false); } // the way FMA should work! 
if available use it, otherwise fall back to mul/add @@ -320,7 +312,7 @@ public static int dotProduct(MemorySegment a, MemorySegment b) { // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit // vectors (256-bit on intel to dodge performance landmines) - if (a.byteSize() >= 16 && HAS_FAST_INTEGER_VECTORS) { + if (a.byteSize() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) { // compute vectorized dot product consistent with VPDPBUSD instruction if (VECTOR_BITSIZE >= 512) { i += BYTE_SPECIES.loopBound(a.byteSize()); @@ -414,7 +406,7 @@ public int int4DotProduct(byte[] a, boolean apacked, byte[] b, boolean bpacked) } else if (VECTOR_BITSIZE == 256) { i += ByteVector.SPECIES_128.loopBound(packed.length); res += dotProductBody256Int4Packed(unpacked, packed, i); - } else if (HAS_FAST_INTEGER_VECTORS) { + } else if (PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) { i += ByteVector.SPECIES_64.loopBound(packed.length); res += dotProductBody128Int4Packed(unpacked, packed, i); } @@ -430,7 +422,7 @@ public int int4DotProduct(byte[] a, boolean apacked, byte[] b, boolean bpacked) } else { if (VECTOR_BITSIZE >= 512 || VECTOR_BITSIZE == 256) { return dotProduct(a, b); - } else if (a.length >= 32 && HAS_FAST_INTEGER_VECTORS) { + } else if (a.length >= 32 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) { i += ByteVector.SPECIES_128.loopBound(a.length); res += int4DotProductBody128(a, b, i); } @@ -588,7 +580,7 @@ public static float cosine(MemorySegment a, MemorySegment b) { // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit // vectors (256-bit on intel to dodge performance landmines) - if (a.byteSize() >= 16 && HAS_FAST_INTEGER_VECTORS) { + if (a.byteSize() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) { final float[] ret; if (VECTOR_BITSIZE >= 512) { i += BYTE_SPECIES.loopBound((int) a.byteSize()); @@ -711,7 +703,7 @@ public static int squareDistance(MemorySegment a, MemorySegment b) { // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit // vectors (256-bit on intel to dodge performance landmines) - if (a.byteSize() >= 16 && HAS_FAST_INTEGER_VECTORS) { + if (a.byteSize() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) { if (VECTOR_BITSIZE >= 256) { i += BYTE_SPECIES.loopBound((int) a.byteSize()); res += squareDistanceBody256(a, b, i); @@ -771,4 +763,27 @@ private static int squareDistanceBody128(MemorySegment a, MemorySegment b, int l // reduce return acc1.add(acc2).reduceLanes(ADD); } + + // Experiments suggest that we need at least 8 lanes so that the overhead of going with the vector + // approach and counting trues on vector masks pays off. 
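The `findNextGEQ` implementation added just below scans a block of ascending doc IDs for the first entry greater than or equal to a target. A scalar reference, identical to the tail loop in the diff, with a tiny usage example (the array contents and targets are made up for illustration):

```java
// Scalar reference for findNextGEQ: first index in [from, to) whose value is >= target,
// or `to` if none. The vectorized fast path below relies on the buffer being sorted ascending.
static int findNextGEQ(int[] buffer, int target, int from, int to) {
  for (int i = from; i < to; ++i) {
    if (buffer[i] >= target) {
      return i;
    }
  }
  return to;
}

// Example (illustrative values): docs = {3, 7, 7, 12, 20}
// findNextGEQ(docs, 10, 0, 5) -> 3 (the slot holding 12)
// findNextGEQ(docs, 25, 0, 5) -> 5 (== to, i.e. block exhausted)
```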
+ private static final boolean ENABLE_FIND_NEXT_GEQ_VECTOR_OPTO = INT_SPECIES.length() >= 8; + + @Override + public int findNextGEQ(int[] buffer, int target, int from, int to) { + if (ENABLE_FIND_NEXT_GEQ_VECTOR_OPTO) { + for (; from + INT_SPECIES.length() < to; from += INT_SPECIES.length() + 1) { + if (buffer[from + INT_SPECIES.length()] >= target) { + IntVector vector = IntVector.fromArray(INT_SPECIES, buffer, from); + VectorMask mask = vector.compare(VectorOperators.LT, target); + return from + mask.trueCount(); + } + } + } + for (int i = from; i < to; ++i) { + if (buffer[i] >= target) { + return i; + } + } + return to; + } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java index 87f7cf2baf76..df9d35bc439b 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java @@ -16,19 +16,25 @@ */ package org.apache.lucene.internal.vectorization; +import java.io.IOException; +import java.lang.foreign.MemorySegment; import java.security.AccessController; import java.security.PrivilegedAction; import java.util.Locale; import java.util.logging.Logger; import jdk.incubator.vector.FloatVector; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.MemorySegmentAccessInput; import org.apache.lucene.util.Constants; import org.apache.lucene.util.SuppressForbidden; /** A vectorization provider that leverages the Panama Vector API. */ final class PanamaVectorizationProvider extends VectorizationProvider { - private final VectorUtilSupport vectorUtilSupport; + // NOTE: Avoid static fields or initializers which rely on the vector API, as these initializers + // would get called before we have a chance to perform sanity checks around the vector API in the + // constructor of this class. Put them in PanamaVectorConstants instead. // Extracted to a method to be able to apply the SuppressForbidden annotation @SuppressWarnings("removal") @@ -37,6 +43,8 @@ private static T doPrivileged(PrivilegedAction action) { return AccessController.doPrivileged(action); } + private final VectorUtilSupport vectorUtilSupport; + PanamaVectorizationProvider() { // hack to work around for JDK-8309727: try { @@ -51,9 +59,9 @@ private static T doPrivileged(PrivilegedAction action) { "We hit initialization failure described in JDK-8309727: " + se); } - if (PanamaVectorUtilSupport.VECTOR_BITSIZE < 128) { + if (PanamaVectorConstants.PREFERRED_VECTOR_BITSIZE < 128) { throw new UnsupportedOperationException( - "Vector bit size is less than 128: " + PanamaVectorUtilSupport.VECTOR_BITSIZE); + "Vector bit size is less than 128: " + PanamaVectorConstants.PREFERRED_VECTOR_BITSIZE); } this.vectorUtilSupport = new PanamaVectorUtilSupport(); @@ -63,11 +71,9 @@ private static T doPrivileged(PrivilegedAction action) { String.format( Locale.ENGLISH, "Java vector incubator API enabled; uses preferredBitSize=%d%s%s", - PanamaVectorUtilSupport.VECTOR_BITSIZE, + PanamaVectorConstants.PREFERRED_VECTOR_BITSIZE, Constants.HAS_FAST_VECTOR_FMA ? "; FMA enabled" : "", - PanamaVectorUtilSupport.HAS_FAST_INTEGER_VECTORS - ? "" - : "; floating-point vectors only")); + PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS ? 
"" : "; floating-point vectors only")); } @Override @@ -79,4 +85,16 @@ public VectorUtilSupport getVectorUtilSupport() { public FlatVectorsScorer getLucene99FlatVectorsScorer() { return Lucene99MemorySegmentFlatVectorsScorer.INSTANCE; } + + @Override + public PostingDecodingUtil newPostingDecodingUtil(IndexInput input) throws IOException { + if (PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS + && input instanceof MemorySegmentAccessInput msai) { + MemorySegment ms = msai.segmentSliceOrNull(0, input.length()); + if (ms != null) { + return new MemorySegmentPostingDecodingUtil(input, ms); + } + } + return new PostingDecodingUtil(input); + } } diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java index 7c22eccdcf1e..8b6452a748ba 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java @@ -27,7 +27,7 @@ public interface MemorySegmentAccessInput extends RandomAccessInput, Cloneable { /** Returns the memory segment for a given position and length, or null. */ - MemorySegment segmentSliceOrNull(long pos, int len) throws IOException; + MemorySegment segmentSliceOrNull(long pos, long len) throws IOException; MemorySegmentAccessInput clone(); } diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index e9805f0f7a64..417511b6f5d5 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -419,7 +419,7 @@ public byte readByte(long pos) throws IOException { } @Override - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(int[] dst, int offset) throws IOException { try { final int len = GroupVIntUtil.readGroupVInt( @@ -530,7 +530,29 @@ public final long length() { @Override public final MemorySegmentIndexInput clone() { - final MemorySegmentIndexInput clone = buildSlice((String) null, 0L, this.length); + ensureOpen(); + ensureAccessible(); + final MemorySegmentIndexInput clone; + if (segments.length == 1) { + clone = + new SingleSegmentImpl( + toString(), + null, // clones don't have an Arena, as they can't close) + segments[0], + length, + chunkSizePower, + confined); + } else { + clone = + new MultiSegmentImpl( + toString(), + null, // clones don't have an Arena, as they can't close) + segments, + ((MultiSegmentImpl) this).offset, + length, + chunkSizePower, + confined); + } try { clone.seek(getFilePointer()); } catch (IOException ioe) { @@ -567,14 +589,23 @@ public final MemorySegmentIndexInput slice(String sliceDescription, long offset, public final MemorySegmentIndexInput slice( String sliceDescription, long offset, long length, ReadAdvice advice) throws IOException { MemorySegmentIndexInput slice = slice(sliceDescription, offset, length); - if (NATIVE_ACCESS.isPresent()) { + if (NATIVE_ACCESS.isPresent() && advice != ReadAdvice.NORMAL) { + // No need to madvise with a normal advice, since it's the OS' default. final NativeAccess nativeAccess = NATIVE_ACCESS.get(); - slice.advise( - 0, - slice.length, - segment -> { - nativeAccess.madvise(segment, advice); - }); + if (length >= nativeAccess.getPageSize()) { + // Only set the read advice if the inner file is large enough. 
Otherwise the cons are likely + // outweighing the pros as we're: + // - potentially overriding the advice of other files that share the same pages, + // - paying the cost of a madvise system call for little value. + // We could align inner files with the page size to avoid the first issue, but again the + // pros don't clearly overweigh the cons. + slice.advise( + 0, + slice.length, + segment -> { + nativeAccess.madvise(segment, advice); + }); + } } return slice; } @@ -583,26 +614,30 @@ public final MemorySegmentIndexInput slice( MemorySegmentIndexInput buildSlice(String sliceDescription, long offset, long length) { ensureOpen(); ensureAccessible(); + final MemorySegment[] slices; + final boolean isClone = offset == 0 && length == this.length; + if (isClone) { + slices = segments; + } else { + final long sliceEnd = offset + length; + final int startIndex = (int) (offset >>> chunkSizePower); + final int endIndex = (int) (sliceEnd >>> chunkSizePower); + // we always allocate one more slice, the last one may be a 0 byte one after truncating with + // asSlice(): + slices = ArrayUtil.copyOfSubArray(segments, startIndex, endIndex + 1); - final long sliceEnd = offset + length; - final int startIndex = (int) (offset >>> chunkSizePower); - final int endIndex = (int) (sliceEnd >>> chunkSizePower); - - // we always allocate one more slice, the last one may be a 0 byte one after truncating with - // asSlice(): - final MemorySegment slices[] = ArrayUtil.copyOfSubArray(segments, startIndex, endIndex + 1); - - // set the last segment's limit for the sliced view. - slices[slices.length - 1] = slices[slices.length - 1].asSlice(0L, sliceEnd & chunkSizeMask); + // set the last segment's limit for the sliced view. + slices[slices.length - 1] = slices[slices.length - 1].asSlice(0L, sliceEnd & chunkSizeMask); - offset = offset & chunkSizeMask; + offset = offset & chunkSizeMask; + } final String newResourceDescription = getFullSliceDescription(sliceDescription); if (slices.length == 1) { return new SingleSegmentImpl( newResourceDescription, null, // clones don't have an Arena, as they can't close) - slices[0].asSlice(offset, length), + isClone ? 
slices[0] : slices[0].asSlice(offset, length), length, chunkSizePower, confined); @@ -742,7 +777,7 @@ public long readLong(long pos) throws IOException { } @Override - public MemorySegment segmentSliceOrNull(long pos, int len) throws IOException { + public MemorySegment segmentSliceOrNull(long pos, long len) throws IOException { try { Objects.checkIndex(pos + len, this.length + 1); return curSegment.asSlice(pos, len); @@ -816,7 +851,8 @@ public long readLong(long pos) throws IOException { return super.readLong(pos + offset); } - public MemorySegment segmentSliceOrNull(long pos, int len) throws IOException { + @Override + public MemorySegment segmentSliceOrNull(long pos, long len) throws IOException { if (pos + len > length) { throw handlePositionalIOOBE(null, "segmentSliceOrNull", pos); } diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInputProvider.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInputProvider.java index 1e5a305219b7..7cbe376678bb 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInputProvider.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInputProvider.java @@ -129,7 +129,9 @@ private final MemorySegment[] map( // internal FileChannel logic) if (preload) { segment.load(); - } else if (nativeAccess.filter(na -> segment.address() % na.getPageSize() == 0).isPresent()) { + } else if (readAdvice != ReadAdvice.NORMAL + && nativeAccess.filter(na -> segment.address() % na.getPageSize() == 0).isPresent()) { + // No need to madvise with ReadAdvice.NORMAL since it is the OS' default read advice. nativeAccess.get().madvise(segment, readAdvice); } segments[segNr] = segment; diff --git a/lucene/core/src/java21/org/apache/lucene/store/PosixNativeAccess.java b/lucene/core/src/java21/org/apache/lucene/store/PosixNativeAccess.java index 80c1665cdd19..05eb61571188 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/PosixNativeAccess.java +++ b/lucene/core/src/java21/org/apache/lucene/store/PosixNativeAccess.java @@ -122,10 +122,7 @@ private static MethodHandle findFunction( @Override public void madvise(MemorySegment segment, ReadAdvice readAdvice) throws IOException { - final Integer advice = mapReadAdvice(readAdvice); - if (advice == null) { - return; // do nothing - } + final int advice = mapReadAdvice(readAdvice); madvise(segment, advice); } @@ -156,7 +153,7 @@ private void madvise(MemorySegment segment, int advice) throws IOException { } } - private Integer mapReadAdvice(ReadAdvice readAdvice) { + private int mapReadAdvice(ReadAdvice readAdvice) { return switch (readAdvice) { case NORMAL -> POSIX_MADV_NORMAL; case RANDOM -> POSIX_MADV_RANDOM; diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 8b6724966018..f7a246c76849 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
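The `META-INF/services` entries changed in this hunk and the next are how Lucene's SPI loader discovers the default codec and postings format by name. A minimal lookup sketch, assuming the new classes keep the usual naming convention so that both resolve under the name "Lucene101":

```java
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;

// Sketch: resolving the newly registered implementations by SPI name.
// The "Lucene101" names are assumed from Lucene's naming convention, not stated in the diff.
public class CodecLookupSketch {
  public static void main(String[] args) {
    Codec codec = Codec.forName("Lucene101");
    PostingsFormat pf = PostingsFormat.forName("Lucene101");
    System.out.println(codec.getName() + " / " + pf.getName());
  }
}
```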
-org.apache.lucene.codecs.lucene912.Lucene912Codec +org.apache.lucene.codecs.lucene101.Lucene101Codec diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index cd63926a287d..874ebafd971b 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat +org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat diff --git a/lucene/core/src/test/org/apache/lucene/TestDemo.java b/lucene/core/src/test/org/apache/lucene/TestDemo.java index 6c608e1d0b12..9f6f880e1dcf 100644 --- a/lucene/core/src/test/org/apache/lucene/TestDemo.java +++ b/lucene/core/src/test/org/apache/lucene/TestDemo.java @@ -70,7 +70,7 @@ public void testDemo() throws IOException { Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = searcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results. StoredFields storedFields = searcher.storedFields(); diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java index 6e51a6bbb0c3..4737694702f1 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java @@ -625,7 +625,7 @@ private void assertSameLanguage(Automaton expected, Automaton actual) { Operations.removeDeadStates(expected), DEFAULT_DETERMINIZE_WORK_LIMIT); Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_DETERMINIZE_WORK_LIMIT); - if (Operations.sameLanguage(expectedDet, actualDet) == false) { + if (AutomatonTestUtil.sameLanguage(expectedDet, actualDet) == false) { Set expectedPaths = toPathStrings(expectedDet); Set actualPaths = toPathStrings(actualDet); StringBuilder b = new StringBuilder(); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java index 9bce1f10a432..6fe9a685e1b4 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java @@ -35,6 +35,8 @@ import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues; import org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorScorer; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -42,7 +44,6 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.hamcrest.Matcher; import org.hamcrest.MatcherAssert; @@ -174,13 +175,13 @@ public void testCheckFloatDimensions() throws IOException { } } - RandomAccessVectorValues byteVectorValues( - int dims, int size, 
IndexInput in, VectorSimilarityFunction sim) throws IOException { + ByteVectorValues byteVectorValues(int dims, int size, IndexInput in, VectorSimilarityFunction sim) + throws IOException { return new OffHeapByteVectorValues.DenseOffHeapVectorValues( dims, size, in.slice("byteValues", 0, in.length()), dims, flatVectorsScorer, sim); } - RandomAccessVectorValues floatVectorValues( + FloatVectorValues floatVectorValues( int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { return new OffHeapFloatVectorValues.DenseOffHeapVectorValues( dims, diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java new file mode 100644 index 000000000000..3e346f3eb206 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestForDeltaUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 1, 31 - 7); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 1, (int) PackedInts.maxValue(bpv)); + } + } + + final Directory d = new ByteBuffersDirectory(); + final long endPointer; + + { + // encode + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + + for (int i = 0; i < iterations; ++i) { + int[] source = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + } + forDeltaUtil.encodeDeltas(source, out); + } + endPointer = out.getFilePointer(); + out.close(); + } + + { + // decode + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + PostingDecodingUtil pdu = + 
Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); + ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + for (int i = 0; i < iterations; ++i) { + int base = 0; + final int[] restored = new int[ForUtil.BLOCK_SIZE]; + forDeltaUtil.decodeAndPrefixSum(pdu, base, restored); + final int[] expected = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + expected[j] = values[i * ForUtil.BLOCK_SIZE + j]; + if (j > 0) { + expected[j] += expected[j - 1]; + } else { + expected[j] += base; + } + } + assertArrayEquals(Arrays.toString(restored), expected, restored); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + } + + d.close(); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForUtil.java new file mode 100644 index 000000000000..d93cb0ab3ecd --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForUtil.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
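TestForDeltaUtil above rebuilds its expected values as a running sum: the first restored value is the base plus the first delta, and every later value adds the next delta onto the previous result. A tiny worked illustration of that delta/prefix-sum relationship, with made-up numbers:

```java
// Illustrative only: the delta/prefix-sum relationship TestForDeltaUtil verifies.
int base = 100;                 // made-up starting doc ID
int[] deltas = {1, 3, 2, 7};    // made-up encoded deltas (always >= 1)
int[] docs = new int[deltas.length];
int prev = base;
for (int j = 0; j < deltas.length; ++j) {
  prev += deltas[j];            // expected[j] = expected[j-1] + delta[j], with expected[-1] = base
  docs[j] = prev;
}
// docs == {101, 104, 106, 113}
```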
+ */ +package org.apache.lucene.codecs.lucene101; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestForUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 1, 31); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv)); + } + } + + final Directory d = new ByteBuffersDirectory(); + final long endPointer; + + { + // encode + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final ForUtil forUtil = new ForUtil(); + + for (int i = 0; i < iterations; ++i) { + int[] source = new int[ForUtil.BLOCK_SIZE]; + long or = 0; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + or |= source[j]; + } + final int bpv = PackedInts.bitsRequired(or); + out.writeByte((byte) bpv); + forUtil.encode(source, bpv, out); + } + endPointer = out.getFilePointer(); + out.close(); + } + + { + // decode + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + PostingDecodingUtil pdu = + Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); + ForUtil forUtil = new ForUtil(); + for (int i = 0; i < iterations; ++i) { + final int bitsPerValue = in.readByte(); + final long currentFilePointer = in.getFilePointer(); + final int[] restored = new int[ForUtil.BLOCK_SIZE]; + forUtil.decode(bitsPerValue, pdu, restored); + int[] ints = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + ints[j] = Math.toIntExact(restored[j]); + } + assertArrayEquals( + Arrays.toString(ints), + ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE), + ints); + assertEquals(ForUtil.numBytes(bitsPerValue), in.getFilePointer() - currentFilePointer); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + } + + d.close(); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormat.java new file mode 100644 index 000000000000..6b0ff1fe5e3b --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormat.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader.MutableImpactList; +import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; +import org.apache.lucene.codecs.lucene90.blocktree.Stats; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.index.BasePostingsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestLucene101PostingsFormat extends BasePostingsFormatTestCase { + + @Override + protected Codec getCodec() { + return TestUtil.alwaysPostingsFormat(new Lucene101PostingsFormat()); + } + + public void testVInt15() throws IOException { + byte[] bytes = new byte[5]; + ByteArrayDataOutput out = new ByteArrayDataOutput(bytes); + ByteArrayDataInput in = new ByteArrayDataInput(); + for (int i : new int[] {0, 1, 127, 128, 32767, 32768, Integer.MAX_VALUE}) { + out.reset(bytes); + Lucene101PostingsWriter.writeVInt15(out, i); + in.reset(bytes, 0, out.getPosition()); + assertEquals(i, Lucene101PostingsReader.readVInt15(in)); + assertEquals(out.getPosition(), in.getPosition()); + } + } + + public void testVLong15() throws IOException { + byte[] bytes = new byte[9]; + ByteArrayDataOutput out = new ByteArrayDataOutput(bytes); + ByteArrayDataInput in = new ByteArrayDataInput(); + for (long i : new long[] {0, 1, 127, 128, 32767, 32768, Integer.MAX_VALUE, Long.MAX_VALUE}) { + out.reset(bytes); + Lucene101PostingsWriter.writeVLong15(out, i); + in.reset(bytes, 0, out.getPosition()); + assertEquals(i, Lucene101PostingsReader.readVLong15(in)); + assertEquals(out.getPosition(), in.getPosition()); + } + } + + /** Make sure the final sub-block(s) are not skipped. 
*/ + public void testFinalBlock() throws Exception { + Directory d = newDirectory(); + IndexWriter w = new IndexWriter(d, new IndexWriterConfig(new MockAnalyzer(random()))); + for (int i = 0; i < 25; i++) { + Document doc = new Document(); + doc.add(newStringField("field", Character.toString((char) (97 + i)), Field.Store.NO)); + doc.add(newStringField("field", "z" + Character.toString((char) (97 + i)), Field.Store.NO)); + w.addDocument(doc); + } + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + assertEquals(1, r.leaves().size()); + FieldReader field = (FieldReader) r.leaves().get(0).reader().terms("field"); + // We should see exactly two blocks: one root block (prefix empty string) and one block for z* + // terms (prefix z): + Stats stats = field.getStats(); + assertEquals(0, stats.floorBlockCount); + assertEquals(2, stats.nonFloorBlockCount); + r.close(); + w.close(); + d.close(); + } + + public void testImpactSerialization() throws IOException { + // omit norms and omit freqs + doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L))); + + // omit freqs + doTestImpactSerialization(Collections.singletonList(new Impact(1, 42L))); + // omit freqs with very large norms + doTestImpactSerialization(Collections.singletonList(new Impact(1, -100L))); + + // omit norms + doTestImpactSerialization(Collections.singletonList(new Impact(30, 1L))); + // omit norms with large freq + doTestImpactSerialization(Collections.singletonList(new Impact(500, 1L))); + + // freqs and norms, basic + doTestImpactSerialization( + Arrays.asList( + new Impact(1, 7L), + new Impact(3, 9L), + new Impact(7, 10L), + new Impact(15, 11L), + new Impact(20, 13L), + new Impact(28, 14L))); + + // freqs and norms, high values + doTestImpactSerialization( + Arrays.asList( + new Impact(2, 2L), + new Impact(10, 10L), + new Impact(12, 50L), + new Impact(50, -100L), + new Impact(1000, -80L), + new Impact(1005, -3L))); + } + + private void doTestImpactSerialization(List impacts) throws IOException { + CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator(); + for (Impact impact : impacts) { + acc.add(impact.freq, impact.norm); + } + try (Directory dir = newDirectory()) { + try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) { + Lucene101PostingsWriter.writeImpacts(acc.getCompetitiveFreqNormPairs(), out); + } + try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) { + byte[] b = new byte[Math.toIntExact(in.length())]; + in.readBytes(b, 0, b.length); + List impacts2 = + Lucene101PostingsReader.readImpacts( + new ByteArrayDataInput(b), + new MutableImpactList(impacts.size() + random().nextInt(3))); + assertEquals(impacts, impacts2); + } + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPForUtil.java new file mode 100644 index 000000000000..4e9ab4b55ee2 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPForUtil.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestPForUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = createTestData(iterations, 31); + + final Directory d = new ByteBuffersDirectory(); + final long endPointer = encodeTestData(iterations, values, d); + + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + PostingDecodingUtil pdu = + Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); + final PForUtil pforUtil = new PForUtil(); + for (int i = 0; i < iterations; ++i) { + if (random().nextInt(5) == 0) { + PForUtil.skip(in); + continue; + } + final int[] restored = new int[ForUtil.BLOCK_SIZE]; + pforUtil.decode(pdu, restored); + int[] ints = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + ints[j] = Math.toIntExact(restored[j]); + } + assertArrayEquals( + Arrays.toString(ints), + ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE), + ints); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + + d.close(); + } + + private int[] createTestData(int iterations, int maxBpv) { + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 0, maxBpv); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv)); + if (random().nextInt(100) == 0) { + final int exceptionBpv; + if (random().nextInt(10) == 0) { + exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 9, 16), maxBpv); + } else { + exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 1, 8), maxBpv); + } + values[i * ForUtil.BLOCK_SIZE + j] |= random().nextInt(1 << (exceptionBpv - bpv)) << bpv; + } + } + } + + return values; + } + + private long encodeTestData(int iterations, int[] values, Directory d) throws IOException { + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final PForUtil pforUtil = new PForUtil(); + + for (int i = 0; i < iterations; ++i) { + int[] source = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + } + pforUtil.encode(source, out); + } + final long endPointer = out.getFilePointer(); + out.close(); + + return endPointer; + } +} diff --git 
a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPostingsUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPostingsUtil.java new file mode 100644 index 000000000000..5d02d0561e33 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPostingsUtil.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestPostingsUtil extends LuceneTestCase { + + // checks for bug described in https://github.com/apache/lucene/issues/13373 + public void testIntegerOverflow() throws IOException { + // Size that writes the first value as a regular vint + int randomSize1 = random().nextInt(1, 3); + // Size that writes the first value as a group vint + int randomSize2 = random().nextInt(4, ForUtil.BLOCK_SIZE); + doTestIntegerOverflow(randomSize1); + doTestIntegerOverflow(randomSize2); + } + + private void doTestIntegerOverflow(int size) throws IOException { + final int[] docDeltaBuffer = new int[size]; + final int[] freqBuffer = new int[size]; + + final int delta = 1 << 30; + docDeltaBuffer[0] = delta; + try (Directory dir = newDirectory()) { + try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) { + // In old implementation, this would cause integer overflow exception. 
+ PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true); + } + int[] restoredDocs = new int[size]; + int[] restoredFreqs = new int[size]; + try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) { + PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true); + } + assertEquals(delta, restoredDocs[0]); + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java index c72bcfeea468..69fbf96f6da8 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java @@ -18,7 +18,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; @@ -31,7 +31,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene912Codec(Lucene912Codec.Mode.BEST_COMPRESSION); + return new Lucene101Codec(Lucene101Codec.Mode.BEST_COMPRESSION); } /** @@ -42,7 +42,7 @@ public void testMixedCompressions() throws Exception { for (int i = 0; i < 10; i++) { IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setCodec( - new Lucene912Codec(RandomPicks.randomFrom(random(), Lucene912Codec.Mode.values()))); + new Lucene101Codec(RandomPicks.randomFrom(random(), Lucene101Codec.Mode.values()))); IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); doc.add(new StoredField("field1", "value1")); @@ -72,7 +72,7 @@ public void testInvalidOptions() { expectThrows( NullPointerException.class, () -> { - new Lucene912Codec(null); + new Lucene101Codec(null); }); expectThrows( diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90TermVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90TermVectorsFormat.java index 6660b9b64294..8cc6dda1c1d7 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90TermVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90TermVectorsFormat.java @@ -16,7 +16,24 @@ */ package org.apache.lucene.codecs.lucene90; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.TermVectors; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FilterDirectory; +import org.apache.lucene.store.FilterIndexInput; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.tests.codecs.compressing.dummy.DummyCompressingCodec; import 
org.apache.lucene.tests.index.BaseTermVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; @@ -25,4 +42,84 @@ public class TestLucene90TermVectorsFormat extends BaseTermVectorsFormatTestCase protected Codec getCodec() { return TestUtil.getDefaultCodec(); } + + private static class CountingPrefetchDirectory extends FilterDirectory { + + private final AtomicInteger counter; + + CountingPrefetchDirectory(Directory in, AtomicInteger counter) { + super(in); + this.counter = counter; + } + + @Override + public IndexInput openInput(String name, IOContext context) throws IOException { + return new CountingPrefetchIndexInput(super.openInput(name, context), counter); + } + } + + private static class CountingPrefetchIndexInput extends FilterIndexInput { + + private final AtomicInteger counter; + + public CountingPrefetchIndexInput(IndexInput input, AtomicInteger counter) { + super(input.toString(), input); + this.counter = counter; + } + + @Override + public void prefetch(long offset, long length) throws IOException { + in.prefetch(offset, length); + counter.incrementAndGet(); + } + + @Override + public IndexInput clone() { + return new CountingPrefetchIndexInput(in.clone(), counter); + } + + @Override + public IndexInput slice(String sliceDescription, long offset, long length) throws IOException { + return new CountingPrefetchIndexInput(in.slice(sliceDescription, offset, length), counter); + } + } + + public void testSkipRedundantPrefetches() throws IOException { + // Use the "dummy" codec, which has the same base class as Lucene90StoredFieldsFormat but allows + // configuring the number of docs per chunk. + Codec codec = new DummyCompressingCodec(1 << 10, 2, false, 16); + try (Directory origDir = newDirectory()) { + AtomicInteger counter = new AtomicInteger(); + Directory dir = new CountingPrefetchDirectory(origDir, counter); + try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig().setCodec(codec))) { + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + ft.setStoreTermVectors(true); + for (int i = 0; i < 100; ++i) { + Document doc = new Document(); + doc.add(new Field("content", Integer.toString(i), ft)); + w.addDocument(doc); + } + w.forceMerge(1); + } + + try (IndexReader reader = DirectoryReader.open(dir)) { + TermVectors termVectors = reader.termVectors(); + counter.set(0); + assertEquals(0, counter.get()); + termVectors.prefetch(0); + assertEquals(1, counter.get()); + termVectors.prefetch(1); + // This format has 2 docs per block, so the second prefetch is skipped + assertEquals(1, counter.get()); + termVectors.prefetch(15); + assertEquals(2, counter.get()); + termVectors.prefetch(14); + // 14 is in the same block as 15, so the prefetch was skipped + assertEquals(2, counter.get()); + // Already prefetched in the past, so skipped again + termVectors.prefetch(1); + assertEquals(2, counter.get()); + } + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index 3098ca7fbf33..d2aa9b8d0542 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -17,7 +17,6 @@ package org.apache.lucene.codecs.lucene99; import static java.lang.String.format; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static 
org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.oneOf; @@ -29,7 +28,6 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -42,11 +40,14 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.SameThreadExecutorService; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.quantization.QuantizedByteVectorValues; @@ -67,26 +68,112 @@ public void setUp() throws Exception { if (random().nextBoolean()) { confidenceInterval = 0f; } - format = - new Lucene99HnswScalarQuantizedVectorsFormat( - Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN, - Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH, - 1, - bits, - random().nextBoolean(), - confidenceInterval, - null); + format = getKnnFormat(bits); super.setUp(); } @Override protected Codec getCodec() { - return new Lucene912Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return format; + return TestUtil.alwaysKnnVectorsFormat(format); + } + + private final KnnVectorsFormat getKnnFormat(int bits) { + return new Lucene99HnswScalarQuantizedVectorsFormat( + Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN, + Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH, + 1, + bits, + bits == 4 ? random().nextBoolean() : false, + confidenceInterval, + null); + } + + // verifies it's fine to change your mind on the number of bits quantization you want for the same + // field in the same index by changing up the Codec. This is allowed because at merge time we + // requantize the vectors. 
+ public void testMixedQuantizedBits() throws Exception { + + try (Directory dir = newDirectory()) { + + // add first vector using 4 bit quantization, then close index: + try (IndexWriter w = + new IndexWriter( + dir, + newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(getKnnFormat(4))))) { + + Document doc = new Document(); + doc.add( + new KnnFloatVectorField( + "f", new float[] {0.6f, 0.8f}, VectorSimilarityFunction.DOT_PRODUCT)); + w.addDocument(doc); } - }; + + // create another writer using 7 bit quantization and add 2nd vector + try (IndexWriter w = + new IndexWriter( + dir, + newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(getKnnFormat(7))))) { + + Document doc = new Document(); + doc.add( + new KnnFloatVectorField( + "f", new float[] {0.8f, 0.6f}, VectorSimilarityFunction.DOT_PRODUCT)); + w.addDocument(doc); + w.forceMerge(1); + } + + // confirm searching works: we find both vectors + try (IndexReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery q = new KnnFloatVectorQuery("f", new float[] {0.7f, 0.7f}, 10); + TopDocs topDocs = searcher.search(q, 100); + assertEquals(2, topDocs.totalHits.value()); + } + } + } + + // verifies you can change your mind and enable quantization on a previously indexed vector field + // without quantization + public void testMixedQuantizedUnQuantized() throws Exception { + + try (Directory dir = newDirectory()) { + + // add first vector using no quantization + try (IndexWriter w = + new IndexWriter( + dir, + newIndexWriterConfig() + .setCodec(TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat())))) { + + Document doc = new Document(); + doc.add( + new KnnFloatVectorField( + "f", new float[] {0.6f, 0.8f}, VectorSimilarityFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + + // create another writer using (7 bit) quantization and add 2nd vector + try (IndexWriter w = + new IndexWriter( + dir, + newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(getKnnFormat(7))))) { + + Document doc = new Document(); + doc.add( + new KnnFloatVectorField( + "f", new float[] {0.8f, 0.6f}, VectorSimilarityFunction.DOT_PRODUCT)); + w.addDocument(doc); + w.forceMerge(1); + } + + // confirm searching works: we find both vectors + try (IndexReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = newSearcher(reader); + KnnFloatVectorQuery q = new KnnFloatVectorQuery("f", new float[] {0.7f, 0.7f}, 10); + TopDocs topDocs = searcher.search(q, 100); + assertEquals(2, topDocs.totalHits.value()); + } + } } public void testQuantizationScoringEdgeCase() throws Exception { @@ -97,13 +184,9 @@ public void testQuantizationScoringEdgeCase() throws Exception { dir, newIndexWriterConfig() .setCodec( - new Lucene912Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99HnswScalarQuantizedVectorsFormat( - 16, 100, 1, (byte) 7, false, 0.9f, null); - } - }))) { + TestUtil.alwaysKnnVectorsFormat( + new Lucene99HnswScalarQuantizedVectorsFormat( + 16, 100, 1, (byte) 7, false, 0.9f, null))))) { for (float[] vector : vectors) { Document doc = new Document(); doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.DOT_PRODUCT)); @@ -116,7 +199,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { TopKnnCollector topKnnCollector = new TopKnnCollector(5, Integer.MAX_VALUE); r.searchNearestVectors("f", new float[] {0.6f, 0.8f}, topKnnCollector, null); TopDocs topDocs = 
topKnnCollector.topDocs(); - assertEquals(3, topDocs.totalHits.value); + assertEquals(3, topDocs.totalHits.value()); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { assertTrue(scoreDoc.score >= 0f); } @@ -125,7 +208,6 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } public void testQuantizedVectorsWriteAndRead() throws Exception { - // create lucene directory with codec int numVectors = 1 + random().nextInt(50); VectorSimilarityFunction similarityFunction = randomSimilarity(); int dim = random().nextInt(64) + 1; @@ -158,6 +240,7 @@ public void testQuantizedVectorsWriteAndRead() throws Exception { } float[] randomlyReusedVector = new float[dim]; + // create lucene directory with codec try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter( @@ -192,14 +275,13 @@ public void testQuantizedVectorsWriteAndRead() throws Exception { assertNotNull(hnswReader.getQuantizationState("f")); QuantizedByteVectorValues quantizedByteVectorValues = hnswReader.getQuantizedVectorValues("f"); - int docId = -1; - while ((docId = quantizedByteVectorValues.nextDoc()) != NO_MORE_DOCS) { - byte[] vector = quantizedByteVectorValues.vectorValue(); - float offset = quantizedByteVectorValues.getScoreCorrectionConstant(); + for (int ord = 0; ord < quantizedByteVectorValues.size(); ord++) { + byte[] vector = quantizedByteVectorValues.vectorValue(ord); + float offset = quantizedByteVectorValues.getScoreCorrectionConstant(ord); for (int i = 0; i < dim; i++) { - assertEquals(vector[i], expectedVectors[docId][i]); + assertEquals(vector[i], expectedVectors[ord][i]); } - assertEquals(offset, expectedCorrections[docId], 0.00001f); + assertEquals(offset, expectedCorrections[ord], 0.00001f); } } else { fail("reader is not Lucene99HnswVectorsReader"); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java index 58e6c27e326a..3ffeef501e87 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java @@ -24,10 +24,8 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -44,27 +42,24 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.RandomVectorScorer; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizer; public class TestLucene99ScalarQuantizedVectorScorer extends LuceneTestCase { private static Codec getCodec(int bits, boolean compress) { - return new Lucene912Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99HnswScalarQuantizedVectorsFormat( + 
return TestUtil.alwaysKnnVectorsFormat( + new Lucene99HnswScalarQuantizedVectorsFormat( Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN, Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH, 1, bits, compress, 0f, - null); - } - }; + null)); } public void testNonZeroScores() throws IOException { @@ -100,8 +95,8 @@ private void vectorNonZeroScoringTest(int bits, boolean compress) throws IOExcep try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { Lucene99ScalarQuantizedVectorScorer scorer = new Lucene99ScalarQuantizedVectorScorer(new DefaultFlatVectorScorer()); - RandomAccessQuantizedByteVectorValues values = - new RandomAccessQuantizedByteVectorValues() { + QuantizedByteVectorValues values = + new QuantizedByteVectorValues() { @Override public int dimension() { return 32; @@ -128,7 +123,7 @@ public float getScoreCorrectionConstant(int ord) { } @Override - public RandomAccessQuantizedByteVectorValues copy() throws IOException { + public QuantizedByteVectorValues copy() throws IOException { return this; } @@ -165,7 +160,7 @@ public void testScoringUncompressedInt4() throws Exception { } public void testScoringInt7() throws Exception { - vectorScoringTest(7, random().nextBoolean()); + vectorScoringTest(7, false); } private void vectorScoringTest(int bits, boolean compress) throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java index 094d90ba5a24..cf1436f21092 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java @@ -28,7 +28,6 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -37,11 +36,13 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.quantization.QuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizer; @@ -62,18 +63,14 @@ public void setUp() throws Exception { confidenceInterval = 0f; } format = - new Lucene99ScalarQuantizedVectorsFormat(confidenceInterval, bits, random().nextBoolean()); + new Lucene99ScalarQuantizedVectorsFormat( + confidenceInterval, bits, bits == 4 ? 
random().nextBoolean() : false); super.setUp(); } @Override protected Codec getCodec() { - return new Lucene912Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return format; - } - }; + return TestUtil.alwaysKnnVectorsFormat(format); } public void testSearch() throws Exception { @@ -100,6 +97,11 @@ public void testSearch() throws Exception { } } + @Override + public void testRecall() { + // ignore this test since this class always returns no results from search + } + public void testQuantizedVectorsWriteAndRead() throws Exception { // create lucene directory with codec int numVectors = 1 + random().nextInt(50); @@ -172,9 +174,10 @@ public void testQuantizedVectorsWriteAndRead() throws Exception { QuantizedByteVectorValues quantizedByteVectorValues = quantizedReader.getQuantizedVectorValues("f"); int docId = -1; - while ((docId = quantizedByteVectorValues.nextDoc()) != NO_MORE_DOCS) { - byte[] vector = quantizedByteVectorValues.vectorValue(); - float offset = quantizedByteVectorValues.getScoreCorrectionConstant(); + KnnVectorValues.DocIndexIterator iter = quantizedByteVectorValues.iterator(); + for (docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { + byte[] vector = quantizedByteVectorValues.vectorValue(iter.index()); + float offset = quantizedByteVectorValues.getScoreCorrectionConstant(iter.index()); for (int i = 0; i < dim; i++) { assertEquals(vector[i], expectedVectors[docId][i]); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java index 0e93934855d7..e617d5272940 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java @@ -119,7 +119,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); StoredFields storedFields = isearcher.storedFields(); // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java index b9ccab0935ad..ebae291f35e2 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java @@ -210,7 +210,7 @@ public void assertQuery(Term t, Directory dir, int num) throws IOException { IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = newSearcher(reader); TopDocs search = searcher.search(new TermQuery(t), num + 10); - assertEquals(num, search.totalHits.value); + assertEquals(num, search.totalHits.value()); reader.close(); } diff --git a/lucene/core/src/test/org/apache/lucene/document/TestDocValuesRangeIterator.java b/lucene/core/src/test/org/apache/lucene/document/TestDocValuesRangeIterator.java deleted file mode 100644 index 5b24608320a2..000000000000 --- a/lucene/core/src/test/org/apache/lucene/document/TestDocValuesRangeIterator.java +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Licensed to the Apache 
Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.document; - -import java.io.IOException; -import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.lucene.index.DocValuesSkipper; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.TwoPhaseIterator; -import org.apache.lucene.tests.util.LuceneTestCase; - -public class TestDocValuesRangeIterator extends LuceneTestCase { - - public void testSingleLevel() throws IOException { - doTestBasics(false); - } - - public void testMultipleLevels() throws IOException { - doTestBasics(true); - } - - private void doTestBasics(boolean doLevels) throws IOException { - long queryMin = 10; - long queryMax = 20; - - // Fake numeric doc values so that: - // docs 0-256 all match - // docs in 256-512 are all greater than queryMax - // docs in 512-768 are all less than queryMin - // docs in 768-1024 have some docs that match the range, others not - // docs in 1024-2048 follow a similar pattern as docs in 0-1024 except that not all docs have a - // value - NumericDocValues values = - new NumericDocValues() { - - int doc = -1; - - @Override - public boolean advanceExact(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - if (target < 1024) { - // dense up to 1024 - return doc = target; - } else if (doc < 2047) { - // 50% docs have a value up to 2048 - return doc = target + (target & 1); - } else { - return doc = DocIdSetIterator.NO_MORE_DOCS; - } - } - - @Override - public long longValue() throws IOException { - int d = doc % 1024; - if (d < 128) { - return (queryMin + queryMax) >> 1; - } else if (d < 256) { - return queryMax + 1; - } else if (d < 512) { - return queryMin - 1; - } else { - return switch ((d / 2) % 3) { - case 0 -> queryMin - 1; - case 1 -> queryMax + 1; - case 2 -> (queryMin + queryMax) >> 1; - default -> throw new AssertionError(); - }; - } - } - - @Override - public long cost() { - return 42; - } - }; - - AtomicBoolean twoPhaseCalled = new AtomicBoolean(); - TwoPhaseIterator twoPhase = - new TwoPhaseIterator(values) { - - @Override - public boolean matches() throws IOException { - twoPhaseCalled.set(true); - long v = values.longValue(); - return v >= queryMin && v <= queryMax; - } - - @Override - public float matchCost() { - return 2f; // 2 comparisons - } - }; - - DocValuesSkipper skipper = - new DocValuesSkipper() { - - int doc = -1; - - @Override - public void advance(int target) throws IOException { - doc = target; - } - - @Override - public int numLevels() { - return 
doLevels ? 3 : 1; - } - - @Override - public int minDocID(int level) { - int rangeLog = 9 - numLevels() + level; - - // the level is the log2 of the interval - if (doc < 0) { - return -1; - } else if (doc >= 2048) { - return DocIdSetIterator.NO_MORE_DOCS; - } else { - int mask = (1 << rangeLog) - 1; - // prior multiple of 2^level - return doc & ~mask; - } - } - - @Override - public int maxDocID(int level) { - int rangeLog = 9 - numLevels() + level; - - int minDocID = minDocID(level); - return switch (minDocID) { - case -1 -> -1; - case DocIdSetIterator.NO_MORE_DOCS -> DocIdSetIterator.NO_MORE_DOCS; - default -> minDocID + (1 << rangeLog) - 1; - }; - } - - @Override - public long minValue(int level) { - int d = doc % 1024; - if (d < 128) { - return queryMin; - } else if (d < 256) { - return queryMax + 1; - } else if (d < 768) { - return queryMin - 1; - } else { - return queryMin - 1; - } - } - - @Override - public long maxValue(int level) { - int d = doc % 1024; - if (d < 128) { - return queryMax; - } else if (d < 256) { - return queryMax + 1; - } else if (d < 768) { - return queryMin - 1; - } else { - return queryMax + 1; - } - } - - @Override - public int docCount(int level) { - int rangeLog = 9 - numLevels() + level; - - if (doc < 1024) { - return 1 << rangeLog; - } else { - // half docs have a value - return 1 << rangeLog >> 1; - } - } - - @Override - public long minValue() { - return Long.MIN_VALUE; - } - - @Override - public long maxValue() { - return Long.MAX_VALUE; - } - - @Override - public int docCount() { - return 1024 + 1024 / 2; - } - }; - - DocValuesRangeIterator rangeIterator = - new DocValuesRangeIterator(twoPhase, skipper, queryMin, queryMax); - DocValuesRangeIterator.Approximation rangeApproximation = - (DocValuesRangeIterator.Approximation) rangeIterator.approximation(); - - assertEquals(100, rangeApproximation.advance(100)); - assertEquals(DocValuesRangeIterator.Match.YES, rangeApproximation.match); - assertEquals(255, rangeApproximation.upTo); - assertTrue(rangeIterator.matches()); - assertTrue(values.docID() < rangeApproximation.docID()); // we did not advance doc values - assertFalse(twoPhaseCalled.get()); - - assertEquals(768, rangeApproximation.advance(300)); - assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match); - if (doLevels) { - assertEquals(831, rangeApproximation.upTo); - } else { - assertEquals(1023, rangeApproximation.upTo); - } - for (int i = 0; i < 10; ++i) { - assertEquals(values.docID(), rangeApproximation.docID()); - assertEquals(twoPhase.matches(), rangeIterator.matches()); - assertTrue(twoPhaseCalled.get()); - twoPhaseCalled.set(false); - rangeApproximation.nextDoc(); - } - - assertEquals(1100, rangeApproximation.advance(1099)); - assertEquals(DocValuesRangeIterator.Match.IF_DOC_HAS_VALUE, rangeApproximation.match); - assertEquals(1024 + 256 - 1, rangeApproximation.upTo); - assertEquals(values.docID(), rangeApproximation.docID()); - assertTrue(rangeIterator.matches()); - assertFalse(twoPhaseCalled.get()); - - assertEquals(1024 + 768, rangeApproximation.advance(1024 + 300)); - assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match); - if (doLevels) { - assertEquals(1024 + 831, rangeApproximation.upTo); - } else { - assertEquals(2047, rangeApproximation.upTo); - } - for (int i = 0; i < 10; ++i) { - assertEquals(values.docID(), rangeApproximation.docID()); - assertEquals(twoPhase.matches(), rangeIterator.matches()); - assertTrue(twoPhaseCalled.get()); - twoPhaseCalled.set(false); - rangeApproximation.nextDoc(); - 
} - - assertEquals(DocIdSetIterator.NO_MORE_DOCS, rangeApproximation.advance(2048)); - } -} diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureSort.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureSort.java index a5b5806f3586..cd5bde9ba3aa 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestFeatureSort.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureSort.java @@ -73,7 +73,7 @@ public void testFeature() throws IOException { Sort sort = new Sort(FeatureField.newFeatureSort("field", "name")); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // numeric order assertEquals("30.1", storedFields.document(td.scoreDocs[0].doc).get("value")); assertEquals("4.2", storedFields.document(td.scoreDocs[1].doc).get("value")); @@ -106,7 +106,7 @@ public void testFeatureMissing() throws IOException { StoredFields storedFields = searcher.storedFields(); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as 0 assertEquals("4.2", storedFields.document(td.scoreDocs[0].doc).get("value")); assertEquals("1.3", storedFields.document(td.scoreDocs[1].doc).get("value")); @@ -140,7 +140,7 @@ public void testFeatureMissingFieldInSegment() throws IOException { StoredFields storedFields = searcher.storedFields(); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as 0 assertEquals("4.2", storedFields.document(td.scoreDocs[0].doc).get("value")); assertEquals("1.3", storedFields.document(td.scoreDocs[1].doc).get("value")); @@ -175,7 +175,7 @@ public void testFeatureMissingFeatureNameInSegment() throws IOException { StoredFields storedFields = searcher.storedFields(); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as 0 assertEquals("4.2", storedFields.document(td.scoreDocs[0].doc).get("value")); assertEquals("1.3", storedFields.document(td.scoreDocs[1].doc).get("value")); @@ -216,7 +216,7 @@ public void testFeatureMultipleMissing() throws IOException { StoredFields storedFields = searcher.storedFields(); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(7, td.totalHits.value); + assertEquals(7, td.totalHits.value()); // null is treated as 0 assertEquals("4.2", storedFields.document(td.scoreDocs[0].doc).get("value")); assertEquals("1.3", storedFields.document(td.scoreDocs[1].doc).get("value")); diff --git a/lucene/core/src/test/org/apache/lucene/document/TestField.java b/lucene/core/src/test/org/apache/lucene/document/TestField.java index e813721b5958..5c1b8f17294f 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestField.java @@ -18,6 +18,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import java.io.IOException; import java.io.StringReader; import java.nio.charset.StandardCharsets; import org.apache.lucene.codecs.Codec; @@ -27,6 +28,7 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.Term; import 
org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; @@ -680,7 +682,7 @@ public void testIndexedBinaryField() throws Exception { IndexSearcher s = newSearcher(r); TopDocs hits = s.search(new TermQuery(new Term("binary", br)), 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); Document storedDoc = s.storedFields().document(hits.scoreDocs[0].doc); assertEquals(br, storedDoc.getField("binary").binaryValue()); @@ -713,17 +715,21 @@ public void testKnnVectorField() throws Exception { try (IndexReader r = DirectoryReader.open(w)) { ByteVectorValues binary = r.leaves().get(0).reader().getByteVectorValues("binary"); assertEquals(1, binary.size()); - assertNotEquals(NO_MORE_DOCS, binary.nextDoc()); - assertNotNull(binary.vectorValue()); - assertArrayEquals(b, binary.vectorValue()); - assertEquals(NO_MORE_DOCS, binary.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = binary.iterator(); + assertNotEquals(NO_MORE_DOCS, iterator.nextDoc()); + assertNotNull(binary.vectorValue(0)); + assertArrayEquals(b, binary.vectorValue(0)); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); + expectThrows(IOException.class, () -> binary.vectorValue(1)); FloatVectorValues floatValues = r.leaves().get(0).reader().getFloatVectorValues("float"); assertEquals(1, floatValues.size()); - assertNotEquals(NO_MORE_DOCS, floatValues.nextDoc()); - assertEquals(vector.length, floatValues.vectorValue().length); - assertEquals(vector[0], floatValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, floatValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator1 = floatValues.iterator(); + assertNotEquals(NO_MORE_DOCS, iterator1.nextDoc()); + assertEquals(vector.length, floatValues.vectorValue(0).length); + assertEquals(vector[0], floatValues.vectorValue(0)[0], 0); + assertEquals(NO_MORE_DOCS, iterator1.nextDoc()); + expectThrows(IOException.class, () -> floatValues.vectorValue(1)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/document/TestLatLonPointDistanceFeatureQuery.java b/lucene/core/src/test/org/apache/lucene/document/TestLatLonPointDistanceFeatureQuery.java index 76ed1b5bab34..210b92295329 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestLatLonPointDistanceFeatureQuery.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestLatLonPointDistanceFeatureQuery.java @@ -232,7 +232,7 @@ public void testMissingField() throws IOException { Query q = LatLonPoint.newDistanceFeatureQuery("foo", 3, 10, 10, 5000); TopDocs topHits = searcher.search(q, 2); - assertEquals(0, topHits.totalHits.value); + assertEquals(0, topHits.totalHits.value()); } public void testMissingValue() throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/document/TestLongDistanceFeatureQuery.java b/lucene/core/src/test/org/apache/lucene/document/TestLongDistanceFeatureQuery.java index 2ea6e161435f..036e0a5450e3 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestLongDistanceFeatureQuery.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestLongDistanceFeatureQuery.java @@ -197,7 +197,7 @@ public void testMissingField() throws IOException { Query q = LongField.newDistanceFeatureQuery("foo", 3, 10, 5); TopDocs topHits = searcher.search(q, 2); - assertEquals(0, topHits.totalHits.value); + assertEquals(0, topHits.totalHits.value()); } public void testMissingValue() throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/document/TestShapeDocValues.java 
b/lucene/core/src/test/org/apache/lucene/document/TestShapeDocValues.java index f89f614471ed..ca9eca679865 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestShapeDocValues.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestShapeDocValues.java @@ -56,12 +56,14 @@ public void testSimpleDocValue() throws Exception { public void testLatLonPolygonBBox() { Polygon p = GeoTestUtil.nextPolygon(); - Rectangle expected = (Rectangle) computeBoundingBox(p); - LatLonShapeDocValuesField dv = LatLonShape.createDocValueField(FIELD_NAME, p); - assertEquals(expected.minLat, dv.getBoundingBox().minLat, TOLERANCE); - assertEquals(expected.maxLat, dv.getBoundingBox().maxLat, TOLERANCE); - assertEquals(expected.minLon, dv.getBoundingBox().minLon, TOLERANCE); - assertEquals(expected.maxLon, dv.getBoundingBox().maxLon, TOLERANCE); + if (area(p) != 0) { + Rectangle expected = (Rectangle) computeBoundingBox(p); + LatLonShapeDocValuesField dv = LatLonShape.createDocValueField(FIELD_NAME, p); + assertEquals(expected.minLat, dv.getBoundingBox().minLat, TOLERANCE); + assertEquals(expected.maxLat, dv.getBoundingBox().maxLat, TOLERANCE); + assertEquals(expected.minLon, dv.getBoundingBox().minLon, TOLERANCE); + assertEquals(expected.maxLon, dv.getBoundingBox().maxLon, TOLERANCE); + } } public void testXYPolygonBBox() { @@ -255,4 +257,9 @@ private List getTessellation(XYPolygon p) { } return tess; } + + /** Compute signed area of rectangle */ + private static double area(Polygon p) { + return (p.maxLon - p.minLon) * (p.maxLat - p.minLat); + } } diff --git a/lucene/core/src/test/org/apache/lucene/document/TestSortedSetDocValuesSetQuery.java b/lucene/core/src/test/org/apache/lucene/document/TestSortedSetDocValuesSetQuery.java index 59a7ffbb135d..b348ca41ddb1 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestSortedSetDocValuesSetQuery.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestSortedSetDocValuesSetQuery.java @@ -251,7 +251,7 @@ private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boole final int maxDoc = searcher.getIndexReader().maxDoc(); final TopDocs td1 = searcher.search(q1, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); final TopDocs td2 = searcher.search(q2, maxDoc, scores ? 
Sort.RELEVANCE : Sort.INDEXORDER); - assertEquals(td1.totalHits.value, td2.totalHits.value); + assertEquals(td1.totalHits.value(), td2.totalHits.value()); for (int i = 0; i < td1.scoreDocs.length; ++i) { assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); if (scores) { diff --git a/lucene/core/src/test/org/apache/lucene/geo/TestTessellator.java b/lucene/core/src/test/org/apache/lucene/geo/TestTessellator.java index 8002717a9ced..e2e964526e88 100644 --- a/lucene/core/src/test/org/apache/lucene/geo/TestTessellator.java +++ b/lucene/core/src/test/org/apache/lucene/geo/TestTessellator.java @@ -430,11 +430,7 @@ public void testComplexPolygon26() throws Exception { + "(6.9735097 51.6245538,6.9736199 51.624605,6.9736853 51.6246203,6.9737516 51.6246231,6.9738024 51.6246107,6.9738324 51.6245878,6.9738425 51.6245509,6.9738332 51.6245122,6.9738039 51.6244869,6.9737616 51.6244687,6.9737061 51.6244625,6.9736445 51.6244749,6.9735736 51.6245046,6.9735097 51.6245538))," + "((6.9731576 51.6249947,6.9731361 51.6250664,6.9731161 51.6251037,6.9731022 51.6250803,6.9731277 51.62502,6.9731576 51.6249947)))"; Polygon[] polygons = (Polygon[]) SimpleWKTShapeParser.parse(wkt); - for (Polygon polygon : polygons) { - List tessellation = - Tessellator.tessellate(polygon, random().nextBoolean()); - assertTrue(tessellation.size() > 0); - } + checkMultiPolygon(polygons, 0.0); } public void testComplexPolygon27() throws Exception { @@ -684,13 +680,7 @@ public void testComplexPolygon39() throws Exception { public void testComplexPolygon40() throws Exception { String wkt = GeoTestUtil.readShape("lucene-9251.wkt.gz"); Polygon polygon = (Polygon) SimpleWKTShapeParser.parse(wkt); - List tessellation = - Tessellator.tessellate(polygon, random().nextBoolean()); - // calculate the area of big polygons have numerical error - assertEquals(area(polygon), area(tessellation), 1e-12); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } + checkPolygon(polygon, 1e-12); } public void testComplexPolygon41() throws Exception { @@ -706,15 +696,7 @@ public void testComplexPolygon41() throws Exception { public void testComplexPolygon42() throws Exception { String geoJson = GeoTestUtil.readShape("lucene-9417.geojson.gz"); Polygon[] polygons = Polygon.fromGeoJSON(geoJson); - for (Polygon polygon : polygons) { - List tessellation = - Tessellator.tessellate(polygon, random().nextBoolean()); - // calculate the area of big polygons have numerical error - assertEquals(area(polygon), area(tessellation), 1e-11); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } - } + checkMultiPolygon(polygons, 1e-11); } public void testComplexPolygon43() throws Exception { @@ -727,12 +709,7 @@ public void testComplexPolygon43() throws Exception { + "(-88.3245325358123 41.9306419084828,-88.3245478066552 41.9305086556331,-88.3245658060855 41.930351580587,-88.3242368660096 41.9303327977821,-88.3242200926128 41.9304905242189,-88.324206161464 41.9306215207536,-88.3245325358123 41.9306419084828)," + "(-88.3236767661893 41.9307089429871,-88.3237008716322 41.930748885445,-88.323876104365 41.9306891087739,-88.324063438129 41.9306252050871,-88.3239244290607 41.930399373909,-88.3237349076233 41.9304653056436,-88.3235653339759 41.9305242981369,-88.3236767661893 41.9307089429871))"; Polygon polygon = (Polygon) SimpleWKTShapeParser.parse(wkt); - List tessellation = - Tessellator.tessellate(polygon, random().nextBoolean()); - assertEquals(area(polygon), area(tessellation), 1e-11); - for 
(Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } + checkPolygon(polygon, 1e-11); } public void testComplexPolygon44() throws Exception { @@ -748,12 +725,7 @@ public void testComplexPolygon44() throws Exception { "Polygon self-intersection at lat=34.21165542666664 lon=-83.88787058666672", ex.getMessage()); } else { - List tessellation = - Tessellator.tessellate(polygons[i], random().nextBoolean()); - assertEquals(area(polygons[i]), area(tessellation), 0.0); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygons[i], t); - } + checkPolygon(polygons[i], 0.0); } } } @@ -761,55 +733,26 @@ public void testComplexPolygon44() throws Exception { public void testComplexPolygon45() throws Exception { String geoJson = GeoTestUtil.readShape("lucene-10470.geojson.gz"); Polygon[] polygons = Polygon.fromGeoJSON(geoJson); - for (Polygon polygon : polygons) { - List tessellation = - Tessellator.tessellate(polygon, random().nextBoolean()); - // calculate the area of big polygons have numerical error - assertEquals(area(polygon), area(tessellation), 1e-11); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } - } + checkMultiPolygon(polygons, 1e-11); } public void testComplexPolygon46() throws Exception { String wkt = GeoTestUtil.readShape("lucene-10470.wkt.gz"); Polygon polygon = (Polygon) SimpleWKTShapeParser.parse(wkt); - List tessellation = - Tessellator.tessellate(polygon, random().nextBoolean()); - // calculate the area of big polygons have numerical error - assertEquals(area(polygon), area(tessellation), 1e-11); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } + checkPolygon(polygon, 1e-11); } public void testComplexPolygon47() throws Exception { String geoJson = GeoTestUtil.readShape("lucene-10470-2.geojson.gz"); Polygon[] polygons = Polygon.fromGeoJSON(geoJson); - for (Polygon polygon : polygons) { - List tessellation = - Tessellator.tessellate(polygon, random().nextBoolean()); - // calculate the area of big polygons have numerical error - assertEquals(area(polygon), area(tessellation), 1e-11); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } - } + checkMultiPolygon(polygons, 1e-11); } @Nightly public void testComplexPolygon48() throws Exception { String geoJson = GeoTestUtil.readShape("lucene-10470-3.geojson.gz"); Polygon[] polygons = Polygon.fromGeoJSON(geoJson); - for (Polygon polygon : polygons) { - List tessellation = Tessellator.tessellate(polygon, true); - // calculate the area of big polygons have numerical error - assertEquals(area(polygon), area(tessellation), 1e-11); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } - } + checkMultiPolygon(polygons, 1e-11); } public void testComplexPolygon49() throws Exception { @@ -817,25 +760,14 @@ public void testComplexPolygon49() throws Exception { "POLYGON((77.500 13.500, 77.550 13.500, 77.530 13.470, 77.570 13.470," + "77.550 13.500, 77.600 13.500, 77.600 13.400, 77.500 13.400, 77.500 13.500))"; Polygon polygon = (Polygon) SimpleWKTShapeParser.parse(wkt); - List tessellation = - Tessellator.tessellate(polygon, random().nextBoolean()); - assertEquals(area(polygon), area(tessellation), 1e-11); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } + checkPolygon(polygon, 1e-11); } public void testComplexPolygon50() throws Exception { String geoJson = 
GeoTestUtil.readShape("lucene-10563-1.geojson.gz"); Polygon[] polygons = Polygon.fromGeoJSON(geoJson); assertEquals("Only one polygon", 1, polygons.length); - Polygon polygon = polygons[0]; - List tessellation = Tessellator.tessellate(polygon, true); - // calculate the area of big polygons have numerical error - assertEquals(area(polygon), area(tessellation), 1e-11); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } + checkPolygon(polygons[0], 1e-11); } public void testComplexPolygon50_WithMonitor() throws Exception { @@ -893,25 +825,13 @@ public void testComplexPolygon52() throws Exception { public void testComplexPolygon53() throws Exception { String geoJson = GeoTestUtil.readShape("github-11986-1.geojson.gz"); Polygon[] polygons = Polygon.fromGeoJSON(geoJson); - for (Polygon polygon : polygons) { - List tessellation = Tessellator.tessellate(polygon, true); - assertEquals(area(polygon), area(tessellation), 0.0); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } - } + checkMultiPolygon(polygons, 0.0); } public void testComplexPolygon54() throws Exception { String geoJson = GeoTestUtil.readShape("github-11986-2.geojson.gz"); Polygon[] polygons = Polygon.fromGeoJSON(geoJson); - for (Polygon polygon : polygons) { - List tessellation = Tessellator.tessellate(polygon, true); - assertEquals(area(polygon), area(tessellation), 0.0); - for (Tessellator.Triangle t : tessellation) { - checkTriangleEdgesFromPolygon(polygon, t); - } - } + checkMultiPolygon(polygons, 0.0); } public void testComplexPolygon55() throws Exception { @@ -936,6 +856,41 @@ public void testComplexPolygon56() throws Exception { } } + public void testComplexPolygon57() throws Exception { + String geoJson = GeoTestUtil.readShape("github-13841-1.geojson.gz"); + Polygon[] polygons = Polygon.fromGeoJSON(geoJson); + checkMultiPolygon(polygons, 3e-11); + } + + @Nightly + public void testComplexPolygon58() throws Exception { + String wkt = GeoTestUtil.readShape("github-13841-2.wkt.gz"); + checkMultiPolygon(wkt); + } + + @Nightly + public void testComplexPolygon59() throws Exception { + String wkt = GeoTestUtil.readShape("github-13841-3.wkt.gz"); + Polygon[] polygons = (Polygon[]) SimpleWKTShapeParser.parse(wkt); + checkMultiPolygon(polygons, 1e-11); + } + + public void testComplexPolygon60() throws Exception { + String wkt = + "POLYGON((0 0, 5 1, 10 0, 11 5, 10 10,5 11, 0 10, 1 5, 0 0)," + + "(1 5, 1 7, 2 7, 1 5), (1 5, 4 8, 5 8, 1 5)," + + "(1 5, 3 6, 7 7, 1 5), (1 5, 2 3, 1 3, 1 5)," + + "(1 5, 3 4, 4 4, 1 5), (1 5, 5 6, 6 6, 1 5)," + + "(11 5, 10 3, 10 4, 11 5), (11 5,8 3, 8 4, 11 5)," + + "(11 5,5 4, 5 5, 11 5), (11 5, 4.5 3, 4 3, 11 5)," + + "(11 5, 8 6, 9 7, 11 5), (11 5, 10 8, 10 7, 11 5)," + + "(5 11, 2 10, 3 10, 5 11), (5 11, 3 9, 4 9, 5 11)," + + "(5 11, 5.5 8, 6 7, 5 11), (5 11, 8 8, 9 8, 5 11)," + + "(5 1, 2 0.5, 3 1, 5 1), (5 1, 8 0.5, 7 2, 5 1)," + + "(5 1, 3 2, 3 3, 5 1), (5 1, 5 2, 6 2, 5 1))"; + checkPolygon(wkt); + } + private static class TestCountingMonitor implements Tessellator.Monitor { private int count = 0; private int splitsStarted = 0; @@ -958,11 +913,26 @@ public void endSplit(String status) { } } + private void checkMultiPolygon(String wkt) throws Exception { + Polygon[] polygons = (Polygon[]) SimpleWKTShapeParser.parse(wkt); + checkMultiPolygon(polygons, 0.0); + } + + private void checkMultiPolygon(Polygon[] polygons, double delta) { + for (Polygon polygon : polygons) { + checkPolygon(polygon, delta); + } + } + 
private void checkPolygon(String wkt) throws Exception { Polygon polygon = (Polygon) SimpleWKTShapeParser.parse(wkt); + checkPolygon(polygon, 0.0); + } + + private void checkPolygon(Polygon polygon, double delta) { List<Tessellator.Triangle> tessellation = Tessellator.tessellate(polygon, random().nextBoolean()); - assertEquals(area(polygon), area(tessellation), 0.0); + assertEquals(area(polygon), area(tessellation), delta); for (Tessellator.Triangle t : tessellation) { checkTriangleEdgesFromPolygon(polygon, t); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java index 3826962779af..9db1d305a744 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java @@ -65,10 +65,7 @@ private void checkFooters(Directory dir) throws IOException { } if (si.info.getUseCompoundFile()) { try (Directory cfsDir = - si.info - .getCodec() - .compoundFormat() - .getCompoundReader(dir, si.info, newIOContext(random()))) { + si.info.getCodec().compoundFormat().getCompoundReader(dir, si.info)) { for (String cfsFile : cfsDir.listAll()) { checkFooter(cfsDir, cfsFile); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java index 76c3ee75f250..e8857791c3a1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java @@ -70,10 +70,7 @@ private void checkHeaders(Directory dir, Map<String, String> namesToExtensions) } if (si.info.getUseCompoundFile()) { try (Directory cfsDir = - si.info - .getCodec() - .compoundFormat() - .getCompoundReader(dir, si.info, newIOContext(random()))) { + si.info.getCodec().compoundFormat().getCompoundReader(dir, si.info)) { for (String cfsFile : cfsDir.listAll()) { checkHeader(cfsDir, cfsFile, namesToExtensions, si.info.getId()); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestBinaryTerms.java b/lucene/core/src/test/org/apache/lucene/index/TestBinaryTerms.java index a0edb9df83a6..b76d0e10aec7 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestBinaryTerms.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestBinaryTerms.java @@ -57,7 +57,7 @@ public void testBinary() throws IOException { bytes.bytes[1] = (byte) (255 - i); bytes.length = 2; TopDocs docs = is.search(new TermQuery(new Term("bytes", bytes)), 5); - assertEquals(1, docs.totalHits.value); + assertEquals(1, docs.totalHits.value()); assertEquals("" + i, is.storedFields().document(docs.scoreDocs[0].doc).get("id")); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java index 1759271012d1..bdbb2cf8f052 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java @@ -106,7 +106,7 @@ public FieldData( storePayloads, indexOptions, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestConsistentFieldNumbers.java b/lucene/core/src/test/org/apache/lucene/index/TestConsistentFieldNumbers.java index de1d4ca98a44..c21c6fbbd4dd 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestConsistentFieldNumbers.java +++ 
b/lucene/core/src/test/org/apache/lucene/index/TestConsistentFieldNumbers.java @@ -290,7 +290,7 @@ public void testManyFields() throws Exception { for (FieldInfo fi : fis) { Field expected = getField(Integer.parseInt(fi.name)); assertEquals(expected.fieldType().indexOptions(), fi.getIndexOptions()); - assertEquals(expected.fieldType().storeTermVectors(), fi.hasVectors()); + assertEquals(expected.fieldType().storeTermVectors(), fi.hasTermVectors()); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCrashCausesCorruptIndex.java b/lucene/core/src/test/org/apache/lucene/index/TestCrashCausesCorruptIndex.java index 5ed2149963a8..cab8c004c1e7 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCrashCausesCorruptIndex.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCrashCausesCorruptIndex.java @@ -105,7 +105,7 @@ private void searchForFleas(final int expectedTotalHits) throws IOException { IndexSearcher indexSearcher = newSearcher(indexReader); TopDocs topDocs = indexSearcher.search(new TermQuery(new Term(TEXT_FIELD, "fleas")), 10); assertNotNull(topDocs); - assertEquals(expectedTotalHits, topDocs.totalHits.value); + assertEquals(expectedTotalHits, topDocs.totalHits.value()); indexReader.close(); realDirectory.close(); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDefaultCodecParallelizesIO.java b/lucene/core/src/test/org/apache/lucene/index/TestDefaultCodecParallelizesIO.java index 2c4351fa1705..58579ab93a46 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDefaultCodecParallelizesIO.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDefaultCodecParallelizesIO.java @@ -40,7 +40,14 @@ public static void beforeClass() throws Exception { Directory bbDir = new ByteBuffersDirectory(); try (LineFileDocs docs = new LineFileDocs(random()); IndexWriter w = - new IndexWriter(bbDir, new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec()))) { + new IndexWriter( + bbDir, + new IndexWriterConfig() + // Disable CFS, this test needs to know about files that are open with the + // RANDOM_PRELOAD advice, which CFS doesn't allow us to detect. 
+ .setUseCompoundFile(false) + .setMergePolicy(newLogMergePolicy(false)) + .setCodec(TestUtil.getDefaultCodec()))) { final int numDocs = atLeast(10_000); for (int d = 0; d < numDocs; ++d) { Document doc = docs.nextDoc(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java b/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java index 026a4b49c782..1ea5da23d4b8 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java @@ -269,7 +269,7 @@ public void testGetFieldNames() throws Exception { } else { notIndexedFieldNames.add(name); } - if (fieldInfo.hasVectors()) { + if (fieldInfo.hasTermVectors()) { tvFieldNames.add(name); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java index 7d8cdf95d233..af5b2938d590 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java @@ -388,28 +388,14 @@ public void testRAMUsageVector() throws IOException { field, new float[] {1, 2, 3, 4}, VectorSimilarityFunction.EUCLIDEAN)); } - private static class MockIndexableField implements IndexableField { - - private final String field; - private final BytesRef value; - private final IndexableFieldType fieldType; - - MockIndexableField(String field, BytesRef value, IndexableFieldType fieldType) { - this.field = field; - this.value = value; - this.fieldType = fieldType; - } + private record MockIndexableField(String field, BytesRef value, IndexableFieldType fieldType) + implements IndexableField { @Override public String name() { return field; } - @Override - public IndexableFieldType fieldType() { - return fieldType; - } - @Override public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { return null; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java index 3c82cd6b33e4..d03c8cf42b59 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java @@ -459,8 +459,8 @@ public void testFloatVectorValues() throws IOException { expectThrows( ExitingReaderException.class, () -> { - DocIdSetIterator iter = leaf.getFloatVectorValues("vector"); - scanAndRetrieve(leaf, iter); + KnnVectorValues values = leaf.getFloatVectorValues("vector"); + scanAndRetrieve(leaf, values); }); expectThrows( @@ -473,8 +473,8 @@ public void testFloatVectorValues() throws IOException { leaf.getLiveDocs(), Integer.MAX_VALUE)); } else { - DocIdSetIterator iter = leaf.getFloatVectorValues("vector"); - scanAndRetrieve(leaf, iter); + KnnVectorValues values = leaf.getFloatVectorValues("vector"); + scanAndRetrieve(leaf, values); leaf.searchNearestVectors( "vector", @@ -534,8 +534,8 @@ public void testByteVectorValues() throws IOException { expectThrows( ExitingReaderException.class, () -> { - DocIdSetIterator iter = leaf.getByteVectorValues("vector"); - scanAndRetrieve(leaf, iter); + KnnVectorValues values = leaf.getByteVectorValues("vector"); + scanAndRetrieve(leaf, values); }); expectThrows( @@ -549,8 +549,8 @@ public void testByteVectorValues() throws IOException { Integer.MAX_VALUE)); } else { - DocIdSetIterator iter = leaf.getByteVectorValues("vector"); - 
scanAndRetrieve(leaf, iter); + KnnVectorValues values = leaf.getByteVectorValues("vector"); + scanAndRetrieve(leaf, values); leaf.searchNearestVectors( "vector", @@ -564,20 +564,24 @@ public void testByteVectorValues() throws IOException { directory.close(); } - private static void scanAndRetrieve(LeafReader leaf, DocIdSetIterator iter) throws IOException { + private static void scanAndRetrieve(LeafReader leaf, KnnVectorValues values) throws IOException { + KnnVectorValues.DocIndexIterator iter = values.iterator(); for (iter.nextDoc(); iter.docID() != DocIdSetIterator.NO_MORE_DOCS && iter.docID() < leaf.maxDoc(); ) { - final int nextDocId = iter.docID() + 1; + int docId = iter.docID(); + if (docId >= leaf.maxDoc()) { + break; + } + final int nextDocId = docId + 1; if (random().nextBoolean() && nextDocId < leaf.maxDoc()) { iter.advance(nextDocId); } else { iter.nextDoc(); } - if (random().nextBoolean() && iter.docID() != DocIdSetIterator.NO_MORE_DOCS - && iter instanceof FloatVectorValues) { - ((FloatVectorValues) iter).vectorValue(); + && values instanceof FloatVectorValues) { + ((FloatVectorValues) values).vectorValue(iter.index()); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java index e19855bbdda8..a658c8571ec9 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java @@ -250,7 +250,7 @@ public void testFieldNumbersAutoIncrement() { false, IndexOptions.NONE, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, @@ -272,7 +272,7 @@ public void testFieldNumbersAutoIncrement() { false, IndexOptions.NONE, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, @@ -296,7 +296,7 @@ public void testFieldNumbersAutoIncrement() { false, IndexOptions.NONE, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java index 15d6dddcb586..8580691028b5 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java @@ -58,7 +58,7 @@ public static void beforeClass() throws Exception { false, ift.indexOptions(), ift.docValuesType(), - ift.hasDocValuesSkipIndex(), + ift.docValuesSkipIndexType(), -1, new HashMap<>(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java index 7c72b3d2e76a..d06330c29269 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java @@ -75,12 +75,17 @@ public void test() throws Exception { final TopDocs hits = s.search( new TermRangeQuery("field", new BytesRef(), new BytesRef("\uFFFF"), true, true), 10); - assertTrue(hits.totalHits.value > 0); + assertTrue(hits.totalHits.value() > 0); final int queryCloneCount = dir.getInputCloneCount() - cloneCount; // System.out.println("query clone count=" + queryCloneCount); + // It is rather difficult to reliably predict how many query clone calls will be performed. 
One + // important factor is the number of segment partitions being searched, but it depends as well + // on the terms being indexed, and the distribution of the matches across the documents, which + // affects how the query gets rewritten and the subsequent number of clone calls it will + // perform. assertTrue( "too many calls to IndexInput.clone during TermRangeQuery: " + queryCloneCount, - queryCloneCount < 50); + queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 5); r.close(); dir.close(); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 0432f05d94f8..91a167926173 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -1780,9 +1780,9 @@ public void testRandom1() throws IOException { TermQuery termQuery = new TermQuery(new Term("id", Integer.toString(i))); final TopDocs topDocs = searcher.search(termQuery, 1); if (deleted.get(i)) { - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } else { - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); NumericDocValues values = MultiDocValues.getNumericValues(reader, "id"); assertEquals(topDocs.scoreDocs[0].doc, values.advance(topDocs.scoreDocs[0].doc)); assertEquals(i, values.longValue()); @@ -1832,9 +1832,9 @@ public void testMultiValuedRandom1() throws IOException { TermQuery termQuery = new TermQuery(new Term("id", Integer.toString(i))); final TopDocs topDocs = searcher.search(termQuery, 1); if (deleted.get(i)) { - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } else { - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); NumericDocValues values = MultiDocValues.getNumericValues(reader, "id"); assertEquals(topDocs.scoreDocs[0].doc, values.advance(topDocs.scoreDocs[0].doc)); assertEquals(i, values.longValue()); @@ -1937,9 +1937,9 @@ public void testConcurrentUpdates() throws Exception { final TopDocs topDocs = searcher.search(new TermQuery(new Term("id", Integer.toString(i))), 1); if (values.containsKey(i) == false) { - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } else { - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); NumericDocValues dvs = MultiDocValues.getNumericValues(reader, "foo"); int docID = topDocs.scoreDocs[0].doc; assertEquals(docID, dvs.advance(docID)); @@ -2074,7 +2074,7 @@ public void testConcurrentDVUpdates() throws Exception { for (int i = 0; i < numDocs; ++i) { final TopDocs topDocs = searcher.search(new TermQuery(new Term("id", Integer.toString(i))), 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); NumericDocValues dvs = MultiDocValues.getNumericValues(reader, "bar"); int hitDoc = topDocs.scoreDocs[0].doc; assertEquals(hitDoc, dvs.advance(hitDoc)); @@ -2181,8 +2181,8 @@ public void testAddIndexes(boolean withDeletes, boolean useReaders) throws Excep Query query = new TermQuery(new Term("id", Integer.toString(i))); final TopDocs topDocs = searcher.search(query, 1); final TopDocs topDocs2 = searcher2.search(query, 1); - assertEquals(topDocs.totalHits.value, topDocs2.totalHits.value); - if (topDocs.totalHits.value == 1) { + assertEquals(topDocs.totalHits.value(), topDocs2.totalHits.value()); + if 
(topDocs.totalHits.value() == 1) { NumericDocValues dvs1 = MultiDocValues.getNumericValues(reader, "foo"); int hitDoc1 = topDocs.scoreDocs[0].doc; assertEquals(hitDoc1, dvs1.advance(hitDoc1)); @@ -2412,7 +2412,7 @@ protected TokenStreamComponents createComponents(String fieldName) { if (VERBOSE) { System.out.println("TEST: now compare r1=" + r1 + " r2=" + r2); } - assertEquals(sort, getOnlyLeafReader(r2).getMetaData().getSort()); + assertEquals(sort, getOnlyLeafReader(r2).getMetaData().sort()); assertReaderEquals("left: sorted by hand; right: sorted by Lucene", r1, r2); IOUtils.close(w1, w2, r1, r2, dir1, dir2); } @@ -2659,11 +2659,11 @@ public void testRandom3() throws Exception { s2.search(new MatchAllDocsQuery(), new TopFieldCollectorManager(sort, numHits, 1)); if (VERBOSE) { - System.out.println(" topDocs query-time sort: totalHits=" + hits1.totalHits.value); + System.out.println(" topDocs query-time sort: totalHits=" + hits1.totalHits.value()); for (ScoreDoc scoreDoc : hits1.scoreDocs) { System.out.println(" " + scoreDoc.doc); } - System.out.println(" topDocs index-time sort: totalHits=" + hits2.totalHits.value); + System.out.println(" topDocs index-time sort: totalHits=" + hits2.totalHits.value()); for (ScoreDoc scoreDoc : hits2.scoreDocs) { System.out.println(" " + scoreDoc.doc); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 8ef60beefd49..04adc5035db4 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -1503,7 +1503,7 @@ public void testNoUnwantedTVFiles() throws Exception { DirectoryReader r0 = DirectoryReader.open(dir); for (LeafReaderContext ctx : r0.leaves()) { SegmentReader sr = (SegmentReader) ctx.reader(); - assertFalse(sr.getFieldInfos().hasVectors()); + assertFalse(sr.getFieldInfos().hasTermVectors()); } r0.close(); @@ -1930,7 +1930,7 @@ protected TokenStreamComponents createComponents(String fieldName) { builder.add(new Term("body", "test"), 2); PhraseQuery pq = builder.build(); // body:"just ? test" - assertEquals(1, is.search(pq, 5).totalHits.value); + assertEquals(1, is.search(pq, 5).totalHits.value()); ir.close(); dir.close(); } @@ -1963,7 +1963,7 @@ protected TokenStreamComponents createComponents(String fieldName) { builder.add(new Term("body", "test"), 3); PhraseQuery pq = builder.build(); // body:"just ? ? 
test" - assertEquals(1, is.search(pq, 5).totalHits.value); + assertEquals(1, is.search(pq, 5).totalHits.value()); ir.close(); dir.close(); } @@ -3484,7 +3484,7 @@ public void testSoftUpdateDocuments() throws IOException { assertEquals(2, reader.docFreq(new Term("id", "1"))); IndexSearcher searcher = new IndexSearcher(reader); TopDocs topDocs = searcher.search(new TermQuery(new Term("id", "1")), 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); Document document = reader.storedFields().document(topDocs.scoreDocs[0].doc); assertEquals("2", document.get("version")); @@ -3500,7 +3500,7 @@ public void testSoftUpdateDocuments() throws IOException { oldReader.close(); searcher = new IndexSearcher(reader); topDocs = searcher.search(new TermQuery(new Term("id", "1")), 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); document = reader.storedFields().document(topDocs.scoreDocs[0].doc); assertEquals("3", document.get("version")); @@ -3513,7 +3513,7 @@ public void testSoftUpdateDocuments() throws IOException { oldReader.close(); searcher = new IndexSearcher(reader); topDocs = searcher.search(new TermQuery(new Term("id", "1")), 10); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); int numSoftDeleted = 0; for (SegmentCommitInfo info : writer.cloneSegmentInfos()) { numSoftDeleted += info.getSoftDelCount(); @@ -3650,10 +3650,10 @@ public int numDeletesToMerge( for (String id : ids) { TopDocs topDocs = searcher.search(new TermQuery(new Term("id", id)), 10); if (updateSeveralDocs) { - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); assertEquals(Math.abs(topDocs.scoreDocs[0].doc - topDocs.scoreDocs[1].doc), 1); } else { - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); } } if (mixDeletes == false) { @@ -4442,7 +4442,10 @@ public void testMaxCompletedSequenceNumber() throws IOException, InterruptedExce try { assertEquals( 1, - acquire.search(new TermQuery(new Term("id", id)), 10).totalHits.value); + acquire + .search(new TermQuery(new Term("id", id)), 10) + .totalHits + .value()); } finally { manager.release(acquire); } @@ -4989,8 +4992,9 @@ public void testDocValuesMixedSkippingIndex() throws Exception { doc2.add(new SortedNumericDocValuesField("test", random().nextLong())); IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> writer.addDocument(doc2)); + ex.printStackTrace(); assertEquals( - "Inconsistency of field data structures across documents for field [test] of doc [1]. doc values skip index: expected 'true', but it has 'false'.", + "Inconsistency of field data structures across documents for field [test] of doc [1]. doc values skip index type: expected 'RANGE', but it has 'NONE'.", ex.getMessage()); } } @@ -5006,7 +5010,7 @@ public void testDocValuesMixedSkippingIndex() throws Exception { IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> writer.addDocument(doc2)); assertEquals( - "Inconsistency of field data structures across documents for field [test] of doc [1]. doc values skip index: expected 'false', but it has 'true'.", + "Inconsistency of field data structures across documents for field [test] of doc [1]. 
doc values skip index type: expected 'NONE', but it has 'RANGE'.", ex.getMessage()); } } @@ -5018,7 +5022,7 @@ public void testDocValuesSkippingIndexWithoutDocValues() throws Exception { FieldType fieldType = new FieldType(); fieldType.setStored(true); fieldType.setDocValuesType(docValuesType); - fieldType.setDocValuesSkipIndex(true); + fieldType.setDocValuesSkipIndexType(DocValuesSkipIndexType.RANGE); fieldType.freeze(); try (Directory dir = newMockDirectory()) { try (IndexWriter writer = @@ -5028,8 +5032,7 @@ public void testDocValuesSkippingIndexWithoutDocValues() throws Exception { IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> writer.addDocument(doc1)); assertTrue( - ex.getMessage() - .startsWith("field 'test' cannot have docValuesSkipIndex set to true")); + ex.getMessage().startsWith("field 'test' cannot have docValuesSkipIndexType=RANGE")); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterDelete.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterDelete.java index f27298781cad..fe114e5cc516 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterDelete.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterDelete.java @@ -547,7 +547,7 @@ private void addDoc(IndexWriter modifier, int id, int value) throws IOException private long getHitCount(Directory dir, Term term) throws IOException { IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = newSearcher(reader); - long hitCount = searcher.search(new TermQuery(term), 1000).totalHits.value; + long hitCount = searcher.search(new TermQuery(term), 1000).totalHits.value(); reader.close(); return hitCount; } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java index 5757141b8f25..2f92606ba7d0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java @@ -73,10 +73,7 @@ @SuppressCodecs("SimpleText") // too slow here public class TestIndexWriterExceptions extends LuceneTestCase { - private static class DocCopyIterator implements Iterable { - private final Document doc; - private final int count; - + private record DocCopyIterator(Document doc, int count) implements Iterable { /* private field types */ /* private field types */ @@ -105,11 +102,6 @@ private static class DocCopyIterator implements Iterable { custom5.setStoreTermVectorOffsets(true); } - public DocCopyIterator(Document doc, int count) { - this.count = count; - this.doc = doc; - } - @Override public Iterator iterator() { return new Iterator() { @@ -1455,7 +1447,7 @@ public void testTermVectorExceptions() throws IOException { assertTrue(reader.numDocs() > 0); SegmentInfos.readLatestCommit(dir); for (LeafReaderContext context : reader.leaves()) { - assertFalse(context.reader().getFieldInfos().hasVectors()); + assertFalse(context.reader().getFieldInfos().hasTermVectors()); } reader.close(); dir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java index ae52075a00dc..52cd21630bca 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java @@ -40,7 +40,6 @@ import org.apache.lucene.document.StringField; 
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.analysis.MockTokenizer; import org.apache.lucene.tests.store.MockDirectoryWrapper; @@ -244,10 +243,7 @@ private String listFiles(Directory dir) throws IOException { } if (info.info.getUseCompoundFile()) { try (Directory cfs = - info.info - .getCodec() - .compoundFormat() - .getCompoundReader(dir, info.info, IOContext.DEFAULT)) { + info.info.getCodec().compoundFormat().getCompoundReader(dir, info.info)) { for (String file : cfs.listAll()) { sb.append( String.format( @@ -308,6 +304,7 @@ public void testBackgroundForceMerge() throws IOException { dir.close(); } + @AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/13478") public void testMergePerField() throws IOException { IndexWriterConfig config = new IndexWriterConfig(); ConcurrentMergeScheduler mergeScheduler = diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMaxDocs.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMaxDocs.java index 0f348044b605..d9b8a055f6c0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMaxDocs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMaxDocs.java @@ -67,9 +67,9 @@ public void testExactlyAtTrueLimit() throws Exception { assertEquals(IndexWriter.MAX_DOCS, ir.numDocs()); IndexSearcher searcher = new IndexSearcher(ir); TopScoreDocCollectorManager collectorManager = - new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE, true); + new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE); TopDocs hits = searcher.search(new TermQuery(new Term("field", "text")), collectorManager); - assertEquals(IndexWriter.MAX_DOCS, hits.totalHits.value); + assertEquals(IndexWriter.MAX_DOCS, hits.totalHits.value()); // Sort by docID reversed: hits = @@ -77,7 +77,7 @@ public void testExactlyAtTrueLimit() throws Exception { new TermQuery(new Term("field", "text")), 10, new Sort(new SortField(null, SortField.Type.DOC, true))); - assertEquals(IndexWriter.MAX_DOCS, hits.totalHits.value); + assertEquals(IndexWriter.MAX_DOCS, hits.totalHits.value()); assertEquals(10, hits.scoreDocs.length); assertEquals(IndexWriter.MAX_DOCS - 1, hits.scoreDocs[0].doc); ir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java index c56feb62d84a..824b24ed6aa0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java @@ -801,7 +801,7 @@ void stressUpdateSameDocumentWithMergeOnX(boolean useGetReader) new IndexSearcher(reader) .search(new TermQuery(new Term("id", "1")), 10) .totalHits - .value); + .value()); } } else { if (random().nextBoolean()) { @@ -815,7 +815,7 @@ void stressUpdateSameDocumentWithMergeOnX(boolean useGetReader) new IndexSearcher(open) .search(new TermQuery(new Term("id", "1")), 10) .totalHits - .value); + .value()); } } numFullFlushes.decrementAndGet(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java index 339368da1f81..ddd25dd7682c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java +++ 
b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java @@ -96,8 +96,8 @@ public DocValuesType docValuesType() { } @Override - public boolean hasDocValuesSkipIndex() { - return false; + public DocValuesSkipIndexType docValuesSkipIndexType() { + return DocValuesSkipIndexType.NONE; } @Override @@ -284,7 +284,7 @@ public void remove() { } final TopDocs hits = s.search(new TermQuery(new Term("id", "" + id)), 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final int docID = hits.scoreDocs[0].doc; final Document doc = storedFields.document(docID); final int endCounter = counter + fieldsPerDoc[id]; @@ -354,14 +354,14 @@ public void remove() { bq.add(new TermQuery(new Term("id", "" + id)), BooleanClause.Occur.MUST); bq.add(new TermQuery(new Term(name, "text")), BooleanClause.Occur.MUST); final TopDocs hits2 = s.search(bq.build(), 1); - assertEquals(1, hits2.totalHits.value); + assertEquals(1, hits2.totalHits.value()); assertEquals(docID, hits2.scoreDocs[0].doc); bq = new BooleanQuery.Builder(); bq.add(new TermQuery(new Term("id", "" + id)), BooleanClause.Occur.MUST); bq.add(new TermQuery(new Term(name, "" + counter)), BooleanClause.Occur.MUST); final TopDocs hits3 = s.search(bq.build(), 1); - assertEquals(1, hits3.totalHits.value); + assertEquals(1, hits3.totalHits.value()); assertEquals(docID, hits3.scoreDocs[0].doc); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexingSequenceNumbers.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexingSequenceNumbers.java index c920ecd9f24d..66d24d6b9425 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexingSequenceNumbers.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexingSequenceNumbers.java @@ -127,7 +127,7 @@ public void run() { DirectoryReader r = w.getReader(); IndexSearcher s = newSearcher(r); TopDocs hits = s.search(new TermQuery(id), 1); - assertEquals("maxDoc: " + r.maxDoc(), 1, hits.totalHits.value); + assertEquals("maxDoc: " + r.maxDoc(), 1, hits.totalHits.value()); Document doc = r.storedFields().document(hits.scoreDocs[0].doc); assertEquals(maxThread, doc.getField("thread").numericValue().intValue()); r.close(); @@ -276,7 +276,7 @@ public void run() { TopDocs hits = s.search(new TermQuery(new Term("id", "" + id)), 1); if (expectedThreadIDs[id] != -1) { - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); Document doc = r.storedFields().document(hits.scoreDocs[0].doc); int actualThreadID = doc.getField("thread").numericValue().intValue(); if (expectedThreadIDs[id] != actualThreadID) { @@ -306,14 +306,14 @@ public void run() { } assertEquals("id=" + id, expectedThreadIDs[id], actualThreadID); } - } else if (hits.totalHits.value != 0) { + } else if (hits.totalHits.value() != 0) { System.out.println( "FAIL: id=" + id + " expectedThreadID=" + expectedThreadIDs[id] + " vs totalHits=" - + hits.totalHits.value + + hits.totalHits.value() + " commitSeqNo=" + commitSeqNo + " numThreads=" @@ -331,7 +331,7 @@ public void run() { } } } - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); } } w.close(); @@ -482,7 +482,7 @@ public void run() { // We pre-add all ids up front: assert expectedThreadIDs[id] != -1; - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); int hitDoc = hits.scoreDocs[0].doc; assertEquals(hitDoc, docValues.advance(hitDoc)); int actualThreadID = (int) docValues.longValue(); diff --git 
a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java index 72be0bd929fa..a3d655ebe3b0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java @@ -30,8 +30,6 @@ import java.util.Set; import java.util.concurrent.CountDownLatch; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.FilterCodec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; @@ -85,33 +83,15 @@ public void setup() { vectorEncoding = randomVectorEncoding(); boolean quantized = randomBoolean(); codec = - new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { - @Override - public KnnVectorsFormat knnVectorsFormat() { - return new PerFieldKnnVectorsFormat() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return quantized - ? new Lucene99HnswScalarQuantizedVectorsFormat( - M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH) - : new Lucene99HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH); - } - }; - } - }; + TestUtil.alwaysKnnVectorsFormat( + quantized + ? new Lucene99HnswScalarQuantizedVectorsFormat( + M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH) + : new Lucene99HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH)); float32Codec = - new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { - @Override - public KnnVectorsFormat knnVectorsFormat() { - return new PerFieldKnnVectorsFormat() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH); - } - }; - } - }; + TestUtil.alwaysKnnVectorsFormat( + new Lucene99HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH)); } private VectorEncoding randomVectorEncoding() { @@ -413,11 +393,13 @@ private void assertConsistentGraph(IndexWriter iw, float[][] values, String vect // stored vector values are the same as original int nextDocWithVectors = 0; StoredFields storedFields = reader.storedFields(); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); for (int i = 0; i < reader.maxDoc(); i++) { - nextDocWithVectors = vectorValues.advance(i); + nextDocWithVectors = iterator.advance(i); while (i < nextDocWithVectors && i < reader.maxDoc()) { int id = Integer.parseInt(storedFields.document(i).get("id")); - assertNull("document " + id + " has no vector, but was expected to", values[id]); + assertNull( + "document " + id + ", expected to have no vector, does have one", values[id]); ++i; } if (nextDocWithVectors == NO_MORE_DOCS) { @@ -425,7 +407,7 @@ private void assertConsistentGraph(IndexWriter iw, float[][] values, String vect } int id = Integer.parseInt(storedFields.document(i).get("id")); // documents with KnnGraphValues have the expected vectors - float[] scratch = vectorValues.vectorValue(); + float[] scratch = vectorValues.vectorValue(iterator.index()); assertArrayEquals( "vector did not match for doc " + i + ", id=" + id + ": " + Arrays.toString(scratch), values[id], @@ -435,9 +417,9 @@ private void assertConsistentGraph(IndexWriter iw, float[][] values, String vect } // if IndexDisi.doc == NO_MORE_DOCS, we should not call IndexDisi.nextDoc() if (nextDocWithVectors != 
NO_MORE_DOCS) { - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } else { - assertEquals(NO_MORE_DOCS, vectorValues.docID()); + assertEquals(NO_MORE_DOCS, iterator.docID()); } // assert graph values: @@ -560,7 +542,6 @@ private void add( String idString = Integer.toString(id); doc.add(new StringField("id", idString, Field.Store.YES)); doc.add(new SortedDocValuesField("id", new BytesRef(idString))); - // XSSystem.out.println("add " + idString + " " + Arrays.toString(vector)); iw.updateDocument(new Term("id", idString), doc); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMixedDocValuesUpdates.java b/lucene/core/src/test/org/apache/lucene/index/TestMixedDocValuesUpdates.java index 086b45a5ec77..a5f8021bb58f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMixedDocValuesUpdates.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMixedDocValuesUpdates.java @@ -476,7 +476,7 @@ public void testTryUpdateDocValues() throws IOException { for (LeafReaderContext c : reader.leaves()) { TopDocs topDocs = new IndexSearcher(c.reader()).search(new TermQuery(new Term("id", "" + doc)), 10); - if (topDocs.totalHits.value == 1) { + if (topDocs.totalHits.value() == 1) { assertNull(numericIdValues); assertNull(binaryIdValues); numericIdValues = c.reader().getNumericDocValues("numericId"); @@ -484,7 +484,7 @@ public void testTryUpdateDocValues() throws IOException { binaryIdValues = c.reader().getBinaryDocValues("binaryId"); assertEquals(topDocs.scoreDocs[0].doc, binaryIdValues.advance(topDocs.scoreDocs[0].doc)); } else { - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } } @@ -564,7 +564,7 @@ public void testTryUpdateMultiThreaded() Long value = values[i]; TopDocs topDocs = new IndexSearcher(reader).search(new TermQuery(new Term("id", "" + i)), 10); - assertEquals(topDocs.totalHits.value, 1); + assertEquals(topDocs.totalHits.value(), 1); int docID = topDocs.scoreDocs[0].doc; List leaves = reader.leaves(); int subIndex = ReaderUtil.subIndex(docID, leaves); @@ -591,7 +591,7 @@ static void doUpdate(Term doc, IndexWriter writer, Field... 
fields) throws IOExc do { // retry if we just committing a merge try (DirectoryReader reader = DirectoryReader.open(writer)) { TopDocs topDocs = new IndexSearcher(reader).search(new TermQuery(doc), 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); int theDoc = topDocs.scoreDocs[0].doc; seqId = writer.tryUpdateDocValue(reader, theDoc, fields); } @@ -676,7 +676,7 @@ public void testResetValueMultipleDocs() throws Exception { IndexSearcher searcher = new IndexSearcher(reader); TopDocs is_live = searcher.search(new FieldExistsQuery("is_live"), 5); - assertEquals(numHits, is_live.totalHits.value); + assertEquals(numHits, is_live.totalHits.value()); StoredFields storedFields = reader.storedFields(); for (ScoreDoc doc : is_live.scoreDocs) { int id = Integer.parseInt(storedFields.document(doc.doc).get("id")); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java b/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java index 4a54f6b1cf2d..d5f014b302d9 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java @@ -170,7 +170,7 @@ public void testBiasedMixOfRandomUpdates() throws Exception { new TermQuery(new Term("id", id)), 1, new Sort(new SortField("val", SortField.Type.LONG))); - assertEquals(id + " missing?", 1, td.totalHits.value); + assertEquals(id + " missing?", 1, td.totalHits.value()); assertEquals(id + " value", expect.getValue(), ((FieldDoc) td.scoreDocs[0]).fields[0]); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index cb57e9836919..d3236845a4bc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -191,7 +191,7 @@ public void testApplyUpdates() throws IOException { false, IndexOptions.NONE, DocValuesType.NUMERIC, - false, + DocValuesSkipIndexType.NONE, 0, Collections.emptyMap(), 0, @@ -231,7 +231,7 @@ public void testApplyUpdates() throws IOException { false, IndexOptions.NONE, DocValuesType.NUMERIC, - false, + DocValuesSkipIndexType.NONE, 1, Collections.emptyMap(), 0, @@ -297,7 +297,7 @@ public void testUpdateAppliedOnlyOnce() throws IOException { false, IndexOptions.NONE, DocValuesType.NUMERIC, - false, + DocValuesSkipIndexType.NONE, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, @@ -368,7 +368,7 @@ public void testResetOnUpdate() throws IOException { false, IndexOptions.NONE, DocValuesType.NUMERIC, - false, + DocValuesSkipIndexType.NONE, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, @@ -407,7 +407,7 @@ public void testResetOnUpdate() throws IOException { false, IndexOptions.NONE, DocValuesType.NUMERIC, - false, + DocValuesSkipIndexType.NONE, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestReadOnlyIndex.java b/lucene/core/src/test/org/apache/lucene/index/TestReadOnlyIndex.java index 9ea726f8abff..e6246a1de2c6 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestReadOnlyIndex.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestReadOnlyIndex.java @@ -87,7 +87,7 @@ private Void doTestReadOnlyIndex() throws Exception { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new 
TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: StoredFields storedFields = isearcher.storedFields(); for (int i = 0; i < hits.scoreDocs.length; i++) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java b/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java index a1b70deba667..14d09f8d9414 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java @@ -83,7 +83,7 @@ public void testRollingUpdates() throws Exception { final boolean doUpdate; if (s != null && updateCount < SIZE) { TopDocs hits = s.search(new TermQuery(idTerm), 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); doUpdate = w.tryDeleteDocument(r, hits.scoreDocs[0].doc) == -1; if (VERBOSE) { if (doUpdate) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java index 90b0a07aa343..e222c20d639d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java @@ -71,8 +71,8 @@ public void testVersionsOneSegment() throws IOException { SegmentInfo info = new SegmentInfo( dir, - Version.LUCENE_10_0_0, - Version.LUCENE_10_0_0, + Version.LUCENE_11_0_0, + Version.LUCENE_11_0_0, "_0", 1, false, @@ -90,7 +90,7 @@ public void testVersionsOneSegment() throws IOException { sis.add(commitInfo); sis.commit(dir); sis = SegmentInfos.readLatestCommit(dir); - assertEquals(Version.LUCENE_10_0_0, sis.getMinSegmentLuceneVersion()); + assertEquals(Version.LUCENE_11_0_0, sis.getMinSegmentLuceneVersion()); assertEquals(Version.LATEST, sis.getCommitLuceneVersion()); dir.close(); } @@ -106,8 +106,8 @@ public void testVersionsTwoSegments() throws IOException { SegmentInfo info = new SegmentInfo( dir, - Version.LUCENE_10_0_0, - Version.LUCENE_10_0_0, + Version.LUCENE_11_0_0, + Version.LUCENE_11_0_0, "_0", 1, false, @@ -126,8 +126,8 @@ public void testVersionsTwoSegments() throws IOException { info = new SegmentInfo( dir, - Version.LUCENE_10_0_0, - Version.LUCENE_10_0_0, + Version.LUCENE_11_0_0, + Version.LUCENE_11_0_0, "_1", 1, false, @@ -146,7 +146,7 @@ public void testVersionsTwoSegments() throws IOException { byte[] commitInfoId0 = sis.info(0).getId(); byte[] commitInfoId1 = sis.info(1).getId(); sis = SegmentInfos.readLatestCommit(dir); - assertEquals(Version.LUCENE_10_0_0, sis.getMinSegmentLuceneVersion()); + assertEquals(Version.LUCENE_11_0_0, sis.getMinSegmentLuceneVersion()); assertEquals(Version.LATEST, sis.getCommitLuceneVersion()); assertEquals( StringHelper.idToString(commitInfoId0), StringHelper.idToString(sis.info(0).getId())); @@ -277,8 +277,8 @@ public void testIDChangesOnAdvance() throws IOException { SegmentInfo info = new SegmentInfo( dir, - Version.LUCENE_9_0_0, - Version.LUCENE_9_0_0, + Version.LUCENE_10_0_0, + Version.LUCENE_10_0_0, "_0", 1, false, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index 94a1849a35e2..d3f7ec8a803c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -138,7 +138,7 @@ public void testMerge() throws 
IOException { int tvCount = 0; for (FieldInfo fieldInfo : mergedReader.getFieldInfos()) { - if (fieldInfo.hasVectors()) { + if (fieldInfo.hasTermVectors()) { tvCount++; } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentReader.java index 71bb8b167b1e..b2d9d37613d3 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentReader.java @@ -90,7 +90,7 @@ public void testGetFieldNameVariations() { } else { notIndexedFieldNames.add(name); } - if (fieldInfo.hasVectors()) { + if (fieldInfo.hasTermVectors()) { tvFieldNames.add(name); } else if (fieldInfo.getIndexOptions() != IndexOptions.NONE) { noTVFieldNames.add(name); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java index 609dd0359ab5..f3016d4b82f3 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java @@ -17,29 +17,24 @@ package org.apache.lucene.index; +import com.carrotsearch.randomizedtesting.RandomizedTest; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; import org.apache.lucene.document.Document; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.Query; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.NamedThreadFactory; import org.apache.lucene.util.Version; public class TestSegmentToThreadMapping extends LuceneTestCase { - public LeafReader dummyIndexReader(final int maxDoc) { + private static LeafReader dummyIndexReader(final int maxDoc) { return new LeafReader() { @Override public int maxDoc() { @@ -160,83 +155,230 @@ public CacheHelper getReaderCacheHelper() { }; } - public void testSingleSlice() { - LeafReader largeSegmentReader = dummyIndexReader(50_000); - LeafReader firstMediumSegmentReader = dummyIndexReader(30_000); - LeafReader secondMediumSegmentReader = dummyIndexReader(30__000); - LeafReader thirdMediumSegmentReader = dummyIndexReader(30_000); + private static List createLeafReaderContexts(int... 
maxDocs) { List leafReaderContexts = new ArrayList<>(); + for (int maxDoc : maxDocs) { + leafReaderContexts.add(new LeafReaderContext(dummyIndexReader(maxDoc))); + } + Collections.shuffle(leafReaderContexts, random()); + return leafReaderContexts; + } - leafReaderContexts.add(new LeafReaderContext(largeSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(firstMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(secondMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(thirdMediumSegmentReader)); - - IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 250_000, 5); + public void testSingleSlice() { + List leafReaderContexts = + createLeafReaderContexts(50_000, 30_000, 30_000, 30_000); + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices( + leafReaderContexts, 250_000, RandomizedTest.randomIntBetween(4, 10), false); + assertEquals(1, resultSlices.length); + assertEquals(4, resultSlices[0].partitions.length); + } - assertTrue(resultSlices.length == 1); + public void testSingleSliceWithPartitions() { + List leafReaderContexts = + createLeafReaderContexts(50_000, 30_000, 30_000, 30_000); + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices( + leafReaderContexts, 250_000, RandomizedTest.randomIntBetween(4, 10), true); + assertEquals(1, resultSlices.length); + assertEquals(4, resultSlices[0].partitions.length); + } - final LeafReaderContext[] leaves = resultSlices[0].leaves; + public void testMaxSegmentsPerSlice() { + List leafReaderContexts = + createLeafReaderContexts(50_000, 30_000, 30_000, 30_000); + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 3, false); + assertEquals(2, resultSlices.length); + assertEquals(3, resultSlices[0].partitions.length); + assertEquals(110_000, resultSlices[0].getMaxDocs()); + assertEquals(1, resultSlices[1].partitions.length); + assertEquals(30_000, resultSlices[1].getMaxDocs()); + } + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 2, false); + assertEquals(2, resultSlices.length); + assertEquals(2, resultSlices[0].partitions.length); + assertEquals(80_000, resultSlices[0].getMaxDocs()); + assertEquals(2, resultSlices[1].partitions.length); + assertEquals(60_000, resultSlices[1].getMaxDocs()); + } + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 1, false); + assertEquals(4, resultSlices.length); + assertEquals(1, resultSlices[0].partitions.length); + assertEquals(50_000, resultSlices[0].getMaxDocs()); + assertEquals(1, resultSlices[1].partitions.length); + assertEquals(30_000, resultSlices[1].getMaxDocs()); + assertEquals(1, resultSlices[2].partitions.length); + assertEquals(30_000, resultSlices[2].getMaxDocs()); + assertEquals(1, resultSlices[3].partitions.length); + assertEquals(30_000, resultSlices[3].getMaxDocs()); + } + } - assertTrue(leaves.length == 4); + public void testMaxSegmentsPerSliceWithPartitions() { + List leafReaderContexts = + createLeafReaderContexts(50_000, 30_000, 30_000, 30_000); + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 3, true); + assertEquals(2, resultSlices.length); + assertEquals(3, resultSlices[0].partitions.length); + assertEquals(110_000, resultSlices[0].getMaxDocs()); + assertEquals(1, resultSlices[1].partitions.length); + assertEquals(30_000, resultSlices[1].getMaxDocs()); + } + { + IndexSearcher.LeafSlice[] resultSlices = + 
IndexSearcher.slices(leafReaderContexts, 250_000, 2, true); + assertEquals(2, resultSlices.length); + assertEquals(2, resultSlices[0].partitions.length); + assertEquals(80_000, resultSlices[0].getMaxDocs()); + assertEquals(2, resultSlices[1].partitions.length); + assertEquals(60_000, resultSlices[1].getMaxDocs()); + } + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 1, true); + assertEquals(4, resultSlices.length); + assertEquals(1, resultSlices[0].partitions.length); + assertEquals(50_000, resultSlices[0].getMaxDocs()); + assertEquals(1, resultSlices[1].partitions.length); + assertEquals(30_000, resultSlices[1].getMaxDocs()); + assertEquals(1, resultSlices[2].partitions.length); + assertEquals(30_000, resultSlices[2].getMaxDocs()); + assertEquals(1, resultSlices[3].partitions.length); + assertEquals(30_000, resultSlices[3].getMaxDocs()); + } } public void testSmallSegments() { - LeafReader firstMediumSegmentReader = dummyIndexReader(10_000); - LeafReader secondMediumSegmentReader = dummyIndexReader(10_000); - LeafReader thirdMediumSegmentReader = dummyIndexReader(10_000); - LeafReader fourthMediumSegmentReader = dummyIndexReader(10_000); - LeafReader fifthMediumSegmentReader = dummyIndexReader(10_000); - LeafReader sixthMediumSegmentReader = dummyIndexReader(10_000); - LeafReader seventhLargeSegmentReader = dummyIndexReader(130_000); - LeafReader eigthLargeSegmentReader = dummyIndexReader(130_000); - List leafReaderContexts = new ArrayList<>(); - - leafReaderContexts.add(new LeafReaderContext(firstMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(secondMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(thirdMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(fourthMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(fifthMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(sixthMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(seventhLargeSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(eigthLargeSegmentReader)); - - IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 250_000, 5); - - assertTrue(resultSlices.length == 3); - - final LeafReaderContext[] firstSliceleaves = resultSlices[0].leaves; - final LeafReaderContext[] secondSliceleaves = resultSlices[1].leaves; - final LeafReaderContext[] thirdSliceleaves = resultSlices[2].leaves; + List leafReaderContexts = + createLeafReaderContexts(10_000, 10_000, 10_000, 10_000, 10_000, 10_000, 130_000, 130_000); + + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 5, false); + assertEquals(3, resultSlices.length); + + assertEquals(2, resultSlices[0].partitions.length); + assertEquals(260_000, resultSlices[0].getMaxDocs()); + assertEquals(5, resultSlices[1].partitions.length); + assertEquals(50_000, resultSlices[1].getMaxDocs()); + assertEquals(1, resultSlices[2].partitions.length); + assertEquals(10_000, resultSlices[2].getMaxDocs()); + } + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 130_000, 5, false); + assertEquals(3, resultSlices.length); + // this is odd, because we allow two segments in the same slice with both size == + // maxDocsPerSlice + assertEquals(2, resultSlices[0].partitions.length); + assertEquals(260_000, resultSlices[0].getMaxDocs()); + assertEquals(5, resultSlices[1].partitions.length); + assertEquals(50_000, 
resultSlices[1].getMaxDocs()); + assertEquals(1, resultSlices[2].partitions.length); + assertEquals(10_000, resultSlices[2].getMaxDocs()); + } + } - assertTrue(firstSliceleaves.length == 2); - assertTrue(secondSliceleaves.length == 5); - assertTrue(thirdSliceleaves.length == 1); + public void testSmallSegmentsWithPartitions() { + List leafReaderContexts = + createLeafReaderContexts(10_000, 10_000, 10_000, 10_000, 10_000, 10_000, 130_000, 130_000); + + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 5, true); + assertEquals(3, resultSlices.length); + + assertEquals(2, resultSlices[0].partitions.length); + assertEquals(260_000, resultSlices[0].getMaxDocs()); + assertEquals(5, resultSlices[1].partitions.length); + assertEquals(50_000, resultSlices[1].getMaxDocs()); + assertEquals(1, resultSlices[2].partitions.length); + assertEquals(10_000, resultSlices[2].getMaxDocs()); + } + { + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 130_000, 5, true); + assertEquals(3, resultSlices.length); + // this is odd, because we allow two segments in the same slice with both size == + // maxDocsPerSlice + assertEquals(2, resultSlices[0].partitions.length); + assertEquals(260_000, resultSlices[0].getMaxDocs()); + assertEquals(5, resultSlices[1].partitions.length); + assertEquals(50_000, resultSlices[1].getMaxDocs()); + assertEquals(1, resultSlices[2].partitions.length); + assertEquals(10_000, resultSlices[2].getMaxDocs()); + } } public void testLargeSlices() { - LeafReader largeSegmentReader = dummyIndexReader(290_900); - LeafReader firstMediumSegmentReader = dummyIndexReader(170_000); - LeafReader secondMediumSegmentReader = dummyIndexReader(170_000); - LeafReader thirdMediumSegmentReader = dummyIndexReader(170_000); - List leafReaderContexts = new ArrayList<>(); - - leafReaderContexts.add(new LeafReaderContext(largeSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(firstMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(secondMediumSegmentReader)); - leafReaderContexts.add(new LeafReaderContext(thirdMediumSegmentReader)); - - IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 250_000, 5); + List leafReaderContexts = + createLeafReaderContexts(290_900, 170_000, 170_000, 170_000); + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 5, false); + + assertEquals(3, resultSlices.length); + assertEquals(1, resultSlices[0].partitions.length); + assertEquals(2, resultSlices[1].partitions.length); + assertEquals(1, resultSlices[2].partitions.length); + } - assertTrue(resultSlices.length == 3); + public void testLargeSlicesWithPartitions() { + List leafReaderContexts = + createLeafReaderContexts(290_900, 170_000, 170_000, 170_000); + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices( + leafReaderContexts, 250_000, RandomizedTest.randomIntBetween(5, 10), true); + + assertEquals(4, resultSlices.length); + assertEquals(1, resultSlices[0].partitions.length); + assertEquals(145_450, resultSlices[0].getMaxDocs()); + assertEquals(1, resultSlices[1].partitions.length); + assertEquals(145_450, resultSlices[1].getMaxDocs()); + assertEquals(2, resultSlices[2].partitions.length); + assertEquals(340_000, resultSlices[2].getMaxDocs()); + assertEquals(1, resultSlices[3].partitions.length); + assertEquals(170_000, resultSlices[3].getMaxDocs()); + } - final LeafReaderContext[] firstSliceleaves = resultSlices[0].leaves; - final 
LeafReaderContext[] secondSliceleaves = resultSlices[1].leaves; - final LeafReaderContext[] thirdSliceleaves = resultSlices[2].leaves; + public void testSingleSegmentPartitions() { + List leafReaderContexts = createLeafReaderContexts(750_001); + IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices( + leafReaderContexts, 250_000, RandomizedTest.randomIntBetween(1, 10), true); + + assertEquals(4, resultSlices.length); + assertEquals(1, resultSlices[0].partitions.length); + assertEquals(187_500, resultSlices[0].getMaxDocs()); + assertEquals(1, resultSlices[1].partitions.length); + assertEquals(187_500, resultSlices[1].getMaxDocs()); + assertEquals(1, resultSlices[2].partitions.length); + assertEquals(187_500, resultSlices[2].getMaxDocs()); + assertEquals(1, resultSlices[3].partitions.length); + assertEquals(187_501, resultSlices[3].getMaxDocs()); + } - assertTrue(firstSliceleaves.length == 1); - assertTrue(secondSliceleaves.length == 2); - assertTrue(thirdSliceleaves.length == 1); + public void testExtremeSegmentsPartitioning() { + List leafReaderContexts = createLeafReaderContexts(2, 5, 10); + IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 1, 1, true); + + assertEquals(12, resultSlices.length); + int i = 0; + for (IndexSearcher.LeafSlice leafSlice : resultSlices) { + if (i++ > 4) { + assertEquals(1, leafSlice.getMaxDocs()); + } else { + assertEquals(2, leafSlice.getMaxDocs()); + } + assertEquals(1, leafSlice.partitions.length); + } } public void testIntraSliceDocIDOrder() throws Exception { @@ -251,33 +393,54 @@ public void testIntraSliceDocIDOrder() throws Exception { IndexReader r = w.getReader(); w.close(); - ExecutorService service = - new ThreadPoolExecutor( - 4, - 4, - 0L, - TimeUnit.MILLISECONDS, - new LinkedBlockingQueue(), - new NamedThreadFactory("TestSegmentToThreadMapping")); - IndexSearcher s = new IndexSearcher(r, service); - Query query = new MatchAllDocsQuery(); + IndexSearcher s = new IndexSearcher(r, command -> {}); + IndexSearcher.LeafSlice[] slices = s.getSlices(); + assertNotNull(slices); + + for (IndexSearcher.LeafSlice leafSlice : slices) { + int previousDocBase = leafSlice.partitions[0].ctx.docBase; - s.search(query, Integer.MAX_VALUE); + for (IndexSearcher.LeafReaderContextPartition leafReaderContextPartition : + leafSlice.partitions) { + assertTrue(previousDocBase <= leafReaderContextPartition.ctx.docBase); + previousDocBase = leafReaderContextPartition.ctx.docBase; + } + } + IOUtils.close(r, dir); + } + + public void testIntraSliceDocIDOrderWithPartitions() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + w.addDocument(new Document()); + w.addDocument(new Document()); + w.commit(); + w.addDocument(new Document()); + w.addDocument(new Document()); + w.commit(); + IndexReader r = w.getReader(); + w.close(); + IndexSearcher s = + new IndexSearcher(r, command -> {}) { + @Override + protected LeafSlice[] slices(List leaves) { + // force partitioning of segment with max docs per slice set to 1: 1 doc per partition. 
+ return slices(leaves, 1, 1, true); + } + }; IndexSearcher.LeafSlice[] slices = s.getSlices(); assertNotNull(slices); for (IndexSearcher.LeafSlice leafSlice : slices) { - LeafReaderContext[] leafReaderContexts = leafSlice.leaves; - int previousDocBase = leafReaderContexts[0].docBase; + int previousDocBase = leafSlice.partitions[0].ctx.docBase; - for (LeafReaderContext leafReaderContext : leafReaderContexts) { - assertTrue(previousDocBase <= leafReaderContext.docBase); - previousDocBase = leafReaderContext.docBase; + for (IndexSearcher.LeafReaderContextPartition leafReaderContextPartition : + leafSlice.partitions) { + assertTrue(previousDocBase <= leafReaderContextPartition.ctx.docBase); + previousDocBase = leafReaderContextPartition.ctx.docBase; } } - - service.shutdown(); IOUtils.close(r, dir); } @@ -291,9 +454,8 @@ public void testRandom() { leafReaderContexts.add( new LeafReaderContext(dummyIndexReader(random().nextInt((max - min) + 1) + min))); } - - IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 250_000, 5); - + final IndexSearcher.LeafSlice[] resultSlices = + IndexSearcher.slices(leafReaderContexts, 250_000, 5, random().nextBoolean()); assertTrue(resultSlices.length > 0); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java index 2098f57910dc..5214b97fdc5f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java @@ -306,6 +306,12 @@ public void testAvoidWrappingReadersWithoutSoftDeletes() throws Exception { softDeletesField, MatchNoDocsQuery::new, mergePolicy)); writer.forceMerge(1); try (DirectoryReader reader = DirectoryReader.open(writer)) { + for (LeafReaderContext leafContext : reader.leaves()) { + assertThat(leafContext.reader(), instanceOf(SegmentReader.class)); + SegmentReader segmentReader = (SegmentReader) leafContext.reader(); + assertNull(segmentReader.getLiveDocs()); + assertNull(segmentReader.getHardLiveDocs()); + } SoftDeletesDirectoryReaderWrapper wrapped = new SoftDeletesDirectoryReaderWrapper(reader, softDeletesField); assertEquals(numDocs, wrapped.numDocs()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java index 1df06462d0e5..0e492573409b 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java @@ -340,10 +340,10 @@ public void testSoftDeleteWithRetention() throws IOException, InterruptedExcepti for (String id : ids) { TopDocs topDocs = searcher.search(new TermQuery(new Term("id", id)), 10); if (updateSeveralDocs) { - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); assertEquals(Math.abs(topDocs.scoreDocs[0].doc - topDocs.scoreDocs[1].doc), 1); } else { - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); } } writer.addDocument(new Document()); // add a dummy doc to trigger a segment here @@ -386,13 +386,13 @@ public int numDocs() { TopDocs seq_id = searcher.search( IntPoint.newRangeQuery("seq_id", seqIds.intValue() - 50, Integer.MAX_VALUE), 10); - assertTrue(seq_id.totalHits.value + " hits", 
seq_id.totalHits.value >= 50); + assertTrue(seq_id.totalHits.value() + " hits", seq_id.totalHits.value() >= 50); searcher = new IndexSearcher(reader); for (String id : ids) { if (updateSeveralDocs) { - assertEquals(2, searcher.search(new TermQuery(new Term("id", id)), 10).totalHits.value); + assertEquals(2, searcher.search(new TermQuery(new Term("id", id)), 10).totalHits.value()); } else { - assertEquals(1, searcher.search(new TermQuery(new Term("id", id)), 10).totalHits.value); + assertEquals(1, searcher.search(new TermQuery(new Term("id", id)), 10).totalHits.value()); } } IOUtils.close(reader, writer, dir); @@ -651,7 +651,7 @@ public void testMergeSoftDeleteAndHardDelete() throws Exception { TopDocs topDocs = new IndexSearcher(new IncludeSoftDeletesWrapper(reader)) .search(new TermQuery(new Term("id", "1")), 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); if (writer.tryDeleteDocument(reader, topDocs.scoreDocs[0].doc) > 0) { break; } @@ -784,7 +784,7 @@ static void doUpdate(Term doc, IndexWriter writer, Field... fields) throws IOExc try (DirectoryReader reader = DirectoryReader.open(writer)) { TopDocs topDocs = new IndexSearcher(new IncludeSoftDeletesWrapper(reader)).search(new TermQuery(doc), 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); int theDoc = topDocs.scoreDocs[0].doc; seqId = writer.tryUpdateDocValue(reader, theDoc, fields); } @@ -797,7 +797,7 @@ static void doDelete(Term doc, IndexWriter writer) throws IOException { try (DirectoryReader reader = DirectoryReader.open(writer)) { TopDocs topDocs = new IndexSearcher(new IncludeSoftDeletesWrapper(reader)).search(new TermQuery(doc), 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); int theDoc = topDocs.scoreDocs[0].doc; seqId = writer.tryDeleteDocument(reader, theDoc); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index 10efbd881a22..9663d6762554 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -138,7 +138,7 @@ public void testSortOnAddIndicesInt() throws IOException { assertEquals(7, values.longValue()); assertEquals(2, values.nextDoc()); assertEquals(18, values.longValue()); - assertNotNull(leaf.getMetaData().getSort()); + assertNotNull(leaf.getMetaData().sort()); IOUtils.close(r, w, dir, tmpDir); } @@ -242,6 +242,7 @@ public void testSortOnAddIndicesRandom() throws IOException { NumericDocValues ids = leaf.getNumericDocValues("id"); long prevValue = -1; boolean usingAltIds = false; + KnnVectorValues.DocIndexIterator valuesIterator = vectorValues.iterator(); for (int i = 0; i < actualNumDocs; i++) { int idNext = ids.nextDoc(); if (idNext == DocIdSetIterator.NO_MORE_DOCS) { @@ -254,6 +255,7 @@ public void testSortOnAddIndicesRandom() throws IOException { sorted_set_dv = leaf.getSortedSetDocValues("sorted_set_dv"); binary_sorted_dv = leaf.getSortedDocValues("binary_sorted_dv"); vectorValues = leaf.getFloatVectorValues("vector"); + valuesIterator = vectorValues.iterator(); prevValue = -1; } assertTrue(prevValue + " < " + ids.longValue(), prevValue < ids.longValue()); @@ -262,7 +264,7 @@ public void testSortOnAddIndicesRandom() throws IOException { assertTrue(sorted_numeric_dv.advanceExact(idNext)); assertTrue(sorted_set_dv.advanceExact(idNext)); 
assertTrue(binary_sorted_dv.advanceExact(idNext)); - assertEquals(idNext, vectorValues.advance(idNext)); + assertEquals(idNext, valuesIterator.advance(idNext)); assertEquals(new BytesRef(ids.longValue() + ""), binary_dv.binaryValue()); assertEquals( new BytesRef(ids.longValue() + ""), @@ -274,7 +276,7 @@ public void testSortOnAddIndicesRandom() throws IOException { assertEquals(1, sorted_numeric_dv.docValueCount()); assertEquals(ids.longValue(), sorted_numeric_dv.nextValue()); - float[] vectorValue = vectorValues.vectorValue(); + float[] vectorValue = vectorValues.vectorValue(valuesIterator.index()); assertEquals(1, vectorValue.length); assertEquals((float) ids.longValue(), vectorValue[0], 0.001f); @@ -290,12 +292,12 @@ public void testSortOnAddIndicesRandom() throws IOException { IndexSearcher searcher = new IndexSearcher(r); TopDocs result = searcher.search(LongPoint.newExactQuery("point_id", ids.longValue()), 1); - assertEquals(1, result.totalHits.value); + assertEquals(1, result.totalHits.value()); assertEquals(idNext, result.scoreDocs[0].doc); result = searcher.search(new TermQuery(new Term("string_id", "" + ids.longValue())), 1); - assertEquals(1, result.totalHits.value); + assertEquals(1, result.totalHits.value()); assertEquals(idNext, result.scoreDocs[0].doc); } assertEquals(DocIdSetIterator.NO_MORE_DOCS, ids.nextDoc()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestStoredFieldsConsumer.java b/lucene/core/src/test/org/apache/lucene/index/TestStoredFieldsConsumer.java new file mode 100644 index 000000000000..e0c46bc58a70 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestStoredFieldsConsumer.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FlushInfo; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.Version; + +public class TestStoredFieldsConsumer extends LuceneTestCase { + + public void testFinish() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(); + SegmentInfo si = + new SegmentInfo( + dir, + Version.LATEST, + null, + "_0", + -1, + false, + false, + iwc.getCodec(), + Collections.emptyMap(), + StringHelper.randomId(), + new HashMap<>(), + null); + + AtomicInteger startDocCounter = new AtomicInteger(), finishDocCounter = new AtomicInteger(); + StoredFieldsConsumer consumer = + new StoredFieldsConsumer(iwc.getCodec(), dir, si) { + @Override + void startDocument(int docID) throws IOException { + super.startDocument(docID); + startDocCounter.incrementAndGet(); + } + + @Override + void finishDocument() throws IOException { + super.finishDocument(); + finishDocCounter.incrementAndGet(); + } + }; + + int numDocs = 3; + consumer.finish(numDocs); + + si.setMaxDoc(numDocs); + SegmentWriteState state = + new SegmentWriteState( + null, + dir, + si, + new FieldInfos(new FieldInfo[0]), + null, + new IOContext(new FlushInfo(numDocs, 10))); + consumer.flush(state, null); + dir.close(); + + assertEquals(numDocs, startDocCounter.get()); + assertEquals(numDocs, finishDocCounter.get()); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestStressDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestStressDeletes.java index ceecf32928d9..984a2340d01d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestStressDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestStressDeletes.java @@ -110,9 +110,9 @@ public void run() { int id = ent.getKey(); TopDocs hits = s.search(new TermQuery(new Term("id", "" + id)), 1); if (ent.getValue()) { - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); } else { - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); } } r.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestStressNRT.java b/lucene/core/src/test/org/apache/lucene/index/TestStressNRT.java index 955c335a0851..ab8d2aff4c29 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestStressNRT.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestStressNRT.java @@ -418,11 +418,11 @@ public void run() { Query q = new TermQuery(new Term("id", Integer.toString(id))); TopDocs results = searcher.search(q, 10); - if (results.totalHits.value == 0 && tombstones) { + if (results.totalHits.value() == 0 && tombstones) { // if we couldn't find the doc, look for its tombstone q = new TermQuery(new Term("id", "-" + Integer.toString(id))); results = searcher.search(q, 1); - if (results.totalHits.value == 0) { + if (results.totalHits.value() == 0) { if (val == -1L) { // expected... 
no doc was added yet r.decRef(); @@ -438,11 +438,11 @@ public void run() { } } - if (results.totalHits.value == 0 && !tombstones) { + if (results.totalHits.value() == 0 && !tombstones) { // nothing to do - we can't tell anything from a deleted doc without tombstones } else { // we should have found the document, or its tombstone - if (results.totalHits.value != 1) { + if (results.totalHits.value() != 1) { System.out.println("FAIL: hits id:" + id + " val=" + val); for (ScoreDoc sd : results.scoreDocs) { final Document doc = r.storedFields().document(sd.doc); @@ -454,7 +454,7 @@ public void run() { + " foundVal=" + doc.get(field)); } - fail("id=" + id + " reader=" + r + " totalHits=" + results.totalHits.value); + fail("id=" + id + " reader=" + r + " totalHits=" + results.totalHits.value()); } Document doc = searcher.storedFields().document(results.scoreDocs[0].doc); long foundVal = Long.parseLong(doc.get(field)); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermVectorsReader.java b/lucene/core/src/test/org/apache/lucene/index/TestTermVectorsReader.java index 76693291518c..1f3c7382317f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermVectorsReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermVectorsReader.java @@ -190,7 +190,7 @@ public void test() throws IOException { DirectoryReader reader = DirectoryReader.open(dir); for (LeafReaderContext ctx : reader.leaves()) { SegmentReader sr = (SegmentReader) ctx.reader(); - assertTrue(sr.getFieldInfos().hasVectors()); + assertTrue(sr.getFieldInfos().hasTermVectors()); } reader.close(); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java index 17d98e249f36..43c16d8ec154 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java @@ -625,15 +625,7 @@ private BytesRef getNonExistTerm(BytesRef[] terms) { } } - private static class TermAndState { - public final BytesRef term; - public final TermState state; - - public TermAndState(BytesRef term, TermState state) { - this.term = term; - this.state = state; - } - } + private record TermAndState(BytesRef term, TermState state) {} private void testRandomSeeks(IndexReader r, String... 
validTermStrings) throws IOException { final BytesRef[] validTerms = new BytesRef[validTermStrings.length]; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java index ddca01a2387e..105579b25d60 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java @@ -183,7 +183,7 @@ public void testIntersect() throws Exception { Automaton actual = Operations.determinize(Automata.makeStringUnion(found), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(expected, actual)); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java index eb24d9647026..a2d678a3ec04 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Locale; @@ -39,6 +40,8 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { + private record DocCountAndSizeInBytes(int docCount, long sizeInBytes) {} + @Override public TieredMergePolicy mergePolicy() { return newTieredMergePolicy(); @@ -54,7 +57,7 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws int totalDelCount = 0; int totalMaxDoc = 0; long totalBytes = 0; - List segmentSizes = new ArrayList<>(); + List segmentSizes = new ArrayList<>(); for (SegmentCommitInfo sci : infos) { totalDelCount += sci.getDelCount(); totalMaxDoc += sci.info.maxDoc(); @@ -62,10 +65,11 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws double liveRatio = 1 - (double) sci.getDelCount() / sci.info.maxDoc(); long weightedByteSize = (long) (liveRatio * byteSize); totalBytes += weightedByteSize; - segmentSizes.add(weightedByteSize); + segmentSizes.add( + new DocCountAndSizeInBytes(sci.info.maxDoc() - sci.getDelCount(), weightedByteSize)); minSegmentBytes = Math.min(minSegmentBytes, weightedByteSize); } - Collections.sort(segmentSizes); + Collections.sort(segmentSizes, Comparator.comparingLong(DocCountAndSizeInBytes::sizeInBytes)); final double delPercentage = 100.0 * totalDelCount / totalMaxDoc; assertTrue( @@ -78,7 +82,7 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws long levelSizeBytes = Math.max(minSegmentBytes, (long) (tmp.getFloorSegmentMB() * 1024 * 1024)); long bytesLeft = totalBytes; double allowedSegCount = 0; - List biggestSegments = segmentSizes; + List biggestSegments = segmentSizes; if (biggestSegments.size() > tmp.getTargetSearchConcurrency() - 1) { biggestSegments = biggestSegments.subList( @@ -86,11 +90,18 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws biggestSegments.size()); } // Allow whole segments for the targetSearchConcurrency-1 biggest segments - for (long size : biggestSegments) { - bytesLeft -= size; + for (DocCountAndSizeInBytes size : biggestSegments) { + bytesLeft -= size.sizeInBytes(); allowedSegCount++; } + int tooBigCount = 0; + for (DocCountAndSizeInBytes size : segmentSizes) { + if (size.sizeInBytes() >= maxMergedSegmentBytes / 2) { + tooBigCount++; + } + } 
+ // below we make the assumption that segments that reached the max segment // size divided by 2 don't need merging anymore int mergeFactor = (int) Math.min(tmp.getSegmentsPerTier(), tmp.getMaxMergeAtOnce()); @@ -105,39 +116,31 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws bytesLeft -= tmp.getSegmentsPerTier() * levelSizeBytes; levelSizeBytes = Math.min(levelSizeBytes * mergeFactor, maxMergedSegmentBytes / 2); } - allowedSegCount = Math.max(allowedSegCount, tmp.getSegmentsPerTier()); + // Allow at least a full tier in addition of the too big segments. + allowedSegCount = Math.max(allowedSegCount, tooBigCount + tmp.getSegmentsPerTier()); + // Allow at least `targetSearchConcurrency` segments. + allowedSegCount = Math.max(allowedSegCount, tmp.getTargetSearchConcurrency()); - // It's ok to be over the allowed segment count if none of the most balanced merges are balanced - // enough - boolean hasBalancedMerges = false; - for (int i = 0; i < segmentSizes.size() - mergeFactor; ++i) { - long maxMergeSegmentSize = segmentSizes.get(i + mergeFactor - 1); - if (maxMergeSegmentSize >= maxMergedSegmentBytes / 2) { - break; - } - long totalMergeSize = 0; - for (int j = 0; j < i + mergeFactor; ++j) { - totalMergeSize += segmentSizes.get(j); - } - if (maxMergedSegmentBytes * 1.5 <= totalMergeSize) { - hasBalancedMerges = true; + // It's ok to be over the allowed segment count if none of the merges are legal, because they + // are either not balanced or because they exceed the max merged segment doc count. + // We only check pairwise merges instead of every possible merge to keep things simple. If none + // of the pairwise merges are legal, chances are high that no merge is legal. + int maxDocsPerSegment = tmp.getMaxAllowedDocs(infos.totalMaxDoc(), totalDelCount); + boolean hasLegalMerges = false; + for (int i = 0; i < segmentSizes.size() - 1; ++i) { + DocCountAndSizeInBytes size1 = segmentSizes.get(i); + DocCountAndSizeInBytes size2 = segmentSizes.get(i + 1); + long mergedSegmentSizeInBytes = size1.sizeInBytes() + size2.sizeInBytes(); + int mergedSegmentDocCount = size1.docCount() + size2.docCount(); + + if (mergedSegmentSizeInBytes <= maxMergedSegmentBytes + && size2.sizeInBytes() * 1.5 <= mergedSegmentSizeInBytes + && mergedSegmentDocCount <= maxDocsPerSegment) { + hasLegalMerges = true; break; } } - // There can be more segments if we can't merge docs because they are balanced between segments. - // At least the - // 2 smallest segments should be mergeable. 
- // should be 2 segments to merge - int maxDocsPerSegment = tmp.getMaxAllowedDocs(infos.totalMaxDoc(), totalDelCount); - List segmentDocs = - infos.asList().stream() - .map(info -> info.info.maxDoc() - info.getDelCount()) - .sorted() - .toList(); - boolean eligibleDocsMerge = - segmentDocs.size() >= 2 && segmentDocs.get(0) + segmentDocs.get(1) < maxDocsPerSegment; - int numSegments = infos.asList().size(); assertTrue( String.format( @@ -154,7 +157,7 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws delPercentage, tmp.getDeletesPctAllowed(), tmp.getTargetSearchConcurrency()), - numSegments <= allowedSegCount || hasBalancedMerges == false || eligibleDocsMerge == false); + numSegments <= allowedSegCount || hasLegalMerges == false); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTryDelete.java b/lucene/core/src/test/org/apache/lucene/index/TestTryDelete.java index 78d00a1eec4e..a1ecc29ca2c0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTryDelete.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTryDelete.java @@ -71,7 +71,7 @@ public void testTryDeleteDocument() throws IOException { IndexSearcher searcher = mgr.acquire(); TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "0")), 100); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); long result; if (random().nextBoolean()) { @@ -99,7 +99,7 @@ public void testTryDeleteDocument() throws IOException { topDocs = searcher.search(new TermQuery(new Term("foo", "0")), 100); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } public void testTryDeleteDocumentCloseAndReopen() throws IOException { @@ -112,7 +112,7 @@ public void testTryDeleteDocumentCloseAndReopen() throws IOException { IndexSearcher searcher = mgr.acquire(); TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "0")), 100); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); long result = writer.tryDeleteDocument(DirectoryReader.open(writer), 0); @@ -128,7 +128,7 @@ public void testTryDeleteDocumentCloseAndReopen() throws IOException { topDocs = searcher.search(new TermQuery(new Term("foo", "0")), 100); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); writer.close(); @@ -136,7 +136,7 @@ public void testTryDeleteDocumentCloseAndReopen() throws IOException { topDocs = searcher.search(new TermQuery(new Term("foo", "0")), 100); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } public void testDeleteDocuments() throws IOException { @@ -149,7 +149,7 @@ public void testDeleteDocuments() throws IOException { IndexSearcher searcher = mgr.acquire(); TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "0")), 100); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); long result = writer.deleteDocuments(new TermQuery(new Term("foo", "0"))); @@ -165,6 +165,6 @@ public void testDeleteDocuments() throws IOException { topDocs = searcher.search(new TermQuery(new Term("foo", "0")), 100); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } } diff --git a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestCharObjectHashMap.java b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestCharObjectHashMap.java index 4cc036dcfe65..f9a1a259ce83 100644 --- 
a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestCharObjectHashMap.java +++ b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestCharObjectHashMap.java @@ -17,6 +17,8 @@ package org.apache.lucene.internal.hppc; +import static org.apache.lucene.internal.hppc.TestIntObjectHashMap.toList; + import com.carrotsearch.randomizedtesting.RandomizedTest; import java.util.Arrays; import java.util.HashMap; @@ -24,6 +26,8 @@ import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.tests.util.LuceneTestCase; +import org.hamcrest.MatcherAssert; +import org.hamcrest.Matchers; import org.junit.After; import org.junit.Test; @@ -66,13 +70,6 @@ private static void assertSortedListEquals(char[] array, char... elements) { assertArrayEquals(elements, array); } - /** Check if the array's content is identical to a given sequence of elements. */ - private static void assertSortedListEquals(Object[] array, Object... elements) { - assertEquals(elements.length, array.length); - Arrays.sort(array); - assertArrayEquals(elements, array); - } - private final int value0 = vcast(0); private final int value1 = vcast(1); private final int value2 = vcast(2); @@ -603,13 +600,15 @@ public void testMapValues() { map.put(key1, value3); map.put(key2, value2); map.put(key3, value1); - assertSortedListEquals(map.values().toArray(), value1, value2, value3); + MatcherAssert.assertThat( + toList(map.values()), Matchers.containsInAnyOrder(value1, value2, value3)); map.clear(); map.put(key1, value1); map.put(key2, value2); map.put(key3, value2); - assertSortedListEquals(map.values().toArray(), value1, value2, value2); + MatcherAssert.assertThat( + toList(map.values()), Matchers.containsInAnyOrder(value1, value2, value2)); } /* */ diff --git a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntLongHashMap.java b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntLongHashMap.java new file mode 100644 index 000000000000..2af50c908004 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntLongHashMap.java @@ -0,0 +1,699 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.internal.hppc; + +import com.carrotsearch.randomizedtesting.RandomizedTest; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Random; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.junit.After; +import org.junit.Test; + +/** + * Tests for {@link IntLongHashMap}. + * + *

Mostly forked and trimmed from com.carrotsearch.hppc.IntLongHashMapTest + * + *
    github: https://github.com/carrotsearch/hppc release: 0.10.0 + */ +public class TestIntLongHashMap extends LuceneTestCase { + + /* Ready to use key values. */ + + protected int keyE = 0; + protected int key0 = cast(0), k0 = key0; + protected int key1 = cast(1), k1 = key1; + protected int key2 = cast(2), k2 = key2; + protected int key3 = cast(3), k3 = key3; + protected int key4 = cast(4), k4 = key4; + protected int key5 = cast(5), k5 = key5; + protected int key6 = cast(6), k6 = key6; + protected int key7 = cast(7), k7 = key7; + protected int key8 = cast(8), k8 = key8; + protected int key9 = cast(9), k9 = key9; + + protected long value0 = vcast(0); + protected long value1 = vcast(1); + protected long value2 = vcast(2); + protected long value3 = vcast(3); + protected long value4 = vcast(4); + + private static int randomIntBetween(int min, int max) { + return min + random().nextInt(max + 1 - min); + } + + private final int[] newArray(int... elements) { + return elements; + } + + /** Create a new array of a given type and copy the arguments to this array. */ + /* */ + private final long[] newvArray(long... elements) { + return elements; + } + + /** Convert to target type from an integer used to test stuff. */ + private int cast(Integer v) { + return v.intValue(); + } + + /** Convert to target type from an integer used to test stuff. */ + private long vcast(int value) { + return (long) value; + } + + /** Check if the array's content is identical to a given sequence of elements. */ + public static void assertSortedListEquals(int[] array, int... elements) { + assertEquals(elements.length, array.length); + Arrays.sort(array); + Arrays.sort(elements); + assertArrayEquals(elements, array); + } + + /** Check if the array's content is identical to a given sequence of elements. */ + public static void assertSortedListEquals(long[] array, long... elements) { + assertEquals(elements.length, array.length); + Arrays.sort(array); + assertArrayEquals(elements, array); + } + + /** Per-test fresh initialized instance. */ + public IntLongHashMap map = newInstance(); + + protected IntLongHashMap newInstance() { + return new IntLongHashMap(); + } + + @After + public void checkEmptySlotsUninitialized() { + if (map != null) { + int occupied = 0; + for (int i = 0; i <= map.mask; i++) { + if (((map.keys[i]) == 0)) { + + } else { + occupied++; + } + } + assertEquals(occupied, map.assigned); + + if (!map.hasEmptyKey) {} + } + } + + private void assertSameMap(final IntLongHashMap c1, final IntLongHashMap c2) { + assertEquals(c1.size(), c2.size()); + + for (IntLongHashMap.IntLongCursor entry : c1) { + assertTrue(c2.containsKey(entry.key)); + assertEquals(entry.value, c2.get(entry.key)); + } + } + + /* */ + @Test + public void testEnsureCapacity() { + final AtomicInteger expands = new AtomicInteger(); + IntLongHashMap map = + new IntLongHashMap(0) { + @Override + protected void allocateBuffers(int arraySize) { + super.allocateBuffers(arraySize); + expands.incrementAndGet(); + } + }; + + // Add some elements. + final int max = rarely() ? 
0 : randomIntBetween(0, 250); + for (int i = 0; i < max; i++) { + map.put(cast(i), value0); + } + + final int additions = randomIntBetween(max, max + 5000); + map.ensureCapacity(additions + map.size()); + final int before = expands.get(); + for (int i = 0; i < additions; i++) { + map.put(cast(i), value0); + } + assertEquals(before, expands.get()); + } + + @Test + public void testCursorIndexIsValid() { + map.put(keyE, value1); + map.put(key1, value2); + map.put(key2, value3); + + for (IntLongHashMap.IntLongCursor c : map) { + assertTrue(map.indexExists(c.index)); + assertEquals(c.value, map.indexGet(c.index)); + } + } + + @Test + public void testIndexMethods() { + map.put(keyE, value1); + map.put(key1, value2); + + assertTrue(map.indexOf(keyE) >= 0); + assertTrue(map.indexOf(key1) >= 0); + assertTrue(map.indexOf(key2) < 0); + + assertTrue(map.indexExists(map.indexOf(keyE))); + assertTrue(map.indexExists(map.indexOf(key1))); + assertFalse(map.indexExists(map.indexOf(key2))); + + assertEquals(value1, map.indexGet(map.indexOf(keyE))); + assertEquals(value2, map.indexGet(map.indexOf(key1))); + + expectThrows( + AssertionError.class, + () -> { + map.indexGet(map.indexOf(key2)); + }); + + assertEquals(value1, map.indexReplace(map.indexOf(keyE), value3)); + assertEquals(value2, map.indexReplace(map.indexOf(key1), value4)); + assertEquals(value3, map.indexGet(map.indexOf(keyE))); + assertEquals(value4, map.indexGet(map.indexOf(key1))); + + map.indexInsert(map.indexOf(key2), key2, value1); + assertEquals(value1, map.indexGet(map.indexOf(key2))); + assertEquals(3, map.size()); + + assertEquals(value3, map.indexRemove(map.indexOf(keyE))); + assertEquals(2, map.size()); + assertEquals(value1, map.indexRemove(map.indexOf(key2))); + assertEquals(1, map.size()); + assertTrue(map.indexOf(keyE) < 0); + assertTrue(map.indexOf(key1) >= 0); + assertTrue(map.indexOf(key2) < 0); + } + + /* */ + @Test + public void testCloningConstructor() { + map.put(key1, value1); + map.put(key2, value2); + map.put(key3, value3); + + assertSameMap(map, new IntLongHashMap(map)); + } + + /* */ + @Test + public void testFromArrays() { + map.put(key1, value1); + map.put(key2, value2); + map.put(key3, value3); + + IntLongHashMap map2 = + IntLongHashMap.from(newArray(key1, key2, key3), newvArray(value1, value2, value3)); + + assertSameMap(map, map2); + } + + @Test + public void testGetOrDefault() { + map.put(key2, value2); + assertTrue(map.containsKey(key2)); + + map.put(key1, value1); + assertEquals(value1, map.getOrDefault(key1, value3)); + assertEquals(value3, map.getOrDefault(key3, value3)); + map.remove(key1); + assertEquals(value3, map.getOrDefault(key1, value3)); + } + + /* */ + @Test + public void testPut() { + map.put(key1, value1); + + assertTrue(map.containsKey(key1)); + assertEquals(value1, map.get(key1)); + + map.put(key2, 0L); + + assertEquals(2, map.size()); + assertTrue(map.containsKey(key2)); + assertEquals(0L, map.get(key2)); + } + + /* */ + @Test + public void testPutOverExistingKey() { + map.put(key1, value1); + assertEquals(value1, map.put(key1, value3)); + assertEquals(value3, map.get(key1)); + assertEquals(1, map.size()); + + assertEquals(value3, map.put(key1, 0L)); + assertTrue(map.containsKey(key1)); + assertEquals(0L, map.get(key1)); + + assertEquals(0L, map.put(key1, value1)); + assertEquals(value1, map.get(key1)); + assertEquals(1, map.size()); + } + + /* */ + @Test + public void testPutWithExpansions() { + final int COUNT = 10000; + final Random rnd = new Random(random().nextLong()); + final HashSet 
values = new HashSet(); + + for (int i = 0; i < COUNT; i++) { + final int v = rnd.nextInt(); + final boolean hadKey = values.contains(cast(v)); + values.add(cast(v)); + + assertEquals(hadKey, map.containsKey(cast(v))); + map.put(cast(v), vcast(v)); + assertEquals(values.size(), map.size()); + } + assertEquals(values.size(), map.size()); + } + + /* */ + @Test + public void testPutAll() { + map.put(key1, value1); + map.put(key2, value1); + + IntLongHashMap map2 = newInstance(); + + map2.put(key2, value2); + map2.put(keyE, value1); + + // One new key (keyE). + assertEquals(1, map.putAll(map2)); + + // Assert the value under key2 has been replaced. + assertEquals(value2, map.get(key2)); + + // And key3 has been added. + assertEquals(value1, map.get(keyE)); + assertEquals(3, map.size()); + } + + /* */ + @Test + public void testPutIfAbsent() { + assertTrue(map.putIfAbsent(key1, value1)); + assertFalse(map.putIfAbsent(key1, value2)); + assertEquals(value1, map.get(key1)); + } + + @Test + public void testPutOrAdd() { + assertEquals(value1, map.putOrAdd(key1, value1, value2)); + assertEquals(value3, map.putOrAdd(key1, value1, value2)); + } + + @Test + public void testAddTo() { + assertEquals(value1, map.addTo(key1, value1)); + assertEquals(value3, map.addTo(key1, value2)); + } + + /* */ + @Test + public void testRemove() { + map.put(key1, value1); + assertEquals(value1, map.remove(key1)); + assertEquals(0L, map.remove(key1)); + assertEquals(0, map.size()); + + // These are internals, but perhaps worth asserting too. + assertEquals(0, map.assigned); + } + + /* */ + @Test + public void testEmptyKey() { + final int empty = 0; + + map.put(empty, value1); + assertEquals(1, map.size()); + assertEquals(false, map.isEmpty()); + assertEquals(value1, map.get(empty)); + assertEquals(value1, map.getOrDefault(empty, value2)); + assertEquals(true, map.iterator().hasNext()); + assertEquals(empty, map.iterator().next().key); + assertEquals(value1, map.iterator().next().value); + + assertEquals(1, map.keys().size()); + assertEquals(empty, map.keys().iterator().next().value); + assertEquals(value1, map.values().iterator().next().value); + + assertEquals(value1, map.put(empty, 0L)); + assertEquals(1, map.size()); + assertTrue(map.containsKey(empty)); + assertEquals(0L, map.get(empty)); + + map.remove(empty); + assertEquals(0L, map.get(empty)); + assertEquals(0, map.size()); + + assertEquals(0L, map.put(empty, value1)); + assertEquals(value1, map.put(empty, value2)); + map.clear(); + assertFalse(map.indexExists(map.indexOf(empty))); + assertEquals(0L, map.put(empty, value1)); + map.clear(); + assertEquals(0L, map.remove(empty)); + } + + /* */ + @Test + public void testMapKeySet() { + map.put(key1, value3); + map.put(key2, value2); + map.put(key3, value1); + + assertSortedListEquals(map.keys().toArray(), key1, key2, key3); + } + + /* */ + @Test + public void testMapKeySetIterator() { + map.put(key1, value3); + map.put(key2, value2); + map.put(key3, value1); + + int counted = 0; + for (IntCursor c : map.keys()) { + assertEquals(map.keys[c.index], c.value); + counted++; + } + assertEquals(counted, map.size()); + } + + /* */ + @Test + public void testClear() { + map.put(key1, value1); + map.put(key2, value1); + map.clear(); + assertEquals(0, map.size()); + + // These are internals, but perhaps worth asserting too. + assertEquals(0, map.assigned); + + // Check values are cleared. 
+ assertEquals(0L, map.put(key1, value1)); + assertEquals(0L, map.remove(key2)); + map.clear(); + + // Check if the map behaves properly upon subsequent use. + testPutWithExpansions(); + } + + /* */ + @Test + public void testRelease() { + map.put(key1, value1); + map.put(key2, value1); + map.release(); + assertEquals(0, map.size()); + + // These are internals, but perhaps worth asserting too. + assertEquals(0, map.assigned); + + // Check if the map behaves properly upon subsequent use. + testPutWithExpansions(); + } + + /* */ + @Test + public void testIterable() { + map.put(key1, value1); + map.put(key2, value2); + map.put(key3, value3); + map.remove(key2); + + int count = 0; + for (IntLongHashMap.IntLongCursor cursor : map) { + count++; + assertTrue(map.containsKey(cursor.key)); + assertEquals(cursor.value, map.get(cursor.key)); + + assertEquals(cursor.value, map.values[cursor.index]); + assertEquals(cursor.key, map.keys[cursor.index]); + } + assertEquals(count, map.size()); + + map.clear(); + assertFalse(map.iterator().hasNext()); + } + + /* */ + @Test + public void testBug_HPPC73_FullCapacityGet() { + final AtomicInteger reallocations = new AtomicInteger(); + final int elements = 0x7F; + map = + new IntLongHashMap(elements, 1f) { + @Override + protected double verifyLoadFactor(double loadFactor) { + // Skip load factor sanity range checking. + return loadFactor; + } + + @Override + protected void allocateBuffers(int arraySize) { + super.allocateBuffers(arraySize); + reallocations.incrementAndGet(); + } + }; + + int reallocationsBefore = reallocations.get(); + assertEquals(reallocationsBefore, 1); + for (int i = 1; i <= elements; i++) { + map.put(cast(i), value1); + } + + // Non-existent key. + int outOfSet = cast(elements + 1); + map.remove(outOfSet); + assertFalse(map.containsKey(outOfSet)); + assertEquals(reallocationsBefore, reallocations.get()); + + // Should not expand because we're replacing an existing element. + map.put(k1, value2); + assertEquals(reallocationsBefore, reallocations.get()); + + // Remove from a full map. + map.remove(k1); + assertEquals(reallocationsBefore, reallocations.get()); + map.put(k1, value2); + + // Check expand on "last slot of a full map" condition. + map.put(outOfSet, value1); + assertEquals(reallocationsBefore + 1, reallocations.get()); + } + + @Test + public void testHashCodeEquals() { + IntLongHashMap l0 = newInstance(); + assertEquals(0, l0.hashCode()); + assertEquals(l0, newInstance()); + + IntLongHashMap l1 = + IntLongHashMap.from(newArray(key1, key2, key3), newvArray(value1, value2, value3)); + + IntLongHashMap l2 = + IntLongHashMap.from(newArray(key2, key1, key3), newvArray(value2, value1, value3)); + + IntLongHashMap l3 = IntLongHashMap.from(newArray(key1, key2), newvArray(value2, value1)); + + assertEquals(l1.hashCode(), l2.hashCode()); + assertEquals(l1, l2); + + assertFalse(l1.equals(l3)); + assertFalse(l2.equals(l3)); + } + + @Test + public void testBug_HPPC37() { + IntLongHashMap l1 = IntLongHashMap.from(newArray(key1), newvArray(value1)); + + IntLongHashMap l2 = IntLongHashMap.from(newArray(key2), newvArray(value1)); + + assertFalse(l1.equals(l2)); + assertFalse(l2.equals(l1)); + } + + @Test + public void testEmptyValue() { + assertEquals(0L, map.put(key1, 0L)); + assertEquals(0L, map.get(key1)); + assertTrue(map.containsKey(key1)); + map.remove(key1); + assertFalse(map.containsKey(key1)); + assertEquals(0, map.size()); + } + + /** Runs random insertions/deletions/clearing and compares the results against {@link HashMap}. 
*/ + @Test + @SuppressWarnings({"rawtypes", "unchecked"}) + public void testAgainstHashMap() { + final Random rnd = RandomizedTest.getRandom(); + final HashMap other = new HashMap(); + + for (int size = 1000; size < 20000; size += 4000) { + other.clear(); + map.clear(); + + for (int round = 0; round < size * 20; round++) { + int key = cast(rnd.nextInt(size)); + if (rnd.nextInt(50) == 0) { + key = 0; + } + long value = vcast(rnd.nextInt()); + + boolean hadOldValue = map.containsKey(key); + if (rnd.nextBoolean()) { + long previousValue; + if (rnd.nextBoolean()) { + int index = map.indexOf(key); + if (map.indexExists(index)) { + previousValue = map.indexReplace(index, value); + } else { + map.indexInsert(index, key, value); + previousValue = 0L; + } + } else { + previousValue = map.put(key, value); + } + assertEquals( + other.put(key, value), ((previousValue) == 0) && !hadOldValue ? null : previousValue); + + assertEquals(value, map.get(key)); + assertEquals(value, map.indexGet(map.indexOf(key))); + assertTrue(map.containsKey(key)); + assertTrue(map.indexExists(map.indexOf(key))); + } else { + assertEquals(other.containsKey(key), map.containsKey(key)); + long previousValue = + map.containsKey(key) && rnd.nextBoolean() + ? map.indexRemove(map.indexOf(key)) + : map.remove(key); + assertEquals( + other.remove(key), ((previousValue) == 0) && !hadOldValue ? null : previousValue); + } + + assertEquals(other.size(), map.size()); + } + } + } + + /* + * + */ + @Test + public void testClone() { + this.map.put(key1, value1); + this.map.put(key2, value2); + this.map.put(key3, value3); + + IntLongHashMap cloned = map.clone(); + cloned.remove(key1); + + assertSortedListEquals(map.keys().toArray(), key1, key2, key3); + assertSortedListEquals(cloned.keys().toArray(), key2, key3); + } + + /* */ + @Test + public void testMapValues() { + map.put(key1, value3); + map.put(key2, value2); + map.put(key3, value1); + assertSortedListEquals(map.values().toArray(), value1, value2, value3); + + map.clear(); + map.put(key1, value1); + map.put(key2, value2); + map.put(key3, value2); + assertSortedListEquals(map.values().toArray(), value1, value2, value2); + } + + /* */ + @Test + public void testMapValuesIterator() { + map.put(key1, value3); + map.put(key2, value2); + map.put(key3, value1); + + int counted = 0; + for (LongCursor c : map.values()) { + assertEquals(map.values[c.index], c.value); + counted++; + } + assertEquals(counted, map.size()); + } + + /* */ + @Test + public void testEqualsSameClass() { + IntLongHashMap l1 = newInstance(); + l1.put(k1, value0); + l1.put(k2, value1); + l1.put(k3, value2); + + IntLongHashMap l2 = new IntLongHashMap(l1); + l2.putAll(l1); + + IntLongHashMap l3 = new IntLongHashMap(l2); + l3.putAll(l2); + l3.put(k4, value0); + + assertEquals(l1, l2); + assertEquals(l1.hashCode(), l2.hashCode()); + assertNotEquals(l1, l3); + } + + /* */ + @Test + public void testEqualsSubClass() { + class Sub extends IntLongHashMap {} + ; + + IntLongHashMap l1 = newInstance(); + l1.put(k1, value0); + l1.put(k2, value1); + l1.put(k3, value2); + + IntLongHashMap l2 = new Sub(); + l2.putAll(l1); + l2.put(k4, value3); + + IntLongHashMap l3 = new Sub(); + l3.putAll(l2); + + assertNotEquals(l1, l2); + assertEquals(l2.hashCode(), l3.hashCode()); + assertEquals(l2, l3); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntObjectHashMap.java b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntObjectHashMap.java index 6c6c0872ede5..4144300ba558 100644 --- 
a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntObjectHashMap.java +++ b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntObjectHashMap.java @@ -18,12 +18,15 @@ package org.apache.lucene.internal.hppc; import com.carrotsearch.randomizedtesting.RandomizedTest; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.tests.util.LuceneTestCase; +import org.junit.Assert; import org.junit.Test; /** @@ -66,10 +69,8 @@ private static void assertSortedListEquals(int[] array, int... elements) { } /** Check if the array's content is identical to a given sequence of elements. */ - private static void assertSortedListEquals(Object[] array, Object... elements) { - assertEquals(elements.length, array.length); - Arrays.sort(array); - assertArrayEquals(elements, array); + private static void assertSortedListEquals(List array, Object... elements) { + Assert.assertEquals(Arrays.asList(elements), array.stream().sorted().toList()); } private final int value0 = vcast(0); @@ -584,13 +585,21 @@ public void testMapValues() { map.put(key1, value3); map.put(key2, value2); map.put(key3, value1); - assertSortedListEquals(map.values().toArray(), value1, value2, value3); + assertSortedListEquals(toList(map.values()), value1, value2, value3); map.clear(); map.put(key1, value1); map.put(key2, value2); map.put(key3, value2); - assertSortedListEquals(map.values().toArray(), value1, value2, value2); + assertSortedListEquals(toList(map.values()), value1, value2, value2); + } + + static List toList(Iterable> values) { + ArrayList list = new ArrayList<>(); + for (var c : values) { + list.add(c.value); + } + return list; } /* */ diff --git a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestLongObjectHashMap.java b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestLongObjectHashMap.java index f5d6176e24b3..df66561197d1 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestLongObjectHashMap.java +++ b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestLongObjectHashMap.java @@ -17,6 +17,8 @@ package org.apache.lucene.internal.hppc; +import static org.apache.lucene.internal.hppc.TestIntObjectHashMap.toList; + import com.carrotsearch.randomizedtesting.RandomizedTest; import java.util.Arrays; import java.util.HashMap; @@ -24,6 +26,8 @@ import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.tests.util.LuceneTestCase; +import org.hamcrest.MatcherAssert; +import org.hamcrest.Matchers; import org.junit.Test; /** @@ -65,13 +69,6 @@ private static void assertSortedListEquals(long[] array, long... elements) { assertArrayEquals(elements, array); } - /** Check if the array's content is identical to a given sequence of elements. */ - private static void assertSortedListEquals(Object[] array, Object... 
elements) { - assertEquals(elements.length, array.length); - Arrays.sort(array); - assertArrayEquals(elements, array); - } - private final int value0 = vcast(0); private final int value1 = vcast(1); private final int value2 = vcast(2); @@ -585,13 +582,15 @@ public void testMapValues() { map.put(key1, value3); map.put(key2, value2); map.put(key3, value1); - assertSortedListEquals(map.values().toArray(), value1, value2, value3); + MatcherAssert.assertThat( + toList(map.values()), Matchers.containsInAnyOrder(value1, value2, value3)); map.clear(); map.put(key1, value1); map.put(key2, value2); map.put(key3, value2); - assertSortedListEquals(map.values().toArray(), value1, value2, value2); + MatcherAssert.assertThat( + toList(map.values()), Matchers.containsInAnyOrder(value1, value2, value2)); } /* */ diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestPostingDecodingUtil.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestPostingDecodingUtil.java new file mode 100644 index 000000000000..6c914dfcc032 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestPostingDecodingUtil.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import org.apache.lucene.codecs.lucene101.ForUtil; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestPostingDecodingUtil extends LuceneTestCase { + + public void testDuelSplitInts() throws Exception { + final int iterations = atLeast(100); + + try (Directory dir = new MMapDirectory(createTempDir())) { + try (IndexOutput out = dir.createOutput("tests.bin", IOContext.DEFAULT)) { + out.writeInt(random().nextInt()); + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + out.writeLong(random().nextInt()); + } + } + VectorizationProvider vectorizationProvider = VectorizationProvider.lookup(true); + try (IndexInput in = dir.openInput("tests.bin", IOContext.DEFAULT)) { + int[] expectedB = new int[ForUtil.BLOCK_SIZE]; + int[] expectedC = new int[ForUtil.BLOCK_SIZE]; + int[] actualB = new int[ForUtil.BLOCK_SIZE]; + int[] actualC = new int[ForUtil.BLOCK_SIZE]; + for (int iter = 0; iter < iterations; ++iter) { + // Initialize arrays with random content. 
+ for (int i = 0; i < expectedB.length; ++i) { + expectedB[i] = random().nextInt(); + actualB[i] = expectedB[i]; + expectedC[i] = random().nextInt(); + actualC[i] = expectedC[i]; + } + int bShift = TestUtil.nextInt(random(), 1, 31); + int dec = TestUtil.nextInt(random(), 1, bShift); + int numIters = (bShift + dec - 1) / dec; + int count = TestUtil.nextInt(random(), 1, 64 / numIters); + int bMask = random().nextInt(); + int cIndex = random().nextInt(64); + int cMask = random().nextInt(); + long startFP = random().nextInt(4); + + // Work on a slice that has just the right number of bytes to make the test fail with an + // index-out-of-bounds in case the implementation reads more than the allowed number of + // padding bytes. + IndexInput slice = in.slice("test", 0, startFP + count * Long.BYTES); + + PostingDecodingUtil defaultUtil = new PostingDecodingUtil(slice); + PostingDecodingUtil optimizedUtil = vectorizationProvider.newPostingDecodingUtil(slice); + + slice.seek(startFP); + defaultUtil.splitInts(count, expectedB, bShift, dec, bMask, expectedC, cIndex, cMask); + long expectedEndFP = slice.getFilePointer(); + slice.seek(startFP); + optimizedUtil.splitInts(count, actualB, bShift, dec, bMask, actualC, cIndex, cMask); + assertEquals(expectedEndFP, slice.getFilePointer()); + assertArrayEquals(expectedB, actualB); + assertArrayEquals(expectedC, actualC); + } + } + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index ce2ad6854a2f..bc3b6813a5be 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.internal.vectorization; +import static java.util.Locale.ROOT; import static org.apache.lucene.index.VectorSimilarityFunction.COSINE; import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; @@ -24,6 +25,8 @@ import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Arrays; import java.util.List; import java.util.Objects; @@ -39,6 +42,8 @@ import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -47,7 +52,6 @@ import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.NamedThreadFactory; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.junit.BeforeClass; @@ -269,16 +273,8 @@ public void testCopiesAcrossThreads() throws Exception { } // A callable that scores the given ord and scorer and asserts the expected result. 
- static class AssertingScoreCallable implements Callable> { - final RandomVectorScorer scorer; - final int ord; - final float expectedScore; - - AssertingScoreCallable(RandomVectorScorer scorer, int ord, float expectedScore) { - this.scorer = scorer; - this.ord = ord; - this.expectedScore = expectedScore; - } + record AssertingScoreCallable(RandomVectorScorer scorer, int ord, float expectedScore) + implements Callable> { @Override public Optional call() throws Exception { @@ -337,12 +333,63 @@ public void testLarge() throws IOException { } } - RandomAccessVectorValues vectorValues( - int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { + // Tests that the FlatVectorsScorer handles float vectors correctly. + public void testWithFloatValues() throws IOException { + try (Directory dir = new MMapDirectory(createTempDir("testWithFloatValues"))) { + final String fileName = "floatvalues"; + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + var vec = floatToByteArray(1f); // single vector, with one dimension + out.writeBytes(vec, 0, vec.length); + } + + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { + for (int times = 0; times < TIMES; times++) { + for (var sim : List.of(COSINE, EUCLIDEAN, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT)) { + var vectorValues = floatVectorValues(1, 1, in, sim); + assert vectorValues.getEncoding().byteSize == 4; + + var supplier1 = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + var supplier2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + // these assertion assumes that the supplier and scorer's toString will have float + // in it, since it's based on float vectors. + assertTrue(supplier1.toString().toLowerCase(ROOT).contains("float")); + assertTrue(supplier2.toString().toLowerCase(ROOT).contains("float")); + assertTrue(supplier1.scorer(0).toString().toLowerCase(ROOT).contains("float")); + assertTrue(supplier2.scorer(0).toString().toLowerCase(ROOT).contains("float")); + float expected = supplier1.scorer(0).score(0); + assertEquals(supplier2.scorer(0).score(0), expected, DELTA); + + var scorer1 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, new float[] {1f}); + var scorer2 = MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, new float[] {1f}); + assertTrue(scorer1.toString().toLowerCase(ROOT).contains("float")); + assertTrue(scorer2.toString().toLowerCase(ROOT).contains("float")); + expected = scorer1.score(0); + assertEquals(scorer2.score(0), expected, DELTA); + + expectThrows( + Throwable.class, + () -> DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, new byte[] {1})); + expectThrows( + Throwable.class, + () -> MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, new byte[] {1})); + } + } + } + } + } + + KnnVectorValues vectorValues(int dims, int size, IndexInput in, VectorSimilarityFunction sim) + throws IOException { return new OffHeapByteVectorValues.DenseOffHeapVectorValues( dims, size, in.slice("byteValues", 0, in.length()), dims, MEMSEG_SCORER, sim); } + KnnVectorValues floatVectorValues(int dims, int size, IndexInput in, VectorSimilarityFunction sim) + throws IOException { + return new OffHeapFloatVectorValues.DenseOffHeapVectorValues( + dims, size, in.slice("floatValues", 0, in.length()), dims, MEMSEG_SCORER, sim); + } + // creates the vector based on the given ordinal, which is reproducible given the ord and dims static byte[] vector(int ord, int dims) { var random = new Random(Objects.hash(ord, dims)); @@ -363,6 +410,11 @@ 
static byte[] concat(byte[]... arrays) throws IOException { } } + /** Converts a float value to a byte array. */ + public static byte[] floatToByteArray(float value) { + return ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN).putFloat(value).array(); + } + static int randomIntBetween(int minInclusive, int maxInclusive) { return RandomNumbers.randomIntBetween(random(), minInclusive, maxInclusive); } diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index 7520003ab4f9..4abd955eeb66 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -38,6 +38,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.QueryTimeout; @@ -49,7 +50,6 @@ import org.apache.lucene.search.knn.TopKnnCollectorManager; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; -import org.apache.lucene.tests.codecs.asserting.AssertingCodec; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.store.BaseDirectoryWrapper; import org.apache.lucene.tests.util.LuceneTestCase; @@ -216,7 +216,7 @@ public void testSimpleFilter() throws IOException { Query filter = new TermQuery(new Term("id", "id2")); Query kvq = getKnnVectorQuery("field", new float[] {0, 0}, 10, filter); TopDocs topDocs = searcher.search(kvq, 3); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertIdMatches(reader, "id2", topDocs.scoreDocs[0]); } } @@ -230,7 +230,7 @@ public void testFilterWithNoVectorMatches() throws IOException { Query filter = new TermQuery(new Term("other", "value")); Query kvq = getKnnVectorQuery("field", new float[] {0, 0}, 10, filter); TopDocs topDocs = searcher.search(kvq, 3); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } } @@ -510,7 +510,7 @@ public void testRandom() throws IOException { // test that assert reader.hasDeletions() == false; assertEquals(expected, results.scoreDocs.length); - assertTrue(results.totalHits.value >= results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); // verify the results are in descending score order float last = Float.MAX_VALUE; for (ScoreDoc scoreDoc : results.scoreDocs) { @@ -554,8 +554,8 @@ public void testRandomWithFilter() throws IOException { TopDocs results = searcher.search( getKnnVectorQuery("field", randomVector(dimension), 10, filter1), numDocs); - assertEquals(9, results.totalHits.value); - assertEquals(results.totalHits.value, results.scoreDocs.length); + assertEquals(9, results.totalHits.value()); + assertEquals(results.totalHits.value(), results.scoreDocs.length); expectThrows( UnsupportedOperationException.class, () -> @@ -568,8 +568,8 @@ public void testRandomWithFilter() throws IOException { results = searcher.search( getKnnVectorQuery("field", randomVector(dimension), 5, filter2), numDocs); - assertEquals(5, results.totalHits.value); - assertEquals(results.totalHits.value, results.scoreDocs.length); + assertEquals(5, results.totalHits.value()); + assertEquals(results.totalHits.value(), 
results.scoreDocs.length); expectThrows( UnsupportedOperationException.class, () -> @@ -584,8 +584,8 @@ public void testRandomWithFilter() throws IOException { getThrowingKnnVectorQuery("field", randomVector(dimension), 5, filter3), numDocs, new Sort(new SortField("tag", SortField.Type.INT))); - assertEquals(5, results.totalHits.value); - assertEquals(results.totalHits.value, results.scoreDocs.length); + assertEquals(5, results.totalHits.value()); + assertEquals(results.totalHits.value(), results.scoreDocs.length); for (ScoreDoc scoreDoc : results.scoreDocs) { FieldDoc fieldDoc = (FieldDoc) scoreDoc; @@ -740,7 +740,7 @@ public void testMergeAwayAllValues() throws IOException { LeafReader leafReader = getOnlyLeafReader(reader); FieldInfo fi = leafReader.getFieldInfos().fieldInfo("field"); assertNotNull(fi); - DocIdSetIterator vectorValues; + KnnVectorValues vectorValues; switch (fi.getVectorEncoding()) { case BYTE: vectorValues = leafReader.getByteVectorValues("field"); @@ -752,7 +752,7 @@ public void testMergeAwayAllValues() throws IOException { throw new AssertionError(); } assertNotNull(vectorValues); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + assertEquals(NO_MORE_DOCS, vectorValues.iterator().nextDoc()); } } } @@ -838,7 +838,7 @@ public void testTimeLimitingKnnCollectorManager() throws IOException { // Check that results are complete TopDocs noTimeoutTopDocs = noTimeoutCollector.topDocs(); - assertEquals(TotalHits.Relation.EQUAL_TO, noTimeoutTopDocs.totalHits.relation); + assertEquals(TotalHits.Relation.EQUAL_TO, noTimeoutTopDocs.totalHits.relation()); assertEquals(1, noTimeoutTopDocs.scoreDocs.length); // A collector manager that immediately times out @@ -854,7 +854,8 @@ public void testTimeLimitingKnnCollectorManager() throws IOException { // Check that partial results are returned TopDocs timeoutTopDocs = timeoutCollector.topDocs(); - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, timeoutTopDocs.totalHits.relation); + assertEquals( + TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, timeoutTopDocs.totalHits.relation()); assertEquals(1, timeoutTopDocs.scoreDocs.length); } } @@ -1082,13 +1083,7 @@ public void testSameFieldDifferentFormats() throws IOException { IndexWriterConfig iwc = newIndexWriterConfig(mockAnalyzer); KnnVectorsFormat format1 = randomVectorFormat(VectorEncoding.FLOAT32); KnnVectorsFormat format2 = randomVectorFormat(VectorEncoding.FLOAT32); - iwc.setCodec( - new AssertingCodec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return format1; - } - }); + iwc.setCodec(TestUtil.alwaysKnnVectorsFormat(format1)); try (IndexWriter iwriter = new IndexWriter(directory, iwc)) { Document doc = new Document(); @@ -1102,13 +1097,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } iwc = newIndexWriterConfig(mockAnalyzer); - iwc.setCodec( - new AssertingCodec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return format2; - } - }); + iwc.setCodec(TestUtil.alwaysKnnVectorsFormat(format2)); try (IndexWriter iwriter = new IndexWriter(directory, iwc)) { Document doc = new Document(); diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseVectorSimilarityQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseVectorSimilarityQueryTestCase.java index 1e32c07b6659..c3623d9c28d6 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseVectorSimilarityQueryTestCase.java +++ 
b/lucene/core/src/test/org/apache/lucene/search/BaseVectorSimilarityQueryTestCase.java @@ -41,7 +41,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.tests.util.hnsw.HnswTestUtil; +import org.apache.lucene.util.hnsw.HnswUtil; @LuceneTestCase.SuppressCodecs("SimpleText") abstract class BaseVectorSimilarityQueryTestCase< @@ -135,7 +135,7 @@ public void testExtremes() throws IOException { try (Directory indexStore = getIndexStore(getRandomVectors(numDocs, dim)); IndexReader reader = DirectoryReader.open(indexStore)) { IndexSearcher searcher = newSearcher(reader); - assumeTrue("graph is disconnected", HnswTestUtil.graphIsConnected(reader, vectorField)); + assumeTrue("graph is disconnected", HnswUtil.graphIsRooted(reader, vectorField)); // All vectors are above -Infinity Query query1 = @@ -171,7 +171,7 @@ public void testRandomFilter() throws IOException { try (Directory indexStore = getIndexStore(getRandomVectors(numDocs, dim)); IndexReader reader = DirectoryReader.open(indexStore)) { - assumeTrue("graph is disconnected", HnswTestUtil.graphIsConnected(reader, vectorField)); + assumeTrue("graph is disconnected", HnswUtil.graphIsRooted(reader, vectorField)); IndexSearcher searcher = newSearcher(reader); Query query = @@ -296,7 +296,7 @@ public void testSomeDeletes() throws IOException { w.commit(); try (IndexReader reader = DirectoryReader.open(indexStore)) { - assumeTrue("graph is disconnected", HnswTestUtil.graphIsConnected(reader, vectorField)); + assumeTrue("graph is disconnected", HnswUtil.graphIsRooted(reader, vectorField)); IndexSearcher searcher = newSearcher(reader); Query query = diff --git a/lucene/core/src/test/org/apache/lucene/search/IntArrayDocIdSet.java b/lucene/core/src/test/org/apache/lucene/search/IntArrayDocIdSet.java new file mode 100644 index 000000000000..fa4ddece1e4d --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/IntArrayDocIdSet.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.util.ArrayUtil; + +class IntArrayDocIdSet extends DocIdSet { + private final int[] docs; + private final int length; + + IntArrayDocIdSet(int[] docs, int length) { + if (docs[length] != DocIdSetIterator.NO_MORE_DOCS) { + throw new IllegalArgumentException(); + } + this.docs = docs; + assert assertArraySorted(docs, length) + : "IntArrayDocIdSet need docs to be sorted" + + Arrays.toString(ArrayUtil.copyOfSubArray(docs, 0, length)); + this.length = length; + } + + private static boolean assertArraySorted(int[] docs, int length) { + for (int i = 1; i < length; i++) { + if (docs[i] < docs[i - 1]) { + return false; + } + } + return true; + } + + @Override + public DocIdSetIterator iterator() throws IOException { + return new DocIdSetIterator() { + int i = 0; + int doc = -1; + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() { + return doc = docs[i++]; + } + + @Override + public int advance(int target) { + int bound = 1; + // given that we use this for small arrays only, this is very unlikely to overflow + while (i + bound < length && docs[i + bound] < target) { + bound *= 2; + } + i = Arrays.binarySearch(docs, i + bound / 2, Math.min(i + bound + 1, length), target); + if (i < 0) { + i = -1 - i; + } + return doc = docs[i++]; + } + + @Override + public long cost() { + return length; + } + }; + } + + @Override + public long ramBytesUsed() { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java index 23168dbd04ad..4b10562de261 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java @@ -87,7 +87,7 @@ private long automatonQueryNrHits(AutomatonQuery query) throws IOException { if (VERBOSE) { System.out.println("TEST: run aq=" + query); } - return searcher.search(query, 5).totalHits.value; + return searcher.search(query, 5).totalHits.value(); } private void assertAutomatonHits(int expected, Automaton automaton) throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java index 284d6e1382b8..0c4b3b52afc3 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java @@ -93,7 +93,7 @@ private Term newTerm(String value) { } private long automatonQueryNrHits(AutomatonQuery query) throws IOException { - return searcher.search(query, 5).totalHits.value; + return searcher.search(query, 5).totalHits.value(); } private void assertAutomatonHits(int expected, Automaton automaton) throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBlendedTermQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestBlendedTermQuery.java index 9f584b879b7e..a538363f4a21 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBlendedTermQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBlendedTermQuery.java @@ -98,7 +98,7 @@ public void testBlendedScores() throws IOException { .build(); TopDocs topDocs = searcher.search(query, 20); - assertEquals(11, topDocs.totalHits.value); + assertEquals(11, topDocs.totalHits.value()); // 
All docs must have the same score for (int i = 0; i < topDocs.scoreDocs.length; ++i) { assertEquals(topDocs.scoreDocs[0].score, topDocs.scoreDocs[i].score, 0.0f); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBlockMaxConjunction.java b/lucene/core/src/test/org/apache/lucene/search/TestBlockMaxConjunction.java index 35812ab7775c..7e70279f57d1 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBlockMaxConjunction.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBlockMaxConjunction.java @@ -66,7 +66,10 @@ public void testRandom() throws IOException { } IndexReader reader = DirectoryReader.open(w); w.close(); - IndexSearcher searcher = newSearcher(reader); + // Disable search concurrency for this test: it requires a single segment, and no intra-segment + // concurrency for its assertions to always be valid + IndexSearcher searcher = + newSearcher(reader, random().nextBoolean(), random().nextBoolean(), false); for (int iter = 0; iter < 100; ++iter) { int start = random().nextInt(10); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java index b0634e56da42..70fb8077011e 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java @@ -255,7 +255,7 @@ public void queriesTest(Query query, int[] expDocNrs) throws Exception { CheckHits.checkHitsQuery(query, hits1, hits2, expDocNrs); // sanity check expected num matches in bigSearcher - assertEquals(mulFactor * topDocs.totalHits.value, bigSearcher.count(query)); + assertEquals(mulFactor * topDocs.totalHits.value(), bigSearcher.count(query)); // now check 2 diff scorers from the bigSearcher as well collectorManager = new TopScoreDocCollectorManager(topDocsToCheck, Integer.MAX_VALUE); @@ -399,16 +399,12 @@ public void testRandomQueries() throws Exception { q3.add(q1, BooleanClause.Occur.SHOULD); q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD); assertEquals( - mulFactor * topDocs.totalHits.value + NUM_EXTRA_DOCS / 2, + mulFactor * topDocs.totalHits.value() + NUM_EXTRA_DOCS / 2, bigSearcher.count(q3.build())); // test diff (randomized) scorers produce the same results on bigSearcher as well - hits1 = - bigSearcher.search(q1, new TopFieldCollectorManager(sort, 1000 * mulFactor, 1)) - .scoreDocs; - hits2 = - bigSearcher.search(q1, new TopFieldCollectorManager(sort, 1000 * mulFactor, 1)) - .scoreDocs; + hits1 = bigSearcher.search(q1, new TopFieldCollectorManager(sort, mulFactor, 1)).scoreDocs; + hits2 = bigSearcher.search(q1, new TopFieldCollectorManager(sort, mulFactor, 1)).scoreDocs; CheckHits.checkEqual(q1, hits1, hits2); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java index 4faa78f4bd9f..cafe06f3403a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java @@ -362,7 +362,7 @@ public void postCreate(BooleanQuery.Builder q) { private void assertSubsetOfSameScores(Query q, TopDocs top1, TopDocs top2) { // The constrained query // should be a subset to the unconstrained query. 
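A recurring change in the test updates above and below is that TopDocs.totalHits is no longer read through public fields (totalHits.value, totalHits.relation) but through accessor methods (totalHits.value(), totalHits.relation()), and Scorable.ChildScorable is likewise read through child() and relationship(). The short standalone sketch below mirrors that accessor-style API so the mechanical assertion changes are easy to follow; the record definition is an illustrative assumption for the sketch only, not the actual Lucene source.

    enum Relation { EQUAL_TO, GREATER_THAN_OR_EQUAL_TO }

    // Hypothetical stand-in for the hit-count carrier: components are exposed as
    // accessor methods rather than public fields, which is why every
    // `totalHits.value` in these tests becomes `totalHits.value()`.
    record TotalHits(long value, Relation relation) {}

    class TotalHitsAccessorDemo {
      public static void main(String[] args) {
        TotalHits totalHits = new TotalHits(11, Relation.EQUAL_TO);
        System.out.println(totalHits.value());    // 11, read via accessor, not field access
        System.out.println(totalHits.relation()); // EQUAL_TO
      }
    }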
- if (top2.totalHits.value > top1.totalHits.value) { + if (top2.totalHits.value() > top1.totalHits.value()) { fail( "Constrained results not a subset:\n" + CheckHits.topdocsString(top1, 0, 0) @@ -371,12 +371,12 @@ private void assertSubsetOfSameScores(Query q, TopDocs top1, TopDocs top2) { + q.toString()); } - for (int hit = 0; hit < top2.totalHits.value; hit++) { + for (int hit = 0; hit < top2.totalHits.value(); hit++) { int id = top2.scoreDocs[hit].doc; float score = top2.scoreDocs[hit].score; boolean found = false; // find this doc in other hits - for (int other = 0; other < top1.totalHits.value; other++) { + for (int other = 0; other < top1.totalHits.value(); other++) { if (top1.scoreDocs[other].doc == id) { found = true; float otherScore = top1.scoreDocs[other].score; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanOr.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanOr.java index b54dbeff9ef6..6e84993902d0 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanOr.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanOr.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.search; -import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -33,7 +32,7 @@ import org.apache.lucene.tests.search.QueryUtils; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; -import org.apache.lucene.util.Bits; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.FixedBitSet; public class TestBooleanOr extends LuceneTestCase { @@ -52,7 +51,7 @@ public class TestBooleanOr extends LuceneTestCase { private long search(Query q) throws IOException { QueryUtils.check(random(), q, searcher); - return searcher.search(q, 1000).totalHits.value; + return searcher.search(q, 1000).totalHits.value(); } public void testElements() throws IOException { @@ -205,34 +204,30 @@ public ScoreMode scoreMode() { dir.close(); } - private static BulkScorer scorer(int... matches) { - return new BulkScorer() { - final Score scorer = new Score(); - int i = 0; + private static Scorer scorer(int... 
matches) throws IOException { + matches = ArrayUtil.growExact(matches, matches.length + 1); + matches[matches.length - 1] = DocIdSetIterator.NO_MORE_DOCS; + DocIdSetIterator it = new IntArrayDocIdSet(matches, matches.length - 1).iterator(); + return new Scorer() { @Override - public int score(LeafCollector collector, Bits acceptDocs, int min, int max) - throws IOException { - collector.setScorer(scorer); - while (i < matches.length && matches[i] < min) { - i += 1; - } - while (i < matches.length && matches[i] < max) { - int doc = matches[i]; - if (acceptDocs == null || acceptDocs.get(doc)) { - collector.collect(doc); - } - i += 1; - } - if (i == matches.length) { - return DocIdSetIterator.NO_MORE_DOCS; - } - return RandomNumbers.randomIntBetween(random(), max, matches[i]); + public DocIdSetIterator iterator() { + return it; + } + + @Override + public int docID() { + return it.docID(); + } + + @Override + public float getMaxScore(int upTo) throws IOException { + return Float.MAX_VALUE; } @Override - public long cost() { - return matches.length; + public float score() throws IOException { + return 0; } }; } @@ -240,7 +235,7 @@ public long cost() { // Make sure that BooleanScorer keeps working even if the sub clauses return // next matching docs which are less than the actual next match public void testSubScorerNextIsNotMatch() throws IOException { - final List optionalScorers = + final List optionalScorers = Arrays.asList( scorer(100000, 1000001, 9999999), scorer(4000, 1000051), @@ -259,7 +254,9 @@ public void collect(int doc) throws IOException { matches.add(doc); } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); assertEquals(Arrays.asList(4000, 5000, 100000, 1000001, 1000051, 9999998, 9999999), matches); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java index d81e98beac9b..3c9fa764ba4d 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java @@ -226,7 +226,7 @@ public void testNullOrSubScorer() throws Throwable { // PhraseQuery w/ no terms added returns a null scorer PhraseQuery pq = new PhraseQuery("field", new String[0]); q.add(pq, BooleanClause.Occur.SHOULD); - assertEquals(1, s.search(q.build(), 10).totalHits.value); + assertEquals(1, s.search(q.build(), 10).totalHits.value()); // A required clause which returns null scorer should return null scorer to // IndexSearcher. 
@@ -234,11 +234,11 @@ public void testNullOrSubScorer() throws Throwable { pq = new PhraseQuery("field", new String[0]); q.add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD); q.add(pq, BooleanClause.Occur.MUST); - assertEquals(0, s.search(q.build(), 10).totalHits.value); + assertEquals(0, s.search(q.build(), 10).totalHits.value()); DisjunctionMaxQuery dmq = new DisjunctionMaxQuery(Arrays.asList(new TermQuery(new Term("field", "a")), pq), 1.0f); - assertEquals(1, s.search(dmq, 10).totalHits.value); + assertEquals(1, s.search(dmq, 10).totalHits.value()); r.close(); w.close(); @@ -273,13 +273,13 @@ public void testDeMorgan() throws Exception { MultiReader multireader = new MultiReader(reader1, reader2); IndexSearcher searcher = newSearcher(multireader); - assertEquals(0, searcher.search(query.build(), 10).totalHits.value); + assertEquals(0, searcher.search(query.build(), 10).totalHits.value()); final ExecutorService es = Executors.newCachedThreadPool(new NamedThreadFactory("NRT search threads")); searcher = new IndexSearcher(multireader, es); if (VERBOSE) System.out.println("rewritten form: " + searcher.rewrite(query.build())); - assertEquals(0, searcher.search(query.build(), 10).totalHits.value); + assertEquals(0, searcher.search(query.build(), 10).totalHits.value()); es.shutdown(); es.awaitTermination(1, TimeUnit.SECONDS); @@ -419,7 +419,7 @@ public void testMinShouldMatchLeniency() throws Exception { // No doc can match: BQ has only 2 clauses and we are asking for minShouldMatch=4 bq.setMinimumNumberShouldMatch(4); - assertEquals(0, s.search(bq.build(), 1).totalHits.value); + assertEquals(0, s.search(bq.build(), 1).totalHits.value()); r.close(); w.close(); dir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQueryVisitSubscorers.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQueryVisitSubscorers.java index b291a96c8024..9a2c177c62d5 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQueryVisitSubscorers.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQueryVisitSubscorers.java @@ -29,14 +29,13 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.TextField; -import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.similarities.ClassicSimilarity; -import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.RawTFSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.RandomIndexWriter; @@ -75,7 +74,7 @@ public void setUp() throws Exception { searcher = newSearcher(reader, true, false); searcher.setSimilarity(new ClassicSimilarity()); scorerSearcher = new ScorerIndexSearcher(reader); - scorerSearcher.setSimilarity(new CountingSimilarity()); + scorerSearcher.setSimilarity(new RawTFSimilarity()); } @Override @@ -156,7 +155,7 @@ static class MyCollector extends FilterCollector { private final Set tqsSet = new HashSet<>(); MyCollector() { - super(new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE, false).newCollector()); + super(new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE).newCollector()); } @Override @@ -190,7 
+189,7 @@ private void fillLeaves(Scorable scorer, Set set) throws IOException { set.add((Scorer) scorer); } else { for (Scorable.ChildScorable child : scorer.getChildren()) { - fillLeaves(child.child, set); + fillLeaves(child.child(), set); } } } @@ -330,8 +329,8 @@ private static void summarizeScorer( final StringBuilder builder, final Scorable scorer, final int indent) throws IOException { builder.append(scorer.getClass().getSimpleName()); for (final Scorable.ChildScorable childScorer : scorer.getChildren()) { - indent(builder, indent + 1).append(childScorer.relationship).append(" "); - summarizeScorer(builder, childScorer.child, indent + 2); + indent(builder, indent + 1).append(childScorer.relationship()).append(" "); + summarizeScorer(builder, childScorer.child(), indent + 2); } } @@ -345,24 +344,4 @@ private static StringBuilder indent(final StringBuilder builder, final int inden return builder; } } - - // Similarity that just returns the frequency as the score - private static class CountingSimilarity extends Similarity { - - @Override - public long computeNorm(FieldInvertState state) { - return 1; - } - - @Override - public SimScorer scorer( - float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { - return new SimScorer() { - @Override - public float score(float freq, long norm) { - return freq; - } - }; - } - } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java index 15f0c0d0c947..f36c7539c7dc 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java @@ -330,8 +330,8 @@ public void testDeeplyNestedBooleanRewriteShouldClauses() throws IOException { int depth = TestUtil.nextInt(random(), 10, 30); TestRewriteQuery rewriteQueryExpected = new TestRewriteQuery(); TestRewriteQuery rewriteQuery = new TestRewriteQuery(); - Query expectedQuery = - new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER).build(); + BooleanQuery.Builder expectedQueryBuilder = + new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER); Query deepBuilder = new BooleanQuery.Builder() .add(rewriteQuery, Occur.SHOULD) @@ -345,21 +345,19 @@ public void testDeeplyNestedBooleanRewriteShouldClauses() throws IOException { .add(tq, Occur.SHOULD) .add(deepBuilder, Occur.SHOULD); deepBuilder = bq.build(); - BooleanQuery.Builder expectedBq = new BooleanQuery.Builder().add(tq, Occur.FILTER); + expectedQueryBuilder.add(tq, Occur.FILTER); if (i == depth) { - expectedBq.add(rewriteQuery, Occur.FILTER); - } else { - expectedBq.add(expectedQuery, Occur.FILTER); + expectedQueryBuilder.add(rewriteQuery, Occur.FILTER); } - expectedQuery = expectedBq.build(); } BooleanQuery bq = new BooleanQuery.Builder().add(deepBuilder, Occur.FILTER).build(); - expectedQuery = new BoostQuery(new ConstantScoreQuery(expectedQuery), 0.0f); + Query expectedQuery = + new BoostQuery(new ConstantScoreQuery(expectedQueryBuilder.build()), 0.0f); Query rewritten = searcher.rewrite(bq); assertEquals(expectedQuery, rewritten); // the SHOULD clauses cause more rewrites because they incrementally change to `MUST` and then - // `FILTER` - assertEquals("Depth=" + depth, depth + 1, rewriteQuery.numRewrites); + // `FILTER`, plus the flattening of required clauses + assertEquals("Depth=" + depth, depth * 2, rewriteQuery.numRewrites); } public void testDeeplyNestedBooleanRewrite() throws IOException { @@ -369,27 
+367,26 @@ public void testDeeplyNestedBooleanRewrite() throws IOException { int depth = TestUtil.nextInt(random(), 10, 30); TestRewriteQuery rewriteQueryExpected = new TestRewriteQuery(); TestRewriteQuery rewriteQuery = new TestRewriteQuery(); - Query expectedQuery = - new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER).build(); + BooleanQuery.Builder expectedQueryBuilder = + new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER); Query deepBuilder = new BooleanQuery.Builder().add(rewriteQuery, Occur.MUST).build(); for (int i = depth; i > 0; i--) { TermQuery tq = termQueryFunction.apply(i); BooleanQuery.Builder bq = new BooleanQuery.Builder().add(tq, Occur.MUST).add(deepBuilder, Occur.MUST); deepBuilder = bq.build(); - BooleanQuery.Builder expectedBq = new BooleanQuery.Builder().add(tq, Occur.FILTER); + expectedQueryBuilder.add(tq, Occur.FILTER); if (i == depth) { - expectedBq.add(rewriteQuery, Occur.FILTER); - } else { - expectedBq.add(expectedQuery, Occur.FILTER); + expectedQueryBuilder.add(rewriteQuery, Occur.FILTER); } - expectedQuery = expectedBq.build(); } BooleanQuery bq = new BooleanQuery.Builder().add(deepBuilder, Occur.FILTER).build(); - expectedQuery = new BoostQuery(new ConstantScoreQuery(expectedQuery), 0.0f); + Query expectedQuery = + new BoostQuery(new ConstantScoreQuery(expectedQueryBuilder.build()), 0.0f); Query rewritten = searcher.rewrite(bq); assertEquals(expectedQuery, rewritten); - assertEquals("Depth=" + depth, 1, rewriteQuery.numRewrites); + // `depth` rewrites because of the flattening + assertEquals("Depth=" + depth, depth, rewriteQuery.numRewrites); } public void testRemoveMatchAllFilter() throws IOException { @@ -541,7 +538,7 @@ private Query randomQuery(Random random) { } private void assertEquals(TopDocs td1, TopDocs td2) { - assertEquals(td1.totalHits.value, td2.totalHits.value); + assertEquals(td1.totalHits.value(), td2.totalHits.value()); assertEquals(td1.scoreDocs.length, td2.scoreDocs.length); Map expectedScores = Arrays.stream(td1.scoreDocs).collect(Collectors.toMap(sd -> sd.doc, sd -> sd.score)); @@ -691,6 +688,110 @@ public void testFlattenInnerDisjunctions() throws IOException { assertSame(query, searcher.rewrite(query)); } + public void testFlattenInnerConjunctions() throws IOException { + IndexSearcher searcher = newSearcher(new MultiReader()); + + Query inner = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST) + .build(); + Query query = + new BooleanQuery.Builder() + .add(inner, Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.FILTER) + .build(); + Query expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.FILTER) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + query = + new BooleanQuery.Builder() + .setMinimumNumberShouldMatch(0) + .add(inner, Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .setMinimumNumberShouldMatch(0) + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + query = + new BooleanQuery.Builder() + .add(inner, Occur.MUST) + 
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST_NOT) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST_NOT) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + inner = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.FILTER) + .build(); + query = + new BooleanQuery.Builder() + .add(inner, Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.FILTER) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + inner = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.FILTER) + .build(); + query = + new BooleanQuery.Builder() + .add(inner, Occur.FILTER) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.FILTER) + .add(new TermQuery(new Term("foo", "quux")), Occur.FILTER) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + inner = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST_NOT) + .build(); + query = + new BooleanQuery.Builder() + .add(inner, Occur.FILTER) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.FILTER) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST_NOT) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + } + public void testDiscardShouldClauses() throws IOException { IndexSearcher searcher = newSearcher(new MultiReader()); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestCollectorManager.java b/lucene/core/src/test/org/apache/lucene/search/TestCollectorManager.java index 3c7360f7d584..1e5826160f61 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestCollectorManager.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestCollectorManager.java @@ -122,13 +122,8 @@ private static SortedSet generateDocIds(int count, Random random) { return generated; } - private static final class CompositeCollectorManager + private record CompositeCollectorManager(List> predicates) implements CollectorManager> { - private final List> predicates; - - CompositeCollectorManager(List> predicates) { - this.predicates = predicates; - } @Override public Collector newCollector() throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestConjunctions.java b/lucene/core/src/test/org/apache/lucene/search/TestConjunctions.java index 675aa959b665..8e1d31b4d76a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestConjunctions.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestConjunctions.java @@ -29,12 +29,11 @@ import org.apache.lucene.document.StringField; import 
org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; -import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.RawTFSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.RandomIndexWriter; @@ -67,7 +66,7 @@ public void setUp() throws Exception { reader = writer.getReader(); writer.close(); searcher = newSearcher(reader); - searcher.setSimilarity(new TFSimilarity()); + searcher.setSimilarity(new RawTFSimilarity()); } static Document doc(String v1, String v2) { @@ -82,7 +81,7 @@ public void testTermConjunctionsWithOmitTF() throws Exception { bq.add(new TermQuery(new Term(F1, "nutch")), BooleanClause.Occur.MUST); bq.add(new TermQuery(new Term(F2, "is")), BooleanClause.Occur.MUST); TopDocs td = searcher.search(bq.build(), 3); - assertEquals(1, td.totalHits.value); + assertEquals(1, td.totalHits.value()); assertEquals(3F, td.scoreDocs[0].score, 0.001F); // f1:nutch + f2:is + f2:is } @@ -93,26 +92,6 @@ public void tearDown() throws Exception { super.tearDown(); } - // Similarity that returns the TF as score - private static class TFSimilarity extends Similarity { - - @Override - public long computeNorm(FieldInvertState state) { - return 1; // we dont care - } - - @Override - public SimScorer scorer( - float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { - return new SimScorer() { - @Override - public float score(float freq, long norm) { - return freq; - } - }; - } - } - public void testScorerGetChildren() throws Exception { Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestConstantScoreQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestConstantScoreQuery.java index adef5e004f84..7ca3ee39b396 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestConstantScoreQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestConstantScoreQuery.java @@ -107,7 +107,7 @@ private Scorable rootScorer(Scorable s) { try { Collection children = s.getChildren(); if (children.size() == 0) return s; - s = children.stream().findFirst().get().child; + s = children.stream().findFirst().get().child(); } catch ( @SuppressWarnings("unused") Exception e) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestConstantScoreScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestConstantScoreScorer.java index e10ed44890ec..0882550144c2 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestConstantScoreScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestConstantScoreScorer.java @@ -241,8 +241,8 @@ public void testEarlyTermination() throws IOException { TopScoreDocCollectorManager c = new TopScoreDocCollectorManager(10, 10); TopDocs topDocs = is.search(new ConstantScoreQuery(new TermQuery(new Term("key", "foo"))), c); - assertEquals(11, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); + assertEquals(11, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); c = new TopScoreDocCollectorManager(10, 10); Query 
query = @@ -251,8 +251,8 @@ public void testEarlyTermination() throws IOException { .add(new ConstantScoreQuery(new TermQuery(new Term("key", "bar"))), Occur.FILTER) .build(); topDocs = is.search(query, c); - assertEquals(11, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); + assertEquals(11, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); iw.close(); ir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestControlledRealTimeReopenThread.java b/lucene/core/src/test/org/apache/lucene/search/TestControlledRealTimeReopenThread.java index 1fadd9af22c0..2c40a4e74860 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestControlledRealTimeReopenThread.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestControlledRealTimeReopenThread.java @@ -112,7 +112,7 @@ protected void updateDocuments(Term id, List doc) thro System.out.println(Thread.currentThread().getName() + ": nrt: got noDeletes searcher=" + s); } try { - assertEquals("generation: " + gen, 1, s.search(new TermQuery(id), 10).totalHits.value); + assertEquals("generation: " + gen, 1, s.search(new TermQuery(id), 10).totalHits.value()); } finally { nrtNoDeletes.release(s); } @@ -192,7 +192,7 @@ protected void updateDocument(Term id, Iterable doc) t System.out.println(Thread.currentThread().getName() + ": nrt: got deletes searcher=" + s); } try { - assertEquals("generation: " + gen, 1, s.search(new TermQuery(id), 10).totalHits.value); + assertEquals("generation: " + gen, 1, s.search(new TermQuery(id), 10).totalHits.value()); } finally { nrtDeletes.release(s); } @@ -220,7 +220,7 @@ protected void deleteDocuments(Term id) throws Exception { System.out.println(Thread.currentThread().getName() + ": nrt: got deletes searcher=" + s); } try { - assertEquals(0, s.search(new TermQuery(id), 10).totalHits.value); + assertEquals(0, s.search(new TermQuery(id), 10).totalHits.value()); } finally { nrtDeletes.release(s); } @@ -563,7 +563,7 @@ public void run() { IndexSearcher searcher = sm.acquire(); TopDocs td = searcher.search(new TermQuery(new Term("count", i + "")), 10); sm.release(searcher); - assertEquals(1, td.totalHits.value); + assertEquals(1, td.totalHits.value()); } for (Thread commitThread : commitThreads) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java index 12dc0ffc96b4..1464aa84bd32 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java @@ -25,6 +25,7 @@ import java.util.Collections; import java.util.List; import java.util.Locale; +import java.util.stream.Collectors; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -489,6 +490,27 @@ public void testDisjunctOrderAndEquals() throws Exception { assertEquals(q1, q2); } + /* Inspired from TestIntervals.testIntervalDisjunctionToStringStability */ + public void testToStringOrderMatters() { + final int clauseNbr = + random().nextInt(22) + 4; // ensure a reasonably large minimum number of clauses + final String[] terms = new String[clauseNbr]; + for (int i = 0; i < clauseNbr; i++) { + terms[i] = Character.toString((char) ('a' + i)); + } + + final String expected = + Arrays.stream(terms) + .map((term) -> 
"test:" + term) + .collect(Collectors.joining(" | ", "(", ")~1.0")); + + DisjunctionMaxQuery source = + new DisjunctionMaxQuery( + Arrays.stream(terms).map((term) -> tq("test", term)).toList(), 1.0f); + + assertEquals(expected, source.toString("")); + } + public void testRandomTopDocs() throws Exception { doTestRandomTopDocs(2, 0.05f, 0.05f); doTestRandomTopDocs(2, 1.0f, 0.05f); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDocValuesQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestDocValuesQueries.java index c864fa0d98c4..9c61faad3fbf 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDocValuesQueries.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDocValuesQueries.java @@ -22,6 +22,8 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.LongPoint; @@ -31,6 +33,7 @@ import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; @@ -42,24 +45,80 @@ public class TestDocValuesQueries extends LuceneTestCase { + private Codec getCodec() { + // small interval size to test with many intervals + return TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat(random().nextInt(4, 16))); + } + public void testDuelPointRangeSortedNumericRangeQuery() throws IOException { - doTestDuelPointRangeNumericRangeQuery(true, 1); + doTestDuelPointRangeNumericRangeQuery(true, 1, false); + } + + public void testDuelPointRangeSortedNumericRangeWithSlipperQuery() throws IOException { + doTestDuelPointRangeNumericRangeQuery(true, 1, true); } public void testDuelPointRangeMultivaluedSortedNumericRangeQuery() throws IOException { - doTestDuelPointRangeNumericRangeQuery(true, 3); + doTestDuelPointRangeNumericRangeQuery(true, 3, false); + } + + public void testDuelPointRangeMultivaluedSortedNumericRangeWithSkipperQuery() throws IOException { + doTestDuelPointRangeNumericRangeQuery(true, 3, true); } public void testDuelPointRangeNumericRangeQuery() throws IOException { - doTestDuelPointRangeNumericRangeQuery(false, 1); + doTestDuelPointRangeNumericRangeQuery(false, 1, false); } - private void doTestDuelPointRangeNumericRangeQuery(boolean sortedNumeric, int maxValuesPerDoc) - throws IOException { + public void testDuelPointRangeNumericRangeWithSkipperQuery() throws IOException { + doTestDuelPointRangeNumericRangeQuery(false, 1, true); + } + + public void testDuelPointNumericSortedWithSkipperRangeQuery() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec()); + config.setIndexSort(new Sort(new SortField("dv", SortField.Type.LONG, random().nextBoolean()))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, config); + final int numDocs = atLeast(1000); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + final long value = TestUtil.nextLong(random(), -100, 10000); + doc.add(NumericDocValuesField.indexedField("dv", value)); + doc.add(new LongPoint("idx", value)); + iw.addDocument(doc); + } + + final IndexReader reader = iw.getReader(); + final 
IndexSearcher searcher = newSearcher(reader, false); + iw.close(); + + for (int i = 0; i < 100; ++i) { + final long min = + random().nextBoolean() ? Long.MIN_VALUE : TestUtil.nextLong(random(), -100, 10000); + final long max = + random().nextBoolean() ? Long.MAX_VALUE : TestUtil.nextLong(random(), -100, 10000); + final Query q1 = LongPoint.newRangeQuery("idx", min, max); + final Query q2 = NumericDocValuesField.newSlowRangeQuery("dv", min, max); + assertSameMatches(searcher, q1, q2, false); + } + reader.close(); + dir.close(); + } + + private void doTestDuelPointRangeNumericRangeQuery( + boolean sortedNumeric, int maxValuesPerDoc, boolean skypper) throws IOException { final int iters = atLeast(10); for (int iter = 0; iter < iters; ++iter) { Directory dir = newDirectory(); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir); + RandomIndexWriter iw; + if (sortedNumeric || random().nextBoolean()) { + iw = new RandomIndexWriter(random(), dir); + } else { + IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec()); + config.setIndexSort( + new Sort(new SortField("dv", SortField.Type.LONG, random().nextBoolean()))); + iw = new RandomIndexWriter(random(), dir, config); + } final int numDocs = atLeast(100); for (int i = 0; i < numDocs; ++i) { Document doc = new Document(); @@ -67,9 +126,17 @@ private void doTestDuelPointRangeNumericRangeQuery(boolean sortedNumeric, int ma for (int j = 0; j < numValues; ++j) { final long value = TestUtil.nextLong(random(), -100, 10000); if (sortedNumeric) { - doc.add(new SortedNumericDocValuesField("dv", value)); + if (skypper) { + doc.add(SortedNumericDocValuesField.indexedField("dv", value)); + } else { + doc.add(new SortedNumericDocValuesField("dv", value)); + } } else { - doc.add(new NumericDocValuesField("dv", value)); + if (skypper) { + doc.add(NumericDocValuesField.indexedField("dv", value)); + } else { + doc.add(new NumericDocValuesField("dv", value)); + } } doc.add(new LongPoint("idx", value)); } @@ -102,12 +169,20 @@ private void doTestDuelPointRangeNumericRangeQuery(boolean sortedNumeric, int ma } } - private void doTestDuelPointRangeSortedRangeQuery(boolean sortedSet, int maxValuesPerDoc) - throws IOException { + private void doTestDuelPointRangeSortedRangeQuery( + boolean sortedSet, int maxValuesPerDoc, boolean skypper) throws IOException { final int iters = atLeast(10); for (int iter = 0; iter < iters; ++iter) { Directory dir = newDirectory(); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir); + RandomIndexWriter iw; + if (sortedSet || random().nextBoolean()) { + iw = new RandomIndexWriter(random(), dir); + } else { + IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec()); + config.setIndexSort( + new Sort(new SortField("dv", SortField.Type.STRING, random().nextBoolean()))); + iw = new RandomIndexWriter(random(), dir, config); + } final int numDocs = atLeast(100); for (int i = 0; i < numDocs; ++i) { Document doc = new Document(); @@ -117,9 +192,17 @@ private void doTestDuelPointRangeSortedRangeQuery(boolean sortedSet, int maxValu byte[] encoded = new byte[Long.BYTES]; LongPoint.encodeDimension(value, encoded, 0); if (sortedSet) { - doc.add(new SortedSetDocValuesField("dv", newBytesRef(encoded))); + if (skypper) { + doc.add(SortedSetDocValuesField.indexedField("dv", newBytesRef(encoded))); + } else { + doc.add(new SortedSetDocValuesField("dv", newBytesRef(encoded))); + } } else { - doc.add(new SortedDocValuesField("dv", newBytesRef(encoded))); + if (skypper) { + 
doc.add(SortedDocValuesField.indexedField("dv", newBytesRef(encoded))); + } else { + doc.add(new SortedDocValuesField("dv", newBytesRef(encoded))); + } } doc.add(new LongPoint("idx", value)); } @@ -179,15 +262,79 @@ private void doTestDuelPointRangeSortedRangeQuery(boolean sortedSet, int maxValu } public void testDuelPointRangeSortedSetRangeQuery() throws IOException { - doTestDuelPointRangeSortedRangeQuery(true, 1); + doTestDuelPointRangeSortedRangeQuery(true, 1, false); + } + + public void testDuelPointRangeSortedSetRangeSkipperQuery() throws IOException { + doTestDuelPointRangeSortedRangeQuery(true, 1, true); } public void testDuelPointRangeMultivaluedSortedSetRangeQuery() throws IOException { - doTestDuelPointRangeSortedRangeQuery(true, 3); + doTestDuelPointRangeSortedRangeQuery(true, 3, false); + } + + public void testDuelPointRangeMultivaluedSortedSetRangeSkipperQuery() throws IOException { + doTestDuelPointRangeSortedRangeQuery(true, 3, true); } public void testDuelPointRangeSortedRangeQuery() throws IOException { - doTestDuelPointRangeSortedRangeQuery(false, 1); + doTestDuelPointRangeSortedRangeQuery(false, 1, false); + } + + public void testDuelPointRangeSortedRangeSkipperQuery() throws IOException { + doTestDuelPointRangeSortedRangeQuery(false, 1, true); + } + + public void testDuelPointSortedSetSortedWithSkipperRangeQuery() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec()); + config.setIndexSort( + new Sort(new SortField("dv", SortField.Type.STRING, random().nextBoolean()))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, config); + final int numDocs = atLeast(1000); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + final long value = TestUtil.nextLong(random(), -100, 10000); + byte[] encoded = new byte[Long.BYTES]; + LongPoint.encodeDimension(value, encoded, 0); + doc.add(SortedDocValuesField.indexedField("dv", newBytesRef(encoded))); + doc.add(new LongPoint("idx", value)); + iw.addDocument(doc); + } + + final IndexReader reader = iw.getReader(); + final IndexSearcher searcher = newSearcher(reader, false); + iw.close(); + + for (int i = 0; i < 100; ++i) { + long min = random().nextBoolean() ? Long.MIN_VALUE : TestUtil.nextLong(random(), -100, 10000); + long max = random().nextBoolean() ? Long.MAX_VALUE : TestUtil.nextLong(random(), -100, 10000); + byte[] encodedMin = new byte[Long.BYTES]; + byte[] encodedMax = new byte[Long.BYTES]; + LongPoint.encodeDimension(min, encodedMin, 0); + LongPoint.encodeDimension(max, encodedMax, 0); + boolean includeMin = true; + boolean includeMax = true; + if (random().nextBoolean()) { + includeMin = false; + min++; + } + if (random().nextBoolean()) { + includeMax = false; + max--; + } + final Query q1 = LongPoint.newRangeQuery("idx", min, max); + final Query q2 = + SortedDocValuesField.newSlowRangeQuery( + "dv", + min == Long.MIN_VALUE && random().nextBoolean() ? null : newBytesRef(encodedMin), + max == Long.MAX_VALUE && random().nextBoolean() ? null : newBytesRef(encodedMax), + includeMin, + includeMax); + assertSameMatches(searcher, q1, q2, false); + } + reader.close(); + dir.close(); } private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boolean scores) @@ -195,7 +342,7 @@ private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boole final int maxDoc = searcher.getIndexReader().maxDoc(); final TopDocs td1 = searcher.search(q1, maxDoc, scores ? 
Sort.RELEVANCE : Sort.INDEXORDER); final TopDocs td2 = searcher.search(q2, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); - assertEquals(td1.totalHits.value, td2.totalHits.value); + assertEquals(td1.totalHits.value(), td2.totalHits.value()); for (int i = 0; i < td1.scoreDocs.length; ++i) { assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); if (scores) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDocValuesRangeIterator.java b/lucene/core/src/test/org/apache/lucene/search/TestDocValuesRangeIterator.java new file mode 100644 index 000000000000..f014348739e1 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestDocValuesRangeIterator.java @@ -0,0 +1,332 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestDocValuesRangeIterator extends LuceneTestCase { + + public void testSingleLevel() throws IOException { + doTestBasics(false); + } + + public void testMultipleLevels() throws IOException { + doTestBasics(true); + } + + private void doTestBasics(boolean doLevels) throws IOException { + long queryMin = 10; + long queryMax = 20; + + // Test with both gaps and no-gaps in the ranges: + NumericDocValues values = docValues(queryMin, queryMax); + NumericDocValues values2 = docValues(queryMin, queryMax); + + AtomicBoolean twoPhaseCalled = new AtomicBoolean(); + TwoPhaseIterator twoPhase = twoPhaseIterator(values, queryMin, queryMax, twoPhaseCalled); + AtomicBoolean twoPhaseCalled2 = new AtomicBoolean(); + TwoPhaseIterator twoPhase2 = twoPhaseIterator(values2, queryMin, queryMax, twoPhaseCalled2); + + DocValuesSkipper skipper = docValuesSkipper(queryMin, queryMax, doLevels); + DocValuesSkipper skipper2 = docValuesSkipper(queryMin, queryMax, doLevels); + + DocValuesRangeIterator rangeIterator = + new DocValuesRangeIterator(twoPhase, skipper, queryMin, queryMax, false); + DocValuesRangeIterator rangeIteratorWithGaps = + new DocValuesRangeIterator(twoPhase2, skipper2, queryMin, queryMax, true); + DocValuesRangeIterator.Approximation rangeApproximation = + (DocValuesRangeIterator.Approximation) rangeIterator.approximation(); + DocValuesRangeIterator.Approximation rangeApproximationWithGaps = + (DocValuesRangeIterator.Approximation) rangeIteratorWithGaps.approximation(); + + assertEquals(100, rangeApproximation.advance(100)); + assertEquals(100, rangeApproximationWithGaps.advance(100)); + assertEquals(DocValuesRangeIterator.Match.YES, rangeApproximation.match); + assertEquals(DocValuesRangeIterator.Match.MAYBE, 
rangeApproximationWithGaps.match); + assertEquals(255, rangeApproximation.upTo); + if (doLevels) { + assertEquals(127, rangeApproximationWithGaps.upTo); + } else { + assertEquals(255, rangeApproximationWithGaps.upTo); + } + assertTrue(rangeIterator.matches()); + assertTrue(rangeIteratorWithGaps.matches()); + assertTrue(values.docID() < rangeApproximation.docID()); // we did not advance doc values + assertEquals( + values2.docID(), rangeApproximationWithGaps.docID()); // we _did_ advance doc values + assertFalse(twoPhaseCalled.get()); + assertTrue(twoPhaseCalled2.get()); + twoPhaseCalled2.set(false); + + assertEquals(768, rangeApproximation.advance(300)); + assertEquals(768, rangeApproximationWithGaps.advance(300)); + assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match); + assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximationWithGaps.match); + if (doLevels) { + assertEquals(831, rangeApproximation.upTo); + assertEquals(831, rangeApproximationWithGaps.upTo); + } else { + assertEquals(1023, rangeApproximation.upTo); + assertEquals(1023, rangeApproximationWithGaps.upTo); + } + for (int i = 0; i < 10; ++i) { + assertEquals(values.docID(), rangeApproximation.docID()); + assertEquals(values2.docID(), rangeApproximationWithGaps.docID()); + assertEquals(twoPhase.matches(), rangeIterator.matches()); + assertEquals(twoPhase2.matches(), rangeIteratorWithGaps.matches()); + assertTrue(twoPhaseCalled.get()); + assertTrue(twoPhaseCalled2.get()); + twoPhaseCalled.set(false); + twoPhaseCalled2.set(false); + rangeApproximation.nextDoc(); + rangeApproximationWithGaps.nextDoc(); + } + + assertEquals(1100, rangeApproximation.advance(1099)); + assertEquals(1100, rangeApproximationWithGaps.advance(1099)); + assertEquals(DocValuesRangeIterator.Match.IF_DOC_HAS_VALUE, rangeApproximation.match); + assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximationWithGaps.match); + assertEquals(1024 + 256 - 1, rangeApproximation.upTo); + if (doLevels) { + assertEquals(1024 + 128 - 1, rangeApproximationWithGaps.upTo); + } else { + assertEquals(1024 + 256 - 1, rangeApproximationWithGaps.upTo); + } + assertEquals(values.docID(), rangeApproximation.docID()); + assertEquals(values2.docID(), rangeApproximationWithGaps.docID()); + assertTrue(rangeIterator.matches()); + assertTrue(rangeIteratorWithGaps.matches()); + assertFalse(twoPhaseCalled.get()); + assertTrue(twoPhaseCalled2.get()); + twoPhaseCalled2.set(false); + + assertEquals(1024 + 768, rangeApproximation.advance(1024 + 300)); + assertEquals(1024 + 768, rangeApproximationWithGaps.advance(1024 + 300)); + assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match); + assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximationWithGaps.match); + if (doLevels) { + assertEquals(1024 + 831, rangeApproximation.upTo); + assertEquals(1024 + 831, rangeApproximationWithGaps.upTo); + } else { + assertEquals(2047, rangeApproximation.upTo); + assertEquals(2047, rangeApproximationWithGaps.upTo); + } + for (int i = 0; i < 10; ++i) { + assertEquals(values.docID(), rangeApproximation.docID()); + assertEquals(values2.docID(), rangeApproximationWithGaps.docID()); + assertEquals(twoPhase.matches(), rangeIterator.matches()); + assertEquals(twoPhase2.matches(), rangeIteratorWithGaps.matches()); + assertTrue(twoPhaseCalled.get()); + assertTrue(twoPhaseCalled2.get()); + twoPhaseCalled.set(false); + twoPhaseCalled2.set(false); + rangeApproximation.nextDoc(); + rangeApproximationWithGaps.nextDoc(); + } + + 
assertEquals(DocIdSetIterator.NO_MORE_DOCS, rangeApproximation.advance(2048)); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, rangeApproximationWithGaps.advance(2048)); + } + + // Fake numeric doc values so that: + // docs 0-256 all match + // docs in 256-512 are all greater than queryMax + // docs in 512-768 are all less than queryMin + // docs in 768-1024 have some docs that match the range, others not + // docs in 1024-2048 follow a similar pattern as docs in 0-1024 except that not all docs have a + // value + private static NumericDocValues docValues(long queryMin, long queryMax) { + return new NumericDocValues() { + + int doc = -1; + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target < 1024) { + // dense up to 1024 + return doc = target; + } else if (doc < 2047) { + // 50% docs have a value up to 2048 + return doc = target + (target & 1); + } else { + return doc = DocIdSetIterator.NO_MORE_DOCS; + } + } + + @Override + public long longValue() throws IOException { + int d = doc % 1024; + if (d < 128) { + return (queryMin + queryMax) >> 1; + } else if (d < 256) { + return queryMax + 1; + } else if (d < 512) { + return queryMin - 1; + } else { + return switch ((d / 2) % 3) { + case 0 -> queryMin - 1; + case 1 -> queryMax + 1; + case 2 -> (queryMin + queryMax) >> 1; + default -> throw new AssertionError(); + }; + } + } + + @Override + public long cost() { + return 42; + } + }; + } + + private static TwoPhaseIterator twoPhaseIterator( + NumericDocValues values, long queryMin, long queryMax, AtomicBoolean twoPhaseCalled) { + return new TwoPhaseIterator(values) { + + @Override + public boolean matches() throws IOException { + twoPhaseCalled.set(true); + long v = values.longValue(); + return v >= queryMin && v <= queryMax; + } + + @Override + public float matchCost() { + return 2f; // 2 comparisons + } + }; + } + + private static DocValuesSkipper docValuesSkipper(long queryMin, long queryMax, boolean doLevels) { + return new DocValuesSkipper() { + + int doc = -1; + + @Override + public void advance(int target) throws IOException { + doc = target; + } + + @Override + public int numLevels() { + return doLevels ? 
3 : 1; + } + + @Override + public int minDocID(int level) { + int rangeLog = 9 - numLevels() + level; + + // the level is the log2 of the interval + if (doc < 0) { + return -1; + } else if (doc >= 2048) { + return DocIdSetIterator.NO_MORE_DOCS; + } else { + int mask = (1 << rangeLog) - 1; + // prior multiple of 2^level + return doc & ~mask; + } + } + + @Override + public int maxDocID(int level) { + int rangeLog = 9 - numLevels() + level; + + int minDocID = minDocID(level); + return switch (minDocID) { + case -1 -> -1; + case DocIdSetIterator.NO_MORE_DOCS -> DocIdSetIterator.NO_MORE_DOCS; + default -> minDocID + (1 << rangeLog) - 1; + }; + } + + @Override + public long minValue(int level) { + int d = doc % 1024; + if (d < 128) { + return queryMin; + } else if (d < 256) { + return queryMax + 1; + } else if (d < 768) { + return queryMin - 1; + } else { + return queryMin - 1; + } + } + + @Override + public long maxValue(int level) { + int d = doc % 1024; + if (d < 128) { + return queryMax; + } else if (d < 256) { + return queryMax + 1; + } else if (d < 768) { + return queryMin - 1; + } else { + return queryMax + 1; + } + } + + @Override + public int docCount(int level) { + int rangeLog = 9 - numLevels() + level; + + if (doc < 1024) { + return 1 << rangeLog; + } else { + // half docs have a value + return 1 << rangeLog >> 1; + } + } + + @Override + public long minValue() { + return Long.MIN_VALUE; + } + + @Override + public long maxValue() { + return Long.MAX_VALUE; + } + + @Override + public int docCount() { + return 1024 + 1024 / 2; + } + }; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDocValuesRewriteMethod.java b/lucene/core/src/test/org/apache/lucene/search/TestDocValuesRewriteMethod.java index a87ca4681956..69b4c8e617ba 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDocValuesRewriteMethod.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDocValuesRewriteMethod.java @@ -41,8 +41,7 @@ /** Tests the DocValuesRewriteMethod */ public class TestDocValuesRewriteMethod extends LuceneTestCase { - protected IndexSearcher searcher1; - protected IndexSearcher searcher2; + protected IndexSearcher searcher; private IndexReader reader; private Directory dir; protected String fieldName; @@ -69,6 +68,7 @@ public void setUp() throws Exception { String s = TestUtil.randomUnicodeString(random()); doc.add(newStringField(fieldName, s, Field.Store.NO)); doc.add(new SortedSetDocValuesField(fieldName, new BytesRef(s))); + doc.add(SortedSetDocValuesField.indexedField(fieldName + "_with-skip", new BytesRef(s))); terms.add(s); } writer.addDocument(doc); @@ -89,8 +89,7 @@ public void setUp() throws Exception { } reader = writer.getReader(); - searcher1 = newSearcher(reader); - searcher2 = newSearcher(reader); + searcher = newSearcher(reader); writer.close(); } @@ -123,12 +122,22 @@ protected void assertSame(String regexp) throws IOException { name -> null, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, new DocValuesRewriteMethod()); + RegexpQuery docValuesWithSkip = + new RegexpQuery( + new Term(fieldName + "_with-skip", regexp), + RegExp.NONE, + 0, + name -> null, + Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, + new DocValuesRewriteMethod()); RegexpQuery inverted = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE); - TopDocs invertedDocs = searcher1.search(inverted, 25); - TopDocs docValuesDocs = searcher2.search(docValues, 25); + TopDocs invertedDocs = searcher.search(inverted, 25); + TopDocs docValuesDocs = searcher.search(docValues, 25); + TopDocs 
docValuesWithSkipDocs = searcher.search(docValuesWithSkip, 25); CheckHits.checkEqual(inverted, invertedDocs.scoreDocs, docValuesDocs.scoreDocs); + CheckHits.checkEqual(inverted, invertedDocs.scoreDocs, docValuesWithSkipDocs.scoreDocs); } public void testEquals() throws Exception { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSource.java b/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSource.java index 0c30c6baf861..ccdb060047d2 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSource.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSource.java @@ -225,7 +225,7 @@ void checkSorts(Query query, Sort sort) throws Exception { CheckHits.checkEqual(query, expected.scoreDocs, actual.scoreDocs); - if (size < actual.totalHits.value) { + if (size < actual.totalHits.value()) { expected = searcher.searchAfter(expected.scoreDocs[size - 1], query, size, sort); actual = searcher.searchAfter(actual.scoreDocs[size - 1], query, size, mutatedSort); CheckHits.checkEqual(query, expected.scoreDocs, actual.scoreDocs); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java b/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java index 46ab7ad420b5..dab1e1651136 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java @@ -98,7 +98,7 @@ private void runTest(boolean reversed) throws Throwable { TopDocs topDocs = searcher.search( - newq.build(), new TopFieldCollectorManager(sort, 50, null, Integer.MAX_VALUE, true)); + newq.build(), new TopFieldCollectorManager(sort, 50, null, Integer.MAX_VALUE)); int nDocsReturned = topDocs.scoreDocs.length; assertEquals(4, nDocsReturned); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFieldExistsQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFieldExistsQuery.java index 9edb586da718..75b438fac4b0 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFieldExistsQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFieldExistsQuery.java @@ -804,7 +804,7 @@ private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boole final int maxDoc = searcher.getIndexReader().maxDoc(); final TopDocs td1 = searcher.search(q1, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); final TopDocs td2 = searcher.search(q2, maxDoc, scores ? 
Sort.RELEVANCE : Sort.INDEXORDER); - assertEquals(td1.totalHits.value, td2.totalHits.value); + assertEquals(td1.totalHits.value(), td2.totalHits.value()); for (int i = 0; i < td1.scoreDocs.length; ++i) { assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); if (scores) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java index 6515417b3a8c..70d491ddb8bd 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java @@ -422,7 +422,7 @@ public void testTieBreaker() throws Exception { IndexSearcher searcher = newSearcher(mr); FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false); TopDocs docs = searcher.search(fq, 2); - assertEquals(5, docs.totalHits.value); // 5 docs, from the a and b's + assertEquals(5, docs.totalHits.value()); // 5 docs, from the a and b's mr.close(); ir1.close(); ir2.close(); @@ -705,14 +705,7 @@ public void testRandom() throws Exception { IOUtils.close(r, dir); } - private static class TermAndScore implements Comparable { - final String term; - final float score; - - public TermAndScore(String term, float score) { - this.term = term; - this.score = score; - } + private record TermAndScore(String term, float score) implements Comparable { @Override public int compareTo(TermAndScore other) { @@ -725,11 +718,6 @@ public int compareTo(TermAndScore other) { return term.compareTo(other.term); } } - - @Override - public String toString() { - return term + " score=" + score; - } } // Poached from LuceneLevenshteinDistance.java (from suggest module): it supports transpositions diff --git a/lucene/core/src/test/org/apache/lucene/search/TestIndexSearcher.java b/lucene/core/src/test/org/apache/lucene/search/TestIndexSearcher.java index 724013abac71..3fc4ec4bf728 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestIndexSearcher.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestIndexSearcher.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Random; import java.util.concurrent.ExecutorService; @@ -166,7 +167,7 @@ public void testCount() throws IOException { .add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) .add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) .build())) { - assertEquals(searcher.count(query), searcher.search(query, 1).totalHits.value); + assertEquals(searcher.count(query), searcher.search(query, 1).totalHits.value()); } reader.close(); } @@ -246,7 +247,7 @@ public void testGetSlices() throws Exception { // without executor IndexSearcher.LeafSlice[] slices = new IndexSearcher(r).getSlices(); assertEquals(1, slices.length); - assertEquals(r.leaves().size(), slices[0].leaves.length); + assertEquals(r.leaves().size(), slices[0].partitions.length); } { // force creation of multiple slices, and provide an executor @@ -254,12 +255,12 @@ public void testGetSlices() throws Exception { new IndexSearcher(r, Runnable::run) { @Override protected LeafSlice[] slices(List leaves) { - return slices(leaves, 1, 1); + return slices(leaves, 1, 1, false); } }; IndexSearcher.LeafSlice[] slices = searcher.getSlices(); for (IndexSearcher.LeafSlice slice : slices) { - assertEquals(1, slice.leaves.length); + assertEquals(1, slice.partitions.length); } assertEquals(r.leaves().size(), slices.length); } @@ -280,7 +281,10 @@ public void 
testSlicesOffloadedToTheExecutor() throws IOException { protected LeafSlice[] slices(List leaves) { ArrayList slices = new ArrayList<>(); for (LeafReaderContext ctx : leaves) { - slices.add(new LeafSlice(Arrays.asList(ctx))); + slices.add( + new LeafSlice( + Collections.singletonList( + LeafReaderContextPartition.createForEntireSegment(ctx)))); } return slices.toArray(new LeafSlice[0]); } @@ -293,4 +297,32 @@ public void testNullExecutorNonNullTaskExecutor() { IndexSearcher indexSearcher = new IndexSearcher(reader); assertNotNull(indexSearcher.getTaskExecutor()); } + + public void testSegmentPartitionsSameSlice() { + IndexSearcher indexSearcher = + new IndexSearcher(reader, Runnable::run) { + @Override + protected LeafSlice[] slices(List leaves) { + List slices = new ArrayList<>(); + for (LeafReaderContext ctx : leaves) { + slices.add( + new LeafSlice( + new ArrayList<>( + List.of( + LeafReaderContextPartition.createFromAndTo(ctx, 0, 1), + LeafReaderContextPartition.createFromAndTo( + ctx, 1, ctx.reader().maxDoc()))))); + } + return slices.toArray(new LeafSlice[0]); + } + }; + + assumeTrue( + "Needs at least 2 docs in the same segment", + indexSearcher.leafContexts.stream().allMatch(ctx -> ctx.reader().maxDoc() > 1)); + IllegalStateException e = expectThrows(IllegalStateException.class, indexSearcher::getSlices); + assertEquals( + "The same slice targets multiple leaf partitions of the same leaf reader context. A physical segment should rather get partitioned to be searched concurrently from as many slices as the number of leaf partitions it is split into.", + e.getMessage()); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestIndexSortSortedNumericDocValuesRangeQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestIndexSortSortedNumericDocValuesRangeQuery.java index ff258610ae9f..a807a88c8ec0 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestIndexSortSortedNumericDocValuesRangeQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestIndexSortSortedNumericDocValuesRangeQuery.java @@ -99,7 +99,7 @@ private static void assertSameHits(IndexSearcher searcher, Query q1, Query q2, b final int maxDoc = searcher.getIndexReader().maxDoc(); final TopDocs td1 = searcher.search(q1, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); final TopDocs td2 = searcher.search(q2, maxDoc, scores ? 
Sort.RELEVANCE : Sort.INDEXORDER); - assertEquals(td1.totalHits.value, td2.totalHits.value); + assertEquals(td1.totalHits.value(), td2.totalHits.value()); for (int i = 0; i < td1.scoreDocs.length; ++i) { assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); if (scores) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java index 4dc3d385b087..b45d6e8fb641 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java @@ -23,6 +23,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.util.TestVectorUtil; @@ -78,6 +79,11 @@ public void testToString() throws IOException { assertEquals("KnnByteVectorQuery:field[0,...][10]", query.toString("ignored")); assertDocScoreQueryToString(query.rewrite(newSearcher(reader))); + + // test with filter + Query filter = new TermQuery(new Term("id", "text")); + query = getKnnVectorQuery("field", new float[] {0, 1}, 10, filter); + assertEquals("KnnByteVectorQuery:field[0,...][10][id:text]", query.toString("ignored")); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java index f2e5a3e274ab..feebe858c099 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java @@ -35,6 +35,7 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; @@ -77,6 +78,11 @@ public void testToString() throws IOException { assertEquals("KnnFloatVectorQuery:field[0.0,...][10]", query.toString("ignored")); assertDocScoreQueryToString(query.rewrite(newSearcher(reader))); + + // test with filter + Query filter = new TermQuery(new Term("id", "text")); + query = getKnnVectorQuery("field", new float[] {0.0f, 1.0f}, 10, filter); + assertEquals("KnnFloatVectorQuery:field[0.0,...][10][id:text]", query.toString("ignored")); } } @@ -223,8 +229,8 @@ public void testDocAndScoreQueryBasics() throws IOException { docs, scores, maxScore, segments, indexReader.getContext().id()); final Weight w = query.createWeight(searcher, ScoreMode.TOP_SCORES, 1.0f); TopDocs topDocs = searcher.search(query, 100); - assertEquals(scoreDocs.length, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation); + assertEquals(scoreDocs.length, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation()); Arrays.sort(topDocs.scoreDocs, Comparator.comparingInt(scoreDoc -> scoreDoc.doc)); assertEquals(scoreDocs.length, topDocs.scoreDocs.length); for (int i = 0; i < scoreDocs.length; i++) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestLiveFieldValues.java b/lucene/core/src/test/org/apache/lucene/search/TestLiveFieldValues.java index dfbc94f4b2aa..907b454ea27c 100644 --- 
a/lucene/core/src/test/org/apache/lucene/search/TestLiveFieldValues.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestLiveFieldValues.java @@ -64,8 +64,8 @@ public IndexSearcher newSearcher(IndexReader r, IndexReader previous) { protected Integer lookupFromSearcher(IndexSearcher s, String id) throws IOException { TermQuery tq = new TermQuery(new Term("id", id)); TopDocs hits = s.search(tq, 1); - assertTrue(hits.totalHits.value <= 1); - if (hits.totalHits.value == 0) { + assertTrue(hits.totalHits.value() <= 1); + if (hits.totalHits.value() == 0) { return null; } else { Document doc = s.storedFields().document(hits.scoreDocs[0].doc); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestLongValuesSource.java b/lucene/core/src/test/org/apache/lucene/search/TestLongValuesSource.java index ac3b6e7bd7e6..5732e61e3f90 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestLongValuesSource.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestLongValuesSource.java @@ -192,7 +192,7 @@ void checkSorts(Query query, Sort sort) throws Exception { CheckHits.checkEqual(query, expected.scoreDocs, actual.scoreDocs); - if (size < actual.totalHits.value) { + if (size < actual.totalHits.value()) { expected = searcher.searchAfter(expected.scoreDocs[size - 1], query, size, sort); actual = searcher.searchAfter(actual.scoreDocs[size - 1], query, size, mutatedSort); CheckHits.checkEqual(query, expected.scoreDocs, actual.scoreDocs); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java index dca7ff8c815a..4d3add048658 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java @@ -121,15 +121,15 @@ public void testEarlyTermination() throws IOException { new TopScoreDocCollectorManager(10, totalHitsThreshold); TopDocs topDocs = singleThreadedSearcher.search(new MatchAllDocsQuery(), collectorManager); - assertEquals(totalHitsThreshold + 1, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); + assertEquals(totalHitsThreshold + 1, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); IndexSearcher is = newSearcher(ir); collectorManager = new TopScoreDocCollectorManager(10, numDocs); topDocs = is.search(new MatchAllDocsQuery(), collectorManager); - assertEquals(numDocs, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation); + assertEquals(numDocs, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation()); iw.close(); ir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java index 700fba697c51..561609719313 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java @@ -23,44 +23,28 @@ public class TestMaxScoreAccumulator extends LuceneTestCase { public void testSimple() { MaxScoreAccumulator acc = new MaxScoreAccumulator(); acc.accumulate(0, 0f); - assertEquals(0f, acc.get().score, 0); - assertEquals(0, acc.get().docBase, 0); + assertEquals(0f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(0, 
MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(10, 0f); - assertEquals(0f, acc.get().score, 0); - assertEquals(0, acc.get().docBase, 0); + assertEquals(0f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(0, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(100, 1000f); - assertEquals(1000f, acc.get().score, 0); - assertEquals(100, acc.get().docBase, 0); + assertEquals(1000f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(100, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(1000, 5f); - assertEquals(1000f, acc.get().score, 0); - assertEquals(100, acc.get().docBase, 0); + assertEquals(1000f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(100, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(99, 1000f); - assertEquals(1000f, acc.get().score, 0); - assertEquals(99, acc.get().docBase, 0); + assertEquals(1000f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(99, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(1000, 1001f); - assertEquals(1001f, acc.get().score, 0); - assertEquals(1000, acc.get().docBase, 0); + assertEquals(1001f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(1000, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(10, 1001f); - assertEquals(1001f, acc.get().score, 0); - assertEquals(10, acc.get().docBase, 0); + assertEquals(1001f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(10, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(100, 1001f); - assertEquals(1001f, acc.get().score, 0); - assertEquals(10, acc.get().docBase, 0); - } - - public void testRandom() { - MaxScoreAccumulator acc = new MaxScoreAccumulator(); - int numDocs = atLeast(100); - int maxDocs = atLeast(10000); - MaxScoreAccumulator.DocAndScore max = new MaxScoreAccumulator.DocAndScore(-1, -1); - for (int i = 0; i < numDocs; i++) { - MaxScoreAccumulator.DocAndScore res = - new MaxScoreAccumulator.DocAndScore(random().nextInt(maxDocs), random().nextFloat()); - acc.accumulate(res.docBase, res.score); - if (res.compareTo(max) > 0) { - max = res; - } - } - assertEquals(max, acc.get()); + assertEquals(1001f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(10, MaxScoreAccumulator.docId(acc.getRaw()), 0); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java index d7ccea692759..6973cc0025a4 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java @@ -38,23 +38,6 @@ // These basic tests are similar to some of the tests in TestWANDScorer, and may not need to be kept public class TestMaxScoreBulkScorer extends LuceneTestCase { - private static class CapMaxScoreWindowAt2048Scorer extends FilterScorer { - - public CapMaxScoreWindowAt2048Scorer(Scorer in) { - super(in); - } - - @Override - public int advanceShallow(int target) throws IOException { - return Math.min(target | 0x7FF, in.advanceShallow(target)); - } - - @Override - public float getMaxScore(int upTo) throws IOException { - return in.getMaxScore(upTo); - } - } - private void writeDocuments(Directory dir) throws IOException { try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) { @@ -96,12 +79,10 @@ public void testBasicsWithTwoDisjunctionClauses() throws Exception { searcher .createWeight(searcher.rewrite(clause1), 
ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1); Scorer scorer2 = searcher .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2); BulkScorer scorer = new MaxScoreBulkScorer(context.reader().maxDoc(), Arrays.asList(scorer1, scorer2)); @@ -146,7 +127,9 @@ public void collect(int doc) throws IOException { } } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); } } } @@ -166,12 +149,10 @@ public void testBasicsWithTwoDisjunctionClausesAndSkipping() throws Exception { searcher .createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1); Scorer scorer2 = searcher .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2); BulkScorer scorer = new MaxScoreBulkScorer(context.reader().maxDoc(), Arrays.asList(scorer1, scorer2)); @@ -211,7 +192,9 @@ public void collect(int doc) throws IOException { } } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); } } } @@ -233,17 +216,14 @@ public void testBasicsWithThreeDisjunctionClauses() throws Exception { searcher .createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1); Scorer scorer2 = searcher .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2); Scorer scorer3 = searcher .createWeight(searcher.rewrite(clause3), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer3 = new CapMaxScoreWindowAt2048Scorer(scorer3); BulkScorer scorer = new MaxScoreBulkScorer( @@ -289,7 +269,9 @@ public void collect(int doc) throws IOException { } } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); } } } @@ -311,17 +293,14 @@ public void testBasicsWithThreeDisjunctionClausesAndSkipping() throws Exception searcher .createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1); Scorer scorer2 = searcher .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2); Scorer scorer3 = searcher .createWeight(searcher.rewrite(clause3), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer3 = new CapMaxScoreWindowAt2048Scorer(scorer3); BulkScorer scorer = new MaxScoreBulkScorer( @@ -367,7 +346,9 @@ public void collect(int doc) throws IOException { } } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); } } } @@ -503,7 +484,7 @@ public void finish() throws IOException { assertEquals(1, i); } }; - scorer.score(collector, liveDocs); + scorer.score(collector, liveDocs, 0, DocIdSetIterator.NO_MORE_DOCS); collector.finish(); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMinShouldMatch2.java b/lucene/core/src/test/org/apache/lucene/search/TestMinShouldMatch2.java index 95f6f7ee1247..e06951df164e 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMinShouldMatch2.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMinShouldMatch2.java @@ -29,6 +29,7 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import 
org.apache.lucene.index.Term; import org.apache.lucene.index.TermStates; @@ -128,7 +129,8 @@ private Scorer scorer(String[] values, int minShouldMatch, Mode mode) throws Exc case SCORER: return weight.scorer(reader.getContext()); case BULK_SCORER: - final BulkScorer bulkScorer = weight.optionalBulkScorer(reader.getContext()); + final ScorerSupplier ss = weight.scorerSupplier(reader.getContext()); + final BulkScorer bulkScorer = ss.bulkScorer(); if (bulkScorer == null) { if (weight.scorer(reader.getContext()) != null) { throw new AssertionError("BooleanScorer should be applicable for this query"); @@ -344,7 +346,8 @@ static class SlowMinShouldMatchScorer extends Scorer { final int maxDoc; final Set ords = new HashSet<>(); - final LeafSimScorer[] sims; + final SimScorer[] sims; + final NumericDocValues norms; final int minNrShouldMatch; double score = Float.NaN; @@ -355,7 +358,7 @@ static class SlowMinShouldMatchScorer extends Scorer { this.maxDoc = reader.maxDoc(); BooleanQuery bq = (BooleanQuery) weight.getQuery(); this.minNrShouldMatch = bq.getMinimumNumberShouldMatch(); - this.sims = new LeafSimScorer[(int) dv.getValueCount()]; + this.sims = new SimScorer[(int) dv.getValueCount()]; for (BooleanClause clause : bq.clauses()) { assert !clause.isProhibited(); assert !clause.isRequired(); @@ -365,14 +368,14 @@ static class SlowMinShouldMatchScorer extends Scorer { boolean success = ords.add(ord); assert success; // no dups TermStates ts = TermStates.build(searcher, term, true); - SimScorer w = + sims[(int) ord] = weight.similarity.scorer( 1f, searcher.collectionStatistics("field"), searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq())); - sims[(int) ord] = new LeafSimScorer(w, reader, "field", true); } } + norms = reader.getNormValues("field"); } @Override @@ -408,11 +411,15 @@ public int nextDoc() throws IOException { continue; } long ord; + long norm = 1L; + if (norms != null && norms.advanceExact(currentDoc)) { + norm = norms.longValue(); + } for (int i = 0; i < dv.docValueCount(); i++) { ord = dv.nextOrd(); if (ords.contains(ord)) { currentMatched++; - score += sims[(int) ord].score(currentDoc, 1); + score += sims[(int) ord].score(1, norm); } } if (currentMatched >= minNrShouldMatch) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java index 296a4648922b..636dda9deb62 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java @@ -367,10 +367,10 @@ public void testZeroPosIncr() throws IOException { mpqb.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0); } TopDocs hits = s.search(mpqb.build(), 2); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5); /* - for(int hit=0;hit 0); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java index e0e3928778bc..0b34e2e8eed2 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java @@ -82,12 +82,12 @@ public static void afterClass() throws Exception { public void testPrefixQuery() throws Exception { Query query = new PrefixQuery(new Term(FIELD, "tang")); - assertEquals("Number of matched 
documents", 2, searcher.search(query, 1000).totalHits.value); + assertEquals("Number of matched documents", 2, searcher.search(query, 1000).totalHits.value()); } public void testTermQuery() throws Exception { Query query = new TermQuery(new Term(FIELD, "tangfulin")); - assertEquals("Number of matched documents", 2, searcher.search(query, 1000).totalHits.value); + assertEquals("Number of matched documents", 2, searcher.search(query, 1000).totalHits.value()); } public void testTermBooleanQuery() throws Exception { @@ -95,7 +95,7 @@ public void testTermBooleanQuery() throws Exception { query.add(new TermQuery(new Term(FIELD, "tangfulin")), BooleanClause.Occur.SHOULD); query.add(new TermQuery(new Term(FIELD, "notexistnames")), BooleanClause.Occur.SHOULD); assertEquals( - "Number of matched documents", 2, searcher.search(query.build(), 1000).totalHits.value); + "Number of matched documents", 2, searcher.search(query.build(), 1000).totalHits.value()); } public void testPrefixBooleanQuery() throws Exception { @@ -103,6 +103,6 @@ public void testPrefixBooleanQuery() throws Exception { query.add(new PrefixQuery(new Term(FIELD, "tang")), BooleanClause.Occur.SHOULD); query.add(new TermQuery(new Term(FIELD, "notexistnames")), BooleanClause.Occur.SHOULD); assertEquals( - "Number of matched documents", 2, searcher.search(query.build(), 1000).totalHits.value); + "Number of matched documents", 2, searcher.search(query.build(), 1000).totalHits.value()); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java index 9e8761a8b2bf..48351e89b637 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java @@ -76,7 +76,7 @@ public void testMatchAll() throws Exception { PrefixQuery query = new PrefixQuery(new Term("field", "")); IndexSearcher searcher = newSearcher(reader); - assertEquals(1, searcher.search(query, 1000).totalHits.value); + assertEquals(1, searcher.search(query, 1000).totalHits.value()); writer.close(); reader.close(); directory.close(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestQueryRescorer.java b/lucene/core/src/test/org/apache/lucene/search/TestQueryRescorer.java index d422f85e012f..b3c1978ecaad 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestQueryRescorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestQueryRescorer.java @@ -169,7 +169,7 @@ public void testBasic() throws Exception { searcher.setSimilarity(new ClassicSimilarity()); TopDocs hits = searcher.search(bq.build(), 10); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); assertEquals("0", searcher.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(hits.scoreDocs[1].doc).get("id")); @@ -179,7 +179,7 @@ public void testBasic() throws Exception { TopDocs hits2 = QueryRescorer.rescore(searcher, hits, pq, 2.0, 10); // Resorting changed the order: - assertEquals(2, hits2.totalHits.value); + assertEquals(2, hits2.totalHits.value()); assertEquals("1", searcher.storedFields().document(hits2.scoreDocs[0].doc).get("id")); assertEquals("0", searcher.storedFields().document(hits2.scoreDocs[1].doc).get("id")); @@ -212,7 +212,7 @@ public void testNullScorerTermQuery() throws Exception { searcher.setSimilarity(new ClassicSimilarity()); TopDocs hits = searcher.search(bq.build(), 10); - assertEquals(2, hits.totalHits.value); + 
assertEquals(2, hits.totalHits.value()); assertEquals("0", searcher.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(hits.scoreDocs[1].doc).get("id")); @@ -221,7 +221,7 @@ public void testNullScorerTermQuery() throws Exception { TopDocs hits2 = QueryRescorer.rescore(searcher, hits, tq, 2.0, 10); // Just testing that null scorer is handled. - assertEquals(2, hits2.totalHits.value); + assertEquals(2, hits2.totalHits.value()); r.close(); dir.close(); @@ -250,7 +250,7 @@ public void testCustomCombine() throws Exception { IndexSearcher searcher = getSearcher(r); TopDocs hits = searcher.search(bq.build(), 10); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); assertEquals("0", searcher.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(hits.scoreDocs[1].doc).get("id")); @@ -272,7 +272,7 @@ protected float combine( }.rescore(searcher, hits, 10); // Resorting didn't change the order: - assertEquals(2, hits2.totalHits.value); + assertEquals(2, hits2.totalHits.value()); assertEquals("0", searcher.storedFields().document(hits2.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(hits2.scoreDocs[1].doc).get("id")); @@ -303,7 +303,7 @@ public void testExplain() throws Exception { IndexSearcher searcher = getSearcher(r); TopDocs hits = searcher.search(bq.build(), 10); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); assertEquals("0", searcher.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(hits.scoreDocs[1].doc).get("id")); @@ -326,7 +326,7 @@ protected float combine( TopDocs hits2 = rescorer.rescore(searcher, hits, 10); // Resorting changed the order: - assertEquals(2, hits2.totalHits.value); + assertEquals(2, hits2.totalHits.value()); assertEquals("1", searcher.storedFields().document(hits2.scoreDocs[0].doc).get("id")); assertEquals("0", searcher.storedFields().document(hits2.scoreDocs[1].doc).get("id")); @@ -376,7 +376,7 @@ public void testMissingSecondPassScore() throws Exception { IndexSearcher searcher = getSearcher(r); TopDocs hits = searcher.search(bq.build(), 10); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); assertEquals("0", searcher.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(hits.scoreDocs[1].doc).get("id")); @@ -386,7 +386,7 @@ public void testMissingSecondPassScore() throws Exception { TopDocs hits2 = QueryRescorer.rescore(searcher, hits, pq, 2.0, 10); // Resorting changed the order: - assertEquals(2, hits2.totalHits.value); + assertEquals(2, hits2.totalHits.value()); assertEquals("1", searcher.storedFields().document(hits2.scoreDocs[0].doc).get("id")); assertEquals("0", searcher.storedFields().document(hits2.scoreDocs[1].doc).get("id")); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRangeFieldsDocValuesQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRangeFieldsDocValuesQuery.java index 27ebeb2cff66..d295e9115020 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestRangeFieldsDocValuesQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestRangeFieldsDocValuesQuery.java @@ -244,7 +244,7 @@ public void testNoData() throws IOException { Query q1 = LongRangeDocValuesField.newSlowIntersectsQuery("bar", new long[] {20}, new long[] {27}); TopDocs r = 
searcher.search(q1, 10); - assertEquals(0, r.totalHits.value); + assertEquals(0, r.totalHits.value()); // test on field of wrong type Query q2 = diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java index 89d70da9d9b0..5c7e527040cf 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java @@ -170,7 +170,7 @@ public Automaton getAutomaton(String name) { RegexpQuery query = new RegexpQuery( newTerm(""), RegExp.ALL, myProvider, DEFAULT_DETERMINIZE_WORK_LIMIT); - assertEquals(1, searcher.search(query, 5).totalHits.value); + assertEquals(1, searcher.search(query, 5).totalHits.value()); } /** diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom.java index f39ac709d7fe..89b7b7455432 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom.java @@ -90,7 +90,7 @@ private String fillPattern(String wildcardPattern) { private void assertPatternHits(String pattern, int numHits) throws Exception { Query wq = new RegexpQuery(new Term("field", fillPattern(pattern))); TopDocs docs = searcher.search(wq, 25); - assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits.value); + assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits.value()); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/search/TestReqExclBulkScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestReqExclBulkScorer.java index ccd298e25caf..d0c989024a20 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestReqExclBulkScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestReqExclBulkScorer.java @@ -103,7 +103,9 @@ public void collect(int doc) throws IOException { actualMatches.set(doc); } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); } else { int next = 0; while (next < maxDoc) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSameScoresWithThreads.java b/lucene/core/src/test/org/apache/lucene/search/TestSameScoresWithThreads.java index 02f86826f69d..89aab597d952 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSameScoresWithThreads.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSameScoresWithThreads.java @@ -96,7 +96,7 @@ public void run() { for (Map.Entry ent : shuffled) { TopDocs actual = s.search(new TermQuery(new Term("body", ent.getKey())), 100); TopDocs expected = ent.getValue(); - assertEquals(expected.totalHits.value, actual.totalHits.value); + assertEquals(expected.totalHits.value(), actual.totalHits.value()); assertEquals( "query=" + ent.getKey().utf8ToString(), expected.scoreDocs.length, diff --git a/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java b/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java index 25404eacb308..b50b352274bb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java @@ -17,15 +17,11 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.BitSet; import java.util.Collection; import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; import 
org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.LuceneTestCase; @@ -35,34 +31,7 @@ public class TestScorerPerf extends LuceneTestCase { private final boolean validate = true; // set to false when doing performance testing - public void createRandomTerms(int nDocs, int nTerms, double power, Directory dir) - throws Exception { - int[] freq = new int[nTerms]; - Term[] terms = new Term[nTerms]; - for (int i = 0; i < nTerms; i++) { - int f = (nTerms + 1) - i; // make first terms less frequent - freq[i] = (int) Math.ceil(Math.pow(f, power)); - terms[i] = new Term("f", Character.toString((char) ('A' + i))); - } - - IndexWriter iw = - new IndexWriter( - dir, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.CREATE)); - for (int i = 0; i < nDocs; i++) { - Document d = new Document(); - for (int j = 0; j < nTerms; j++) { - if (random().nextInt(freq[j]) == 0) { - d.add(newStringField("f", terms[j].text(), Field.Store.NO)); - // System.out.println(d); - } - } - iw.addDocument(d); - } - iw.forceMerge(1); - iw.close(); - } - - public FixedBitSet randBitSet(int sz, int numBitsToSet) { + private static FixedBitSet randBitSet(int sz, int numBitsToSet) { FixedBitSet set = new FixedBitSet(sz); for (int i = 0; i < numBitsToSet; i++) { set.set(random().nextInt(sz)); @@ -70,7 +39,7 @@ public FixedBitSet randBitSet(int sz, int numBitsToSet) { return set; } - public FixedBitSet[] randBitSets(int numSets, int setSize) { + private static FixedBitSet[] randBitSets(int numSets, int setSize) { FixedBitSet[] sets = new FixedBitSet[numSets]; for (int i = 0; i < sets.length; i++) { sets[i] = randBitSet(setSize, random().nextInt(setSize)); @@ -78,25 +47,16 @@ public FixedBitSet[] randBitSets(int numSets, int setSize) { return sets; } - private static final class CountingHitCollectorManager + private record CountingHitCollectorManager() implements CollectorManager { - private final boolean validate; - private final FixedBitSet result; - - CountingHitCollectorManager(boolean validate, FixedBitSet result) { - this.validate = validate; - this.result = result; - } - @Override public CountingHitCollector newCollector() { - return validate ? 
new MatchingHitCollector(result) : new CountingHitCollector(); + return new CountingHitCollector(); } @Override - public CountingHitCollector reduce(Collection collectors) - throws IOException { + public CountingHitCollector reduce(Collection collectors) { CountingHitCollector result = new CountingHitCollector(); for (CountingHitCollector collector : collectors) { result.count += collector.count; @@ -106,7 +66,7 @@ public CountingHitCollector reduce(Collection collectors) } } - public static class CountingHitCollector extends SimpleCollector { + private static class CountingHitCollector extends SimpleCollector { int count = 0; int sum = 0; protected int docBase = 0; @@ -121,12 +81,8 @@ public int getCount() { return count; } - public int getSum() { - return sum; - } - @Override - protected void doSetNextReader(LeafReaderContext context) throws IOException { + protected void doSetNextReader(LeafReaderContext context) { docBase = context.docBase; } @@ -136,24 +92,6 @@ public ScoreMode scoreMode() { } } - public static class MatchingHitCollector extends CountingHitCollector { - FixedBitSet answer; - int pos = -1; - - public MatchingHitCollector(FixedBitSet answer) { - this.answer = answer; - } - - public void collect(int doc, float score) { - - pos = answer.nextSetBit(pos + 1); - if (pos != doc + docBase) { - throw new RuntimeException("Expected doc " + pos + " but got " + (doc + docBase)); - } - super.collect(doc); - } - } - private static class BitSetQuery extends Query { private final FixedBitSet docs; @@ -163,11 +101,10 @@ private static class BitSetQuery extends Query { } @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) - throws IOException { + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) { return new ConstantScoreWeight(this, boost) { @Override - public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { + public ScorerSupplier scorerSupplier(LeafReaderContext context) { final var scorer = new ConstantScoreScorer( score(), scoreMode, new BitSetIterator(docs, docs.approximateCardinality())); @@ -200,20 +137,22 @@ public int hashCode() { } } - FixedBitSet addClause(FixedBitSet[] sets, BooleanQuery.Builder bq, FixedBitSet result) { + private FixedBitSet addClause(FixedBitSet[] sets, BooleanQuery.Builder bq, FixedBitSet result) { final FixedBitSet rnd = sets[random().nextInt(sets.length)]; Query q = new BitSetQuery(rnd); bq.add(q, BooleanClause.Occur.MUST); if (validate) { - if (result == null) result = rnd.clone(); - else result.and(rnd); + if (result == null) { + result = rnd.clone(); + } else { + result.and(rnd); + } } return result; } - public int doConjunctions(IndexSearcher s, FixedBitSet[] sets, int iter, int maxClauses) + private void doConjunctions(IndexSearcher s, FixedBitSet[] sets, int iter, int maxClauses) throws IOException { - int ret = 0; for (int i = 0; i < iter; i++) { int nClauses = random().nextInt(maxClauses - 1) + 2; // min 2 clauses @@ -222,21 +161,17 @@ public int doConjunctions(IndexSearcher s, FixedBitSet[] sets, int iter, int max for (int j = 0; j < nClauses; j++) { result = addClause(sets, bq, result); } - CountingHitCollector hc = - s.search(bq.build(), new CountingHitCollectorManager(validate, result)); - ret += hc.getSum(); + CountingHitCollector hc = s.search(bq.build(), new CountingHitCollectorManager()); - if (validate) assertEquals(result.cardinality(), hc.getCount()); - // System.out.println(hc.getCount()); + if (validate) { + 
assertEquals(result.cardinality(), hc.getCount()); + } } - - return ret; } - public int doNestedConjunctions( + private void doNestedConjunctions( IndexSearcher s, FixedBitSet[] sets, int iter, int maxOuterClauses, int maxClauses) throws IOException { - int ret = 0; long nMatches = 0; for (int i = 0; i < iter; i++) { @@ -255,107 +190,15 @@ public int doNestedConjunctions( oq.add(bq.build(), BooleanClause.Occur.MUST); } // outer - CountingHitCollector hc = - s.search(oq.build(), new CountingHitCollectorManager(validate, result)); + CountingHitCollector hc = s.search(oq.build(), new CountingHitCollectorManager()); nMatches += hc.getCount(); - ret += hc.getSum(); - if (validate) assertEquals(result.cardinality(), hc.getCount()); - // System.out.println(hc.getCount()); - } - if (VERBOSE) System.out.println("Average number of matches=" + (nMatches / iter)); - return ret; - } - - public int doTermConjunctions( - Term[] terms, IndexSearcher s, int termsInIndex, int maxClauses, int iter) - throws IOException { - int ret = 0; - - long nMatches = 0; - for (int i = 0; i < iter; i++) { - int nClauses = random().nextInt(maxClauses - 1) + 2; // min 2 clauses - BooleanQuery.Builder bq = new BooleanQuery.Builder(); - BitSet termflag = new BitSet(termsInIndex); - for (int j = 0; j < nClauses; j++) { - int tnum; - // don't pick same clause twice - tnum = random().nextInt(termsInIndex); - if (termflag.get(tnum)) tnum = termflag.nextClearBit(tnum); - if (tnum < 0 || tnum >= termsInIndex) tnum = termflag.nextClearBit(0); - termflag.set(tnum); - Query tq = new TermQuery(terms[tnum]); - bq.add(tq, BooleanClause.Occur.MUST); + if (validate) { + assertEquals(result.cardinality(), hc.getCount()); } - - CountingHitCollector hc = s.search(bq.build(), new CountingHitCollectorManager(false, null)); - nMatches += hc.getCount(); - ret += hc.getSum(); - } - if (VERBOSE) System.out.println("Average number of matches=" + (nMatches / iter)); - - return ret; - } - - public int doNestedTermConjunctions( - IndexSearcher s, - Term[] terms, - int termsInIndex, - int maxOuterClauses, - int maxClauses, - int iter) - throws IOException { - int ret = 0; - long nMatches = 0; - for (int i = 0; i < iter; i++) { - int oClauses = random().nextInt(maxOuterClauses - 1) + 2; - BooleanQuery.Builder oq = new BooleanQuery.Builder(); - for (int o = 0; o < oClauses; o++) { - - int nClauses = random().nextInt(maxClauses - 1) + 2; // min 2 clauses - BooleanQuery.Builder bq = new BooleanQuery.Builder(); - BitSet termflag = new BitSet(termsInIndex); - for (int j = 0; j < nClauses; j++) { - int tnum; - // don't pick same clause twice - tnum = random().nextInt(termsInIndex); - if (termflag.get(tnum)) tnum = termflag.nextClearBit(tnum); - if (tnum < 0 || tnum >= 25) tnum = termflag.nextClearBit(0); - termflag.set(tnum); - Query tq = new TermQuery(terms[tnum]); - bq.add(tq, BooleanClause.Occur.MUST); - } // inner - - oq.add(bq.build(), BooleanClause.Occur.MUST); - } // outer - - CountingHitCollector hc = s.search(oq.build(), new CountingHitCollectorManager(false, null)); - nMatches += hc.getCount(); - ret += hc.getSum(); } - if (VERBOSE) System.out.println("Average number of matches=" + (nMatches / iter)); - return ret; - } - - public int doSloppyPhrase(IndexSearcher s, int termsInIndex, int maxClauses, int iter) - throws IOException { - int ret = 0; - - for (int i = 0; i < iter; i++) { - int nClauses = random().nextInt(maxClauses - 1) + 2; // min 2 clauses - PhraseQuery.Builder builder = new PhraseQuery.Builder(); - for (int j = 0; j < nClauses; j++) { 
- int tnum = random().nextInt(termsInIndex); - builder.add(new Term("f", Character.toString((char) (tnum + 'A')))); - } - // slop could be random too - builder.setSlop(termsInIndex); - PhraseQuery q = builder.build(); - - CountingHitCollector hc = s.search(q, new CountingHitCollectorManager(false, null)); - ret += hc.getSum(); + if (VERBOSE) { + System.out.println("Average number of matches=" + (nMatches / iter)); } - - return ret; } public void testConjunctions() throws Exception { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java b/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java index d0ecb7c1ad12..ecc5727625cf 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java @@ -234,10 +234,10 @@ void assertQuery(Query query, Sort sort) throws Exception { allManager = new TopScoreDocCollectorManager(maxDoc, null, Integer.MAX_VALUE); doScores = false; } else if (sort == Sort.RELEVANCE) { - allManager = new TopFieldCollectorManager(sort, maxDoc, null, Integer.MAX_VALUE, true); + allManager = new TopFieldCollectorManager(sort, maxDoc, null, Integer.MAX_VALUE); doScores = true; } else { - allManager = new TopFieldCollectorManager(sort, maxDoc, null, Integer.MAX_VALUE, true); + allManager = new TopFieldCollectorManager(sort, maxDoc, null, Integer.MAX_VALUE); doScores = random().nextBoolean(); } all = searcher.search(query, allManager); @@ -246,7 +246,7 @@ void assertQuery(Query query, Sort sort) throws Exception { } if (VERBOSE) { - System.out.println(" all.totalHits.value=" + all.totalHits.value); + System.out.println(" all.totalHits.value()=" + all.totalHits.value()); int upto = 0; StoredFields storedFields = searcher.storedFields(); for (ScoreDoc scoreDoc : all.scoreDocs) { @@ -261,22 +261,20 @@ void assertQuery(Query query, Sort sort) throws Exception { } int pageStart = 0; ScoreDoc lastBottom = null; - while (pageStart < all.totalHits.value) { + while (pageStart < all.totalHits.value()) { TopDocs paged; final CollectorManager pagedManager; if (sort == null) { if (VERBOSE) { System.out.println(" iter lastBottom=" + lastBottom); } - pagedManager = - new TopScoreDocCollectorManager(pageSize, lastBottom, Integer.MAX_VALUE, true); + pagedManager = new TopScoreDocCollectorManager(pageSize, lastBottom, Integer.MAX_VALUE); } else { if (VERBOSE) { System.out.println(" iter lastBottom=" + lastBottom); } pagedManager = - new TopFieldCollectorManager( - sort, pageSize, (FieldDoc) lastBottom, Integer.MAX_VALUE, true); + new TopFieldCollectorManager(sort, pageSize, (FieldDoc) lastBottom, Integer.MAX_VALUE); } paged = searcher.search(query, pagedManager); if (doScores) { @@ -298,7 +296,7 @@ void assertQuery(Query query, Sort sort) throws Exception { } void assertPage(int pageStart, TopDocs all, TopDocs paged) throws IOException { - assertEquals(all.totalHits.value, paged.totalHits.value); + assertEquals(all.totalHits.value(), paged.totalHits.value()); StoredFields storedFields = searcher.storedFields(); for (int i = 0; i < paged.scoreDocs.length; i++) { ScoreDoc sd1 = all.scoreDocs[pageStart + i]; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java b/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java index be0a4825ae6f..8a2639bea6a2 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java @@ -361,12 +361,12 @@ private 
PreviousSearchState assertSame( System.out.println(" shard=" + shardID + " maxDoc=" + shardSearchers[shardID].searcher.getIndexReader().maxDoc()); } */ - System.out.println(" single searcher: " + hits.totalHits.value); + System.out.println(" single searcher: " + hits.totalHits.value()); for (int i = 0; i < hits.scoreDocs.length; i++) { final ScoreDoc sd = hits.scoreDocs[i]; System.out.println(" doc=" + sd.doc + " score=" + sd.score); } - System.out.println(" shard searcher: " + shardHits.totalHits.value); + System.out.println(" shard searcher: " + shardHits.totalHits.value()); for (int i = 0; i < shardHits.scoreDocs.length; i++) { final ScoreDoc sd = shardHits.scoreDocs[i]; System.out.println( @@ -391,7 +391,7 @@ private PreviousSearchState assertSame( final ScoreDoc bottomHit; final ScoreDoc bottomHitShards; - if (numHitsPaged < hits.totalHits.value) { + if (numHitsPaged < hits.totalHits.value()) { // More hits to page through moreHits = true; if (sort == null) { @@ -408,7 +408,7 @@ private PreviousSearchState assertSame( } } else { - assertEquals(hits.totalHits.value, numHitsPaged); + assertEquals(hits.totalHits.value(), numHitsPaged); bottomHit = null; bottomHitShards = null; moreHits = false; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java b/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java index faf4c251fe30..03bea2aee60d 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java @@ -81,9 +81,9 @@ public void testBasics() throws Exception { // sanity check of searching TopDocs foodocs = searcher.search(new TermQuery(new Term("foo", "brown")), 10); - assertTrue(foodocs.totalHits.value > 0); + assertTrue(foodocs.totalHits.value() > 0); TopDocs bardocs = searcher.search(new TermQuery(new Term("bar", "brown")), 10); - assertTrue(bardocs.totalHits.value > 0); + assertTrue(bardocs.totalHits.value() > 0); assertTrue(foodocs.scoreDocs[0].score < bardocs.scoreDocs[0].score); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java index 8b69510f7354..80bbf76b7829 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java @@ -300,13 +300,13 @@ public void testSlopWithHoles() throws Exception { builder.add(new Term("lyrics", "drug"), 4); PhraseQuery pq = builder.build(); // "drug the drug"~1 - assertEquals(1, is.search(pq, 4).totalHits.value); + assertEquals(1, is.search(pq, 4).totalHits.value()); builder.setSlop(1); pq = builder.build(); - assertEquals(3, is.search(pq, 4).totalHits.value); + assertEquals(3, is.search(pq, 4).totalHits.value()); builder.setSlop(2); pq = builder.build(); - assertEquals(4, is.search(pq, 4).totalHits.value); + assertEquals(4, is.search(pq, 4).totalHits.value()); ir.close(); dir.close(); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSort.java b/lucene/core/src/test/org/apache/lucene/search/TestSort.java index 61c202e6884a..db2c7bbe884e 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSort.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSort.java @@ -109,7 +109,7 @@ public void testString() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.STRING)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - 
assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'foo' assertEquals("bar", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("foo", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -137,7 +137,7 @@ public void testStringReverse() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.STRING, true)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'foo' comes after 'bar' in reverse order assertEquals("foo", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("bar", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -165,7 +165,7 @@ public void testStringVal() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.STRING_VAL)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'foo' assertEquals("bar", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("foo", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -193,7 +193,7 @@ public void testStringValReverse() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.STRING_VAL, true)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'foo' comes after 'bar' in reverse order assertEquals("foo", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("bar", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -225,7 +225,7 @@ public void testInt() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.INT)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // numeric order assertEquals("-1", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -258,7 +258,7 @@ public void testIntReverse() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.INT, true)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // reverse numeric order assertEquals("300000", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -289,7 +289,7 @@ public void testIntMissing() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.INT)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as a 0 assertEquals("-1", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertNull(searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -324,7 +324,7 @@ public void testIntMissingLast() throws IOException { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as a Integer.MAX_VALUE 
assertEquals("-1", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -357,7 +357,7 @@ public void testLong() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.LONG)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // numeric order assertEquals("-1", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -390,7 +390,7 @@ public void testLongReverse() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.LONG, true)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // reverse numeric order assertEquals("3000000000", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -421,7 +421,7 @@ public void testLongMissing() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.LONG)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as 0 assertEquals("-1", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertNull(searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -456,7 +456,7 @@ public void testLongMissingLast() throws IOException { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as Long.MAX_VALUE assertEquals("-1", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -489,7 +489,7 @@ public void testFloat() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.FLOAT)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // numeric order assertEquals("-1.3", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4.2", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -522,7 +522,7 @@ public void testFloatReverse() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.FLOAT, true)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // reverse numeric order assertEquals("30.1", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4.2", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -553,7 +553,7 @@ public void testFloatMissing() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.FLOAT)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as 0 assertEquals("-1.3", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertNull(searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -588,7 +588,7 @@ public 
void testFloatMissingLast() throws IOException { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null is treated as Float.MAX_VALUE assertEquals("-1.3", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("4.2", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -625,7 +625,7 @@ public void testDouble() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.DOUBLE)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(4, td.totalHits.value); + assertEquals(4, td.totalHits.value()); // numeric order assertEquals("-1.3", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals( @@ -658,7 +658,7 @@ public void testDoubleSignedZero() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.DOUBLE)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // numeric order assertEquals("-0", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals("+0", searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -694,7 +694,7 @@ public void testDoubleReverse() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.DOUBLE, true)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(4, td.totalHits.value); + assertEquals(4, td.totalHits.value()); // numeric order assertEquals("30.1", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals( @@ -732,7 +732,7 @@ public void testDoubleMissing() throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.DOUBLE)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(4, td.totalHits.value); + assertEquals(4, td.totalHits.value()); // null treated as a 0 assertEquals("-1.3", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertNull(searcher.storedFields().document(td.scoreDocs[1].doc).get("value")); @@ -775,7 +775,7 @@ public void testDoubleMissingLast() throws IOException { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(4, td.totalHits.value); + assertEquals(4, td.totalHits.value()); // null treated as Double.MAX_VALUE assertEquals("-1.3", searcher.storedFields().document(td.scoreDocs[0].doc).get("value")); assertEquals( @@ -826,7 +826,7 @@ public void testMultiSort() throws IOException { new SortField("value2", SortField.Type.LONG)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(4, td.totalHits.value); + assertEquals(4, td.totalHits.value()); // 'bar' comes before 'foo' assertEquals("bar", searcher.storedFields().document(td.scoreDocs[0].doc).get("value1")); assertEquals("bar", searcher.storedFields().document(td.scoreDocs[1].doc).get("value1")); @@ -840,7 +840,7 @@ public void testMultiSort() throws IOException { // Now with overflow td = searcher.search(new MatchAllDocsQuery(), 1, sort); - assertEquals(4, td.totalHits.value); + assertEquals(4, td.totalHits.value()); assertEquals("bar", searcher.storedFields().document(td.scoreDocs[0].doc).get("value1")); assertEquals("0", searcher.storedFields().document(td.scoreDocs[0].doc).get("value2")); @@ -896,7 +896,7 @@ private void doTestStringGhost(boolean 
indexed) throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.STRING)); TopFieldDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); assertNull(((FieldDoc) td.scoreDocs[0]).fields[0]); assertNull(((FieldDoc) td.scoreDocs[1]).fields[0]); @@ -932,7 +932,7 @@ private void doTestIntGhost(boolean indexed) throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.INT)); TopFieldDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); assertEquals(0, ((FieldDoc) td.scoreDocs[0]).fields[0]); assertEquals(0, ((FieldDoc) td.scoreDocs[1]).fields[0]); @@ -968,7 +968,7 @@ private void doTestLongGhost(boolean indexed) throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.LONG)); TopFieldDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); assertEquals(0L, ((FieldDoc) td.scoreDocs[0]).fields[0]); assertEquals(0L, ((FieldDoc) td.scoreDocs[1]).fields[0]); @@ -1004,7 +1004,7 @@ private void doTestDoubleGhost(boolean indexed) throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.DOUBLE)); TopFieldDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); assertEquals(0.0, ((FieldDoc) td.scoreDocs[0]).fields[0]); assertEquals(0.0, ((FieldDoc) td.scoreDocs[1]).fields[0]); @@ -1040,7 +1040,7 @@ private void doTestFloatGhost(boolean indexed) throws IOException { Sort sort = new Sort(new SortField("value", SortField.Type.FLOAT)); TopFieldDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); assertEquals(0.0f, ((FieldDoc) td.scoreDocs[0]).fields[0]); assertEquals(0.0f, ((FieldDoc) td.scoreDocs[1]).fields[0]); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java index a1c0372cd08b..98496ae32106 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java @@ -101,8 +101,8 @@ public void testLongSortOptimization() throws IOException { FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[i]; assertEquals(i, ((Long) fieldDoc.fields[0]).intValue()); } - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // paging sort with after @@ -116,8 +116,8 @@ public void testLongSortOptimization() throws IOException { FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[i]; assertEquals(afterValue + 1 + i, fieldDoc.fields[0]); } - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that if there is the secondary sort on _score, scores are filled correctly @@ 
-132,8 +132,8 @@ public void testLongSortOptimization() throws IOException { float score = (float) fieldDoc.fields[1]; assertEquals(1.0, score, 0.001); } - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that if numeric field is a secondary sort, no optimization is run @@ -143,7 +143,7 @@ public void testLongSortOptimization() throws IOException { TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), collectorManager); assertEquals(topDocs.scoreDocs.length, numHits); assertEquals( - topDocs.totalHits.value, + topDocs.totalHits.value(), numDocs); // assert that all documents were collected => optimization was not run } @@ -185,7 +185,7 @@ public void testLongSortOptimizationOnFieldNotIndexedWithPoints() throws IOExcep assertEquals(i, ((Long) fieldDoc.fields[0]).intValue()); // returns expected values } assertEquals( - topDocs.totalHits.value, + topDocs.totalHits.value(), numDocs); // assert that all documents were collected => optimization was not run reader.close(); @@ -227,7 +227,7 @@ public void testSortOptimizationWithMissingValues() throws IOException { new TopFieldCollectorManager(sort, numHits, totalHitsThreshold); TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), collectorManager); assertEquals(topDocs.scoreDocs.length, numHits); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that optimization is not run when missing value setting of SortField is competitive // with Puring.SKIP @@ -237,11 +237,11 @@ public void testSortOptimizationWithMissingValues() throws IOException { sortField2.setMissingValue(0L); // set a competitive missing value final Sort sort = new Sort(sortField1, sortField2); CollectorManager manager = - new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold, true); + new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold); TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), manager); assertEquals(topDocs.scoreDocs.length, numHits); assertEquals( - topDocs.totalHits.value, + topDocs.totalHits.value(), numDocs); // assert that all documents were collected => optimization was not run } { // test that optimization is run when missing value setting of SortField is NOT competitive @@ -252,7 +252,7 @@ public void testSortOptimizationWithMissingValues() throws IOException { new TopFieldCollectorManager(sort, numHits, totalHitsThreshold); TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), collectorManager); assertEquals(topDocs.scoreDocs.length, numHits); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that optimization is not run when missing value setting of SortField is competitive @@ -264,10 +264,10 @@ public void testSortOptimizationWithMissingValues() throws IOException { sortField.setMissingValue(Long.MAX_VALUE); // set a competitive missing value final Sort sort = new Sort(sortField); CollectorManager manager = - new TopFieldCollectorManager(sort, numHits, after, totalHitsThreshold, true); + new TopFieldCollectorManager(sort, numHits, after, totalHitsThreshold); TopDocs topDocs = searcher.search(new 
MatchAllDocsQuery(), manager); assertEquals(topDocs.scoreDocs.length, numHits); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that optimization is not run when missing value setting of SortField is competitive @@ -279,10 +279,34 @@ public void testSortOptimizationWithMissingValues() throws IOException { sortField.setMissingValue(Long.MAX_VALUE); // set a competitive missing value final Sort sort = new Sort(sortField); CollectorManager manager = - new TopFieldCollectorManager(sort, numHits, after, totalHitsThreshold, true); + new TopFieldCollectorManager(sort, numHits, after, totalHitsThreshold); TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), manager); assertEquals(topDocs.scoreDocs.length, numHits); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); + } + + { + // test that optimization is run when missing value setting of SortField is NOT competitive + // with after on asc order + long afterValue = 3L; + FieldDoc after = new FieldDoc(3, Float.NaN, new Long[] {afterValue}); + final SortField sortField = new SortField("my_field", SortField.Type.LONG); + sortField.setMissingValue(2L); + final Sort sort = new Sort(sortField); + final TopFieldCollectorManager collectorManager = + new TopFieldCollectorManager(sort, numHits, after, totalHitsThreshold); + TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), collectorManager); + assertEquals(topDocs.scoreDocs.length, numHits); + for (int i = 0; i < numHits; i++) { + FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[i]; + assertEquals(afterValue + 1 + i, fieldDoc.fields[0]); + } + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + // expect to skip all but the first leaf in the BKD tree in the first segment as well as the + // second segment + // doc-0 has no target field, so we need to minus 1 + final int expectedSkipped = (7001 - 512 - 1) + (numDocs - 7001); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs - expectedSkipped + 1); } reader.close(); @@ -323,9 +347,9 @@ public void testNumericDocValuesOptimizationWithMissingValues() throws IOExcepti sortField.setMissingValue(0L); // missing value is not competitive final Sort sort = new Sort(sortField); CollectorManager manager = - new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold, true); + new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold); topDocs1 = searcher.search(new MatchAllDocsQuery(), manager); - assertNonCompetitiveHitsAreSkipped(topDocs1.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs1.totalHits.value(), numDocs); } { // Test that sort on sorted numeric field without sort optimization and with sort optimization // produce the same results @@ -334,7 +358,7 @@ public void testNumericDocValuesOptimizationWithMissingValues() throws IOExcepti final Sort sort = new Sort(sortField); sortField.setOptimizeSortWithPoints(false); CollectorManager manager = - new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold, true); + new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold); topDocs2 = searcher.search(new MatchAllDocsQuery(), manager); // assert that the resulting hits are the same assertEquals(topDocs1.scoreDocs.length, topDocs2.scoreDocs.length); @@ -347,7 +371,7 @@ public void 
testNumericDocValuesOptimizationWithMissingValues() throws IOExcepti assertEquals(fieldDoc.fields[0], fieldDoc2.fields[0]); assertEquals(fieldDoc.doc, fieldDoc2.doc); } - assertTrue(topDocs1.totalHits.value < topDocs2.totalHits.value); + assertTrue(topDocs1.totalHits.value() < topDocs2.totalHits.value()); } { // Test that we can't do optimization via NumericDocValues when there are multiple comparators @@ -357,10 +381,10 @@ public void testNumericDocValuesOptimizationWithMissingValues() throws IOExcepti sortField2.setMissingValue(0L); // missing value is not competitive final Sort multiSorts = new Sort(new SortField[] {sortField1, sortField2}); CollectorManager manager = - new TopFieldCollectorManager(multiSorts, numHits, null, totalHitsThreshold, true); + new TopFieldCollectorManager(multiSorts, numHits, null, totalHitsThreshold); TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), manager); // can't optimization with NumericDocValues when there are multiple comparators - assertEquals(topDocs.totalHits.value, numDocs); + assertEquals(topDocs.totalHits.value(), numDocs); } reader.close(); @@ -410,9 +434,9 @@ public void testSortOptimizationEqualValues() throws IOException { } if (reader.leaves().size() == 1) { // if segment size equals one, totalHits should always equals numHits plus 1 - assertEquals(topDocs.totalHits.value, numHits + 1); + assertEquals(topDocs.totalHits.value(), numHits + 1); } - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that sorting on a single field with equal values and after parameter @@ -431,7 +455,7 @@ public void testSortOptimizationEqualValues() throws IOException { assertEquals(100, fieldDoc.fields[0]); assertTrue(fieldDoc.doc > afterDocID); } - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that sorting on main field with equal values + another field for tie breaks doesn't @@ -450,7 +474,7 @@ public void testSortOptimizationEqualValues() throws IOException { } assertEquals(topDocs.scoreDocs.length, numHits); assertEquals( - topDocs.totalHits.value, + topDocs.totalHits.value(), numDocs); // assert that all documents were collected => optimization was not run } @@ -489,8 +513,8 @@ public void testFloatSortOptimization() throws IOException { FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[i]; assertEquals(1f * i, fieldDoc.fields[0]); } - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } reader.close(); @@ -544,7 +568,7 @@ public void testDocSortOptimizationMultipleIndices() throws IOException { for (int docID = 0; docID < topDocs[i].scoreDocs.length; docID++) { topDocs[i].scoreDocs[docID].shardIndex = i; } - collectedDocs += topDocs[i].totalHits.value; + collectedDocs += topDocs[i].totalHits.value(); totalDocs += numDocsInIndex; } TopFieldDocs mergedTopDcs = TopDocs.merge(sort, size, topDocs); @@ -600,8 +624,8 @@ public void testDocSortOptimizationWithAfter() throws IOException { int expectedDocID = searchAfter + 1 + i; assertEquals(expectedDocID, topDocs.scoreDocs[i].doc); } - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); - 
assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } // sort by _doc + _score with search after should trigger optimization @@ -618,8 +642,8 @@ public void testDocSortOptimizationWithAfter() throws IOException { int expectedDocID = searchAfter + 1 + i; assertEquals(expectedDocID, topDocs.scoreDocs[i].doc); } - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } // sort by _doc desc should not trigger optimization @@ -636,7 +660,7 @@ public void testDocSortOptimizationWithAfter() throws IOException { assertEquals(expectedDocID, topDocs.scoreDocs[i].doc); } // assert that all documents were collected - assertEquals(numDocs, topDocs.totalHits.value); + assertEquals(numDocs, topDocs.totalHits.value()); } } @@ -715,8 +739,8 @@ public void testDocSortOptimization() throws IOException { for (int i = 0; i < numHits; i++) { assertEquals(i, topDocs.scoreDocs[i].doc); } - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, 10); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), 10); } // sort by _doc with a bool query should skip all non-competitive documents @@ -736,8 +760,8 @@ public void testDocSortOptimization() throws IOException { assertEquals(Integer.toString(i + lowerRange), d.get("slf")); assertEquals("seg1", d.get("tf")); } - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, 10); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), 10); } reader.close(); @@ -935,7 +959,7 @@ public void testRandomLong() throws IOException { // test search int numHits = 1 + random().nextInt(100); CollectorManager manager = - new TopFieldCollectorManager(new Sort(sortField), numHits, null, numHits, true); + new TopFieldCollectorManager(new Sort(sortField), numHits, null, numHits); TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), manager); for (int i = 0; i < topDocs.scoreDocs.length; i++) { long expectedSeqNo = seqNos.get(i); @@ -986,12 +1010,12 @@ public void testSortOptimizationOnSortedNumericField() throws IOException { int expectedHits = Math.min(numDocs - visitedHits, batch); CollectorManager manager = - new TopFieldCollectorManager(sort, batch, (FieldDoc) after, totalHitsThreshold, true); + new TopFieldCollectorManager(sort, batch, (FieldDoc) after, totalHitsThreshold); TopDocs topDocs = searcher.search(query, manager); ScoreDoc[] scoreDocs = topDocs.scoreDocs; CollectorManager manager2 = - new TopFieldCollectorManager(sort2, batch, (FieldDoc) after, totalHitsThreshold, true); + new TopFieldCollectorManager(sort2, batch, (FieldDoc) after, totalHitsThreshold); TopDocs topDocs2 = searcher.search(query, manager2); ScoreDoc[] scoreDocs2 = topDocs2.scoreDocs; @@ -1007,8 +1031,8 @@ public void testSortOptimizationOnSortedNumericField() throws IOException { 
} expectedCollectedHits += numDocs; - collectedHits += topDocs.totalHits.value; - collectedHits2 += topDocs2.totalHits.value; + collectedHits += topDocs.totalHits.value(); + collectedHits2 += topDocs2.totalHits.value(); after = scoreDocs[expectedHits - 1]; } assertEquals(visitedHits, numDocs); @@ -1086,7 +1110,7 @@ private void doTestStringSortOptimization(DirectoryReader reader) throws IOExcep sortField.setMissingValue(SortField.STRING_LAST); Sort sort = new Sort(sortField); TopDocs topDocs = assertSort(reader, sort, numHits, null); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // simple descending sort @@ -1094,7 +1118,7 @@ private void doTestStringSortOptimization(DirectoryReader reader) throws IOExcep sortField.setMissingValue(SortField.STRING_FIRST); Sort sort = new Sort(sortField); TopDocs topDocs = assertSort(reader, sort, numHits, null); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // ascending sort that returns missing values first @@ -1120,7 +1144,7 @@ private void doTestStringSortOptimization(DirectoryReader reader) throws IOExcep BytesRef afterValue = new BytesRef(random().nextBoolean() ? "23" : "230000000"); FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue}); TopDocs topDocs = assertSort(reader, sort, numHits, after); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // paging descending sort with after @@ -1130,7 +1154,7 @@ private void doTestStringSortOptimization(DirectoryReader reader) throws IOExcep BytesRef afterValue = new BytesRef(random().nextBoolean() ? "17" : "170000000"); FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue}); TopDocs topDocs = assertSort(reader, sort, numHits, after); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // paging ascending sort with after that returns missing values first @@ -1141,7 +1165,7 @@ private void doTestStringSortOptimization(DirectoryReader reader) throws IOExcep BytesRef afterValue = new BytesRef(random().nextBoolean() ? "23" : "230000000"); FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue}); TopDocs topDocs = assertSort(reader, sort, numHits, after); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // paging descending sort with after that returns missing values first @@ -1151,7 +1175,7 @@ private void doTestStringSortOptimization(DirectoryReader reader) throws IOExcep BytesRef afterValue = new BytesRef(random().nextBoolean() ? 
"17" : "170000000"); FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue}); TopDocs topDocs = assertSort(reader, sort, numHits, after); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that if there is the secondary sort on _score, hits are still skipped @@ -1160,7 +1184,7 @@ private void doTestStringSortOptimization(DirectoryReader reader) throws IOExcep sortField.setMissingValue(SortField.STRING_LAST); Sort sort = new Sort(sortField, FIELD_SCORE); TopDocs topDocs = assertSort(reader, sort, numHits, null); - assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs); + assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value(), numDocs); } { // test that if string field is a secondary sort, no optimization is run @@ -1170,7 +1194,7 @@ private void doTestStringSortOptimization(DirectoryReader reader) throws IOExcep Sort sort = new Sort(FIELD_SCORE, sortField); TopDocs topDocs = assertSort(reader, sort, numHits, null); assertEquals( - topDocs.totalHits.value, + topDocs.totalHits.value(), numDocs); // assert that all documents were collected => optimization was not run } } @@ -1186,11 +1210,11 @@ public void doTestStringSortOptimizationDisabled(DirectoryReader reader) throws final int totalHitsThreshold = 5; CollectorManager manager = - new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold, true); + new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold); IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean(), false); TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), manager); - assertEquals(numDocs, topDocs.totalHits.value); + assertEquals(numDocs, topDocs.totalHits.value()); } private TopDocs assertSort(DirectoryReader reader, Sort sort, int n, FieldDoc after) @@ -1216,7 +1240,7 @@ private TopDocs assertSearchHits(DirectoryReader reader, Sort sort, int n, Field IndexSearcher searcher = newSearcher(reader, true, true, false); Query query = new MatchAllDocsQuery(); CollectorManager manager = - new TopFieldCollectorManager(sort, n, after, n, true); + new TopFieldCollectorManager(sort, n, after, n); TopDocs topDocs = searcher.search(query, manager); IndexSearcher unoptimizedSearcher = newSearcher(new NoIndexDirectoryReader(reader), true, true, false); @@ -1289,7 +1313,7 @@ public FieldInfos getFieldInfos() { false, IndexOptions.NONE, fi.getDocValuesType(), - fi.hasDocValuesSkipIndex(), + fi.docValuesSkipIndexType(), fi.getDocValuesGen(), fi.attributes(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java b/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java index 35b44fd719c9..2ff71e266f2d 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java @@ -17,14 +17,17 @@ package org.apache.lucene.search; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Random; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.document.Document; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; @@ -238,6 +241,7 @@ private static class RandomQuery 
extends Query { private final List docValues; public final List matchValues = Collections.synchronizedList(new ArrayList()); + private final Map bitsets = new ConcurrentHashMap<>(); // density should be 0.0 ... 1.0 public RandomQuery(long seed, float density, List docValues) { @@ -252,20 +256,34 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo return new ConstantScoreWeight(this, boost) { @Override public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { - Random random = new Random(context.docBase ^ seed); - final int maxDoc = context.reader().maxDoc(); - final NumericDocValues idSource = DocValues.getNumeric(context.reader(), "id"); - assertNotNull(idSource); - final FixedBitSet bits = new FixedBitSet(maxDoc); - for (int docID = 0; docID < maxDoc; docID++) { - assertEquals(docID, idSource.nextDoc()); - if (random.nextFloat() <= density) { - bits.set(docID); - // System.out.println(" acc id=" + idSource.getInt(docID) + " docID=" + docID); - matchValues.add(docValues.get((int) idSource.longValue())); - } - } - + FixedBitSet bits = + bitsets.computeIfAbsent( + context, + ctx -> { + Random random = new Random(context.docBase ^ seed); + final int maxDoc = context.reader().maxDoc(); + try { + final NumericDocValues idSource = + DocValues.getNumeric(context.reader(), "id"); + assertNotNull(idSource); + final FixedBitSet bitset = new FixedBitSet(maxDoc); + for (int docID = 0; docID < maxDoc; docID++) { + assertEquals(docID, idSource.nextDoc()); + if (random.nextFloat() <= density) { + bitset.set(docID); + // System.out.println(" acc id=" + idSource.getInt(docID) + " docID=" + + // docID); + matchValues.add(docValues.get((int) idSource.longValue())); + } + } + return bitset; + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }); + // The bitset is built for the whole segment, the first time each leaf is seen. Every + // partition iterates through its own set of doc ids, using a separate iterator backed by + // the shared bitset. 
final var scorer = new ConstantScoreScorer( score(), scoreMode, new BitSetIterator(bits, bits.approximateCardinality())); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortRescorer.java b/lucene/core/src/test/org/apache/lucene/search/TestSortRescorer.java index e48828e5b2f3..5322b2e87962 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortRescorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortRescorer.java @@ -84,7 +84,7 @@ public void testBasic() throws Exception { // Just first pass query TopDocs hits = searcher.search(query, 10); - assertEquals(3, hits.totalHits.value); + assertEquals(3, hits.totalHits.value()); assertEquals("3", r.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", r.storedFields().document(hits.scoreDocs[1].doc).get("id")); assertEquals("2", r.storedFields().document(hits.scoreDocs[2].doc).get("id")); @@ -93,7 +93,7 @@ public void testBasic() throws Exception { Sort sort = new Sort(new SortField("popularity", SortField.Type.INT, true)); Rescorer rescorer = new SortRescorer(sort); hits = rescorer.rescore(searcher, hits, 10); - assertEquals(3, hits.totalHits.value); + assertEquals(3, hits.totalHits.value()); assertEquals("2", r.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", r.storedFields().document(hits.scoreDocs[1].doc).get("id")); assertEquals("3", r.storedFields().document(hits.scoreDocs[2].doc).get("id")); @@ -120,7 +120,7 @@ public void testDoubleValuesSourceSort() throws Exception { // Just first pass query TopDocs hits = searcher.search(query, 10); - assertEquals(3, hits.totalHits.value); + assertEquals(3, hits.totalHits.value()); assertEquals("3", r.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", r.storedFields().document(hits.scoreDocs[1].doc).get("id")); assertEquals("2", r.storedFields().document(hits.scoreDocs[2].doc).get("id")); @@ -131,7 +131,7 @@ public void testDoubleValuesSourceSort() throws Exception { Sort sort = new Sort(source.getSortField(true)); Rescorer rescorer = new SortRescorer(sort); hits = rescorer.rescore(searcher, hits, 10); - assertEquals(3, hits.totalHits.value); + assertEquals(3, hits.totalHits.value()); assertEquals("2", r.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", r.storedFields().document(hits.scoreDocs[1].doc).get("id")); assertEquals("3", r.storedFields().document(hits.scoreDocs[2].doc).get("id")); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortedNumericSortField.java b/lucene/core/src/test/org/apache/lucene/search/TestSortedNumericSortField.java index d6919f014dd3..da0ea802eb9f 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortedNumericSortField.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortedNumericSortField.java @@ -42,7 +42,7 @@ public void testEmptyIndex() throws Exception { 10, new Sort(new SortedNumericSortField("sortednumeric", SortField.Type.LONG)), true); - assertEquals(0, td.totalHits.value); + assertEquals(0, td.totalHits.value()); // for an empty index, any selector should work for (SortedNumericSelector.Type v : SortedNumericSelector.Type.values()) { @@ -52,7 +52,7 @@ public void testEmptyIndex() throws Exception { 10, new Sort(new SortedNumericSortField("sortednumeric", SortField.Type.LONG, false, v)), true); - assertEquals(0, td.totalHits.value); + assertEquals(0, td.totalHits.value()); } } @@ -96,7 +96,7 @@ public void testForward() throws Exception { Sort sort = new Sort(new 
SortedNumericSortField("value", SortField.Type.INT)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 3 comes before 5 assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -125,7 +125,7 @@ public void testReverse() throws Exception { Sort sort = new Sort(new SortedNumericSortField("value", SortField.Type.INT, true)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'baz' assertEquals("2", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -158,7 +158,7 @@ public void testMissingFirst() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // 3 comes before 5 // null comes first assertEquals("3", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); @@ -193,7 +193,7 @@ public void testMissingLast() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // 3 comes before 5 assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -222,7 +222,7 @@ public void testSingleton() throws Exception { Sort sort = new Sort(new SortedNumericSortField("value", SortField.Type.INT)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 3 comes before 5 assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -250,7 +250,7 @@ public void testFloat() throws Exception { Sort sort = new Sort(new SortedNumericSortField("value", SortField.Type.FLOAT)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // -5 comes before -3 assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -278,7 +278,7 @@ public void testDouble() throws Exception { Sort sort = new Sort(new SortedNumericSortField("value", SortField.Type.DOUBLE)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // -5 comes before -3 assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSelector.java b/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSelector.java index 614c0376f236..2e41b26f70a4 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSelector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSelector.java @@ -53,7 +53,7 @@ public void testMax() throws Exception { Sort sort = 
new Sort(new SortedSetSortField("value", false, SortedSetSelector.Type.MAX)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'baz' comes before 'foo' assertEquals("2", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -83,7 +83,7 @@ public void testMaxReverse() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", true, SortedSetSelector.Type.MAX)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'baz' comes before 'foo' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -118,7 +118,7 @@ public void testMaxMissingFirst() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null comes first assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); // 'baz' comes before 'foo' @@ -155,7 +155,7 @@ public void testMaxMissingLast() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // 'baz' comes before 'foo' assertEquals("3", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -185,7 +185,7 @@ public void testMaxSingleton() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", false, SortedSetSelector.Type.MAX)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'baz' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -216,7 +216,7 @@ public void testMiddleMin() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", false, SortedSetSelector.Type.MIDDLE_MIN)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'b' comes before 'c' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -247,7 +247,7 @@ public void testMiddleMinReverse() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", true, SortedSetSelector.Type.MIDDLE_MIN)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'b' comes before 'c' assertEquals("2", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -283,7 +283,7 @@ public void testMiddleMinMissingFirst() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null comes first assertEquals("3", 
searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); // 'b' comes before 'c' @@ -321,7 +321,7 @@ public void testMiddleMinMissingLast() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // 'b' comes before 'c' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -351,7 +351,7 @@ public void testMiddleMinSingleton() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", false, SortedSetSelector.Type.MIDDLE_MIN)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'baz' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -382,7 +382,7 @@ public void testMiddleMax() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", false, SortedSetSelector.Type.MIDDLE_MAX)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'b' comes before 'c' assertEquals("2", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -413,7 +413,7 @@ public void testMiddleMaxReverse() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", true, SortedSetSelector.Type.MIDDLE_MAX)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'b' comes before 'c' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -449,7 +449,7 @@ public void testMiddleMaxMissingFirst() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // null comes first assertEquals("3", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); // 'b' comes before 'c' @@ -487,7 +487,7 @@ public void testMiddleMaxMissingLast() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // 'b' comes before 'c' assertEquals("2", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -517,7 +517,7 @@ public void testMiddleMaxSingleton() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", false, SortedSetSelector.Type.MIDDLE_MAX)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'baz' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSortField.java 
b/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSortField.java index 873d948373f1..74bea98cf607 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSortField.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSortField.java @@ -34,12 +34,12 @@ public void testEmptyIndex() throws Exception { Query query = new TermQuery(new Term("contents", "foo")); TopDocs td = empty.search(query, 10, new Sort(new SortedSetSortField("sortedset", false)), true); - assertEquals(0, td.totalHits.value); + assertEquals(0, td.totalHits.value()); // for an empty index, any selector should work for (SortedSetSelector.Type v : SortedSetSelector.Type.values()) { td = empty.search(query, 10, new Sort(new SortedSetSortField("sortedset", false, v)), true); - assertEquals(0, td.totalHits.value); + assertEquals(0, td.totalHits.value()); } } @@ -79,7 +79,7 @@ public void testForward() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", false)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'baz' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -108,7 +108,7 @@ public void testReverse() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", true)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'baz' assertEquals("2", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -141,7 +141,7 @@ public void testMissingFirst() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // 'bar' comes before 'baz' // null comes first assertEquals("3", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); @@ -176,7 +176,7 @@ public void testMissingLast() throws Exception { Sort sort = new Sort(sortField); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); // 'bar' comes before 'baz' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); @@ -205,7 +205,7 @@ public void testSingleton() throws Exception { Sort sort = new Sort(new SortedSetSortField("value", false)); TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); // 'bar' comes before 'baz' assertEquals("1", searcher.storedFields().document(td.scoreDocs[0].doc).get("id")); assertEquals("2", searcher.storedFields().document(td.scoreDocs[1].doc).get("id")); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSynonymQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestSynonymQuery.java index b3a886d49647..75b8ecb223ea 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSynonymQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSynonymQuery.java @@ -165,7 +165,7 @@ public void testToString() { } public void testScores() throws IOException { - doTestScores(2); + 
doTestScores(1); doTestScores(Integer.MAX_VALUE); } @@ -195,10 +195,10 @@ private void doTestScores(int totalHitsThreshold) throws IOException { new TopScoreDocCollectorManager( Math.min(reader.numDocs(), totalHitsThreshold), totalHitsThreshold); TopDocs topDocs = searcher.search(query, collectorManager); - if (topDocs.totalHits.value < totalHitsThreshold) { + if (topDocs.totalHits.value() < totalHitsThreshold) { assertEquals(new TotalHits(11, TotalHits.Relation.EQUAL_TO), topDocs.totalHits); } else { - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); } // All docs must have the same score for (int i = 0; i < topDocs.scoreDocs.length; ++i) { @@ -211,7 +211,7 @@ private void doTestScores(int totalHitsThreshold) throws IOException { } public void testBoosts() throws IOException { - doTestBoosts(2); + doTestBoosts(1); doTestBoosts(Integer.MAX_VALUE); } @@ -254,11 +254,11 @@ public void doTestBoosts(int totalHitsThreshold) throws IOException { new TopScoreDocCollectorManager( Math.min(reader.numDocs(), totalHitsThreshold), totalHitsThreshold); TopDocs topDocs = searcher.search(query, collectorManager); - if (topDocs.totalHits.value < totalHitsThreshold) { - assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation); - assertEquals(22, topDocs.totalHits.value); + if (topDocs.totalHits.value() < totalHitsThreshold) { + assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation()); + assertEquals(22, topDocs.totalHits.value()); } else { - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); } // All docs must have the same score for (int i = 0; i < topDocs.scoreDocs.length; ++i) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTaskExecutor.java b/lucene/core/src/test/org/apache/lucene/search/TestTaskExecutor.java index 26847f60a2a0..be49e2466a16 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTaskExecutor.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTaskExecutor.java @@ -132,7 +132,7 @@ private static void doTestInvokeAllFromTaskDoesNotDeadlockSameSearcher(Executor new IndexSearcher(reader, executor) { @Override protected LeafSlice[] slices(List leaves) { - return slices(leaves, 1, 1); + return slices(leaves, 1, 1, false); } }; @@ -206,7 +206,7 @@ private static void doTestInvokeAllFromTaskDoesNotDeadlockMultipleSearchers(Exec new IndexSearcher(reader, executor) { @Override protected LeafSlice[] slices(List leaves) { - return slices(leaves, 1, 1); + return slices(leaves, 1, 1, false); } }; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java index 707913b9193c..b65030216175 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java @@ -31,6 +31,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.KeywordField; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FilterDirectoryReader; @@ -51,6 +52,7 @@ import org.apache.lucene.tests.util.RamUsageTester; import 
org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.automaton.ByteRunAutomaton; @@ -101,7 +103,7 @@ public void testAllDocsInFieldTerm() throws IOException { TermInSetQuery query = new TermInSetQuery(field, queryTerms); TopDocs topDocs = searcher.search(query, numDocs); - assertEquals(numDocs, topDocs.totalHits.value); + assertEquals(numDocs, topDocs.totalHits.value()); reader.close(); dir.close(); @@ -119,11 +121,14 @@ public void testDuel() throws IOException { } Directory dir = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), dir); - final int numDocs = atLeast(100); + final int numDocs = atLeast(10_000); for (int i = 0; i < numDocs; ++i) { Document doc = new Document(); final BytesRef term = allTerms.get(random().nextInt(allTerms.size())); doc.add(new StringField(field, term, Store.NO)); + // Also include a doc values field with a skip-list so we can test doc-value rewrite as + // well: + doc.add(SortedSetDocValuesField.indexedField(field, term)); iw.addDocument(doc); } if (numTerms > 1 && random().nextBoolean()) { @@ -154,7 +159,9 @@ public void testDuel() throws IOException { } final Query q1 = new ConstantScoreQuery(bq.build()); final Query q2 = new TermInSetQuery(field, queryTerms); + final Query q3 = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, queryTerms); assertSameMatches(searcher, new BoostQuery(q1, boost), new BoostQuery(q2, boost), true); + assertSameMatches(searcher, new BoostQuery(q1, boost), new BoostQuery(q3, boost), false); } reader.close(); @@ -225,12 +232,59 @@ public void testReturnsNullScoreSupplier() throws Exception { } } + /** + * Make sure the doc values skipper isn't making the incorrect assumption that the min/max terms + * from a TermInSetQuery don't form a continuous range. + */ + public void testSkipperOptimizationGapAssumption() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir); + // Index the first 10,000 docs all with the term "b" to get some skip list blocks with the range + // [b, b]: + for (int i = 0; i < 10_000; i++) { + Document doc = new Document(); + BytesRef term = new BytesRef("b"); + doc.add(new SortedSetDocValuesField("field", term)); + doc.add(SortedSetDocValuesField.indexedField("idx_field", term)); + iw.addDocument(doc); + } + + // Index a couple more docs with terms "a" and "c": + Document doc = new Document(); + BytesRef term = new BytesRef("a"); + doc.add(new SortedSetDocValuesField("field", term)); + doc.add(SortedSetDocValuesField.indexedField("idx_field", term)); + iw.addDocument(doc); + doc = new Document(); + term = new BytesRef("c"); + doc.add(new SortedSetDocValuesField("field", term)); + doc.add(SortedSetDocValuesField.indexedField("idx_field", term)); + iw.addDocument(doc); + + iw.commit(); + IndexReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + // Our query is for (or "a" "c") which should use a skip-list optimization to exclude blocks of + // documents that fall outside the range [a, c]. 
We want to test that they don't incorrectly do + // the inverse and include all docs in a block that fall within [a, c] (which is why we have + // blocks of only "b" docs up-front): + List queryTerms = List.of(new BytesRef("a"), new BytesRef("c")); + Query q1 = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, "field", queryTerms); + Query q2 = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, "idx_field", queryTerms); + assertSameMatches(searcher, q1, q2, false); + + reader.close(); + dir.close(); + } + private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boolean scores) throws IOException { final int maxDoc = searcher.getIndexReader().maxDoc(); final TopDocs td1 = searcher.search(q1, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); final TopDocs td2 = searcher.search(q2, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); - assertEquals(td1.totalHits.value, td2.totalHits.value); + assertEquals(td1.totalHits.value(), td2.totalHits.value()); for (int i = 0; i < td1.scoreDocs.length; ++i) { assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); if (scores) { @@ -474,4 +528,19 @@ public void consumeTermsMatching( } }); } + + public void testTermsIterator() throws IOException { + TermInSetQuery empty = new TermInSetQuery("field", Collections.emptyList()); + BytesRefIterator it = empty.getBytesRefIterator(); + assertNull(it.next()); + + TermInSetQuery query = + new TermInSetQuery( + "field", List.of(newBytesRef("term1"), newBytesRef("term2"), newBytesRef("term3"))); + it = query.getBytesRefIterator(); + assertEquals(newBytesRef("term1"), it.next()); + assertEquals(newBytesRef("term2"), it.next()); + assertEquals(newBytesRef("term3"), it.next()); + assertNull(it.next()); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java index 7f4ad686baa3..e4f25179018a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java @@ -279,7 +279,7 @@ public void testExclusiveLowerNull() throws Exception { initializeIndex(new String[] {"A", "B", "", "C", "D"}, analyzer); IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = newSearcher(reader); - long numHits = searcher.search(query, 1000).totalHits.value; + long numHits = searcher.search(query, 1000).totalHits.value(); // When Lucene-38 is fixed, use the assert on the next line: assertEquals("A,B,,C,D => A, B & are in range", 3, numHits); // until Lucene-38 is fixed, use this assert: @@ -290,7 +290,7 @@ public void testExclusiveLowerNull() throws Exception { initializeIndex(new String[] {"A", "B", "", "D"}, analyzer); reader = DirectoryReader.open(dir); searcher = newSearcher(reader); - numHits = searcher.search(query, 1000).totalHits.value; + numHits = searcher.search(query, 1000).totalHits.value(); // When Lucene-38 is fixed, use the assert on the next line: assertEquals("A,B,,D => A, B & are in range", 3, numHits); // until Lucene-38 is fixed, use this assert: @@ -299,7 +299,7 @@ public void testExclusiveLowerNull() throws Exception { addDoc("C"); reader = DirectoryReader.open(dir); searcher = newSearcher(reader); - numHits = searcher.search(query, 1000).totalHits.value; + numHits = searcher.search(query, 1000).totalHits.value(); // When Lucene-38 is fixed, use the assert on the next line: assertEquals("C added, still A, B & are in range", 3, numHits); // until Lucene-38 is fixed, use this assert @@ 
-315,7 +315,7 @@ public void testInclusiveLowerNull() throws Exception { initializeIndex(new String[] {"A", "B", "", "C", "D"}, analyzer); IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = newSearcher(reader); - long numHits = searcher.search(query, 1000).totalHits.value; + long numHits = searcher.search(query, 1000).totalHits.value(); // When Lucene-38 is fixed, use the assert on the next line: assertEquals("A,B,,C,D => A,B,,C in range", 4, numHits); // until Lucene-38 is fixed, use this assert @@ -324,7 +324,7 @@ public void testInclusiveLowerNull() throws Exception { initializeIndex(new String[] {"A", "B", "", "D"}, analyzer); reader = DirectoryReader.open(dir); searcher = newSearcher(reader); - numHits = searcher.search(query, 1000).totalHits.value; + numHits = searcher.search(query, 1000).totalHits.value(); // When Lucene-38 is fixed, use the assert on the next line: assertEquals("A,B,,D - A, B and in range", 3, numHits); // until Lucene-38 is fixed, use this assert @@ -333,7 +333,7 @@ public void testInclusiveLowerNull() throws Exception { addDoc("C"); reader = DirectoryReader.open(dir); searcher = newSearcher(reader); - numHits = searcher.search(query, 1000).totalHits.value; + numHits = searcher.search(query, 1000).totalHits.value(); // When Lucene-38 is fixed, use the assert on the next line: assertEquals("C added => A,B,,C in range", 4, numHits); // until Lucene-38 is fixed, use this assert diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java index 314bc7a5d68f..3823f2ea6d78 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java @@ -123,7 +123,9 @@ public ScoreMode scoreMode() { return ScoreMode.COMPLETE; } }, - null); + null, + 0, + DocIdSetIterator.NO_MORE_DOCS); assertTrue("docs Size: " + docs.size() + " is not: " + 2, docs.size() == 2); TestHit doc0 = docs.get(0); TestHit doc5 = docs.get(1); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTimeLimitingBulkScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestTimeLimitingBulkScorer.java index cce82cd34ac0..de5512a904a1 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTimeLimitingBulkScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTimeLimitingBulkScorer.java @@ -113,7 +113,7 @@ public long cost() { private static QueryTimeout countingQueryTimeout(int timeallowed) { return new QueryTimeout() { - static int counter = 0; + int counter = 0; @Override public boolean shouldExit() { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java index e91858800a2d..cc7405f67724 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java @@ -89,12 +89,11 @@ protected TopDocs newTopDocs(ScoreDoc[] results, int start) { public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { final int base = context.docBase; return new LeafCollector() { - private int idx = 0; @Override public void collect(int doc) { ++totalHits; - pq.insertWithOverflow(new ScoreDoc(doc + base, scores[context.docBase + idx++])); + pq.insertWithOverflow(new ScoreDoc(doc + base, scores[context.docBase + doc])); } @Override @@ -161,8 +160,7 @@ private TopDocs doSearchWithThreshold( int 
numResults, int thresHold, Query q, IndexReader indexReader) throws IOException { IndexSearcher searcher = newSearcher(indexReader, true, true, false); TopScoreDocCollectorManager collectorManager = - new TopScoreDocCollectorManager( - numResults, null, thresHold, searcher.getSlices().length > 1); + new TopScoreDocCollectorManager(numResults, null, thresHold); return searcher.search(q, collectorManager); } @@ -170,8 +168,7 @@ private static TopDocs doConcurrentSearchWithThreshold( int numResults, int threshold, Query q, IndexReader indexReader) throws IOException { IndexSearcher searcher = newSearcher(indexReader, true, true, true); TopScoreDocCollectorManager collectorManager = - new TopScoreDocCollectorManager( - numResults, null, threshold, searcher.getSlices().length > 1); + new TopScoreDocCollectorManager(numResults, null, threshold); return searcher.search(q, collectorManager); } @@ -441,7 +438,7 @@ public void testTotalHits() throws Exception { leafCollector.collect(1); TopDocs topDocs = collector.topDocs(); - assertEquals(4, topDocs.totalHits.value); + assertEquals(4, topDocs.totalHits.value()); assertEquals(totalHitsThreshold < 4, scorer.minCompetitiveScore != null); assertEquals( new TotalHits( @@ -471,18 +468,18 @@ public void testRelationVsTopDocsCount() throws Exception { IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollectorManager collectorManager = new TopScoreDocCollectorManager(2, 10); TopDocs topDocs = searcher.search(new TermQuery(new Term("f", "foo")), collectorManager); - assertEquals(10, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation); + assertEquals(10, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation()); collectorManager = new TopScoreDocCollectorManager(2, 2); topDocs = searcher.search(new TermQuery(new Term("f", "foo")), collectorManager); - assertTrue(10 >= topDocs.totalHits.value); - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); + assertTrue(10 >= topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); collectorManager = new TopScoreDocCollectorManager(10, 2); topDocs = searcher.search(new TermQuery(new Term("f", "foo")), collectorManager); - assertEquals(10, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation); + assertEquals(10, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation()); } } } @@ -520,47 +517,45 @@ public void testConcurrentMinScore() throws Exception { scorer.score = 3; leafCollector.collect(0); - assertNull(minValueChecker.get()); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); assertNull(scorer.minCompetitiveScore); scorer2.score = 6; leafCollector2.collect(0); - assertNull(minValueChecker.get()); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); assertNull(scorer2.minCompetitiveScore); scorer.score = 2; leafCollector.collect(1); - assertEquals(2f, minValueChecker.get().score, 0f); - assertEquals(Math.nextUp(2f), scorer.minCompetitiveScore, 0f); - assertNull(scorer2.minCompetitiveScore); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); + assertNull(scorer.minCompetitiveScore); scorer2.score = 9; leafCollector2.collect(1); - assertEquals(6f, minValueChecker.get().score, 0f); - assertEquals(Math.nextUp(2f), scorer.minCompetitiveScore, 0f); - assertEquals(Math.nextUp(6f), scorer2.minCompetitiveScore, 0f); + 
assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); + assertNull(scorer2.minCompetitiveScore); scorer2.score = 7; leafCollector2.collect(2); - assertEquals(minValueChecker.get().score, 7f, 0f); - assertEquals(Math.nextUp(2f), scorer.minCompetitiveScore, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 7f, 0f); + assertNull(scorer.minCompetitiveScore); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); scorer2.score = 1; leafCollector2.collect(3); - assertEquals(minValueChecker.get().score, 7f, 0f); - assertEquals(Math.nextUp(2f), scorer.minCompetitiveScore, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 7f, 0f); + assertNull(scorer.minCompetitiveScore); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); scorer.score = 10; leafCollector.collect(2); - assertEquals(minValueChecker.get().score, 7f, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 7f, 0f); assertEquals(7f, scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); scorer.score = 11; leafCollector.collect(3); - assertEquals(minValueChecker.get().score, 10, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 10, 0f); assertEquals(Math.nextUp(10f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); @@ -572,25 +567,25 @@ public void testConcurrentMinScore() throws Exception { scorer3.score = 1f; leafCollector3.collect(0); - assertEquals(10f, minValueChecker.get().score, 0f); + assertEquals(10f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(Math.nextUp(10f), scorer3.minCompetitiveScore, 0f); scorer.score = 11; leafCollector.collect(4); - assertEquals(11f, minValueChecker.get().score, 0f); + assertEquals(11f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(Math.nextUp(11f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); assertEquals(Math.nextUp(10f), scorer3.minCompetitiveScore, 0f); scorer3.score = 2f; leafCollector3.collect(1); - assertEquals(minValueChecker.get().score, 11f, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 11f, 0f); assertEquals(Math.nextUp(11f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); assertEquals(Math.nextUp(11f), scorer3.minCompetitiveScore, 0f); TopDocs topDocs = manager.reduce(Arrays.asList(collector, collector2, collector3)); - assertEquals(11, topDocs.totalHits.value); + assertEquals(11, topDocs.totalHits.value()); assertEquals(new TotalHits(11, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), topDocs.totalHits); leafCollector.setScorer(scorer); @@ -637,8 +632,8 @@ public void testRandomMinCompetitiveScore() throws Exception { TopDocs tdc = doConcurrentSearchWithThreshold(5, 0, query, indexReader); TopDocs tdc2 = doSearchWithThreshold(5, 0, query, indexReader); - assertTrue(tdc.totalHits.value > 0); - assertTrue(tdc2.totalHits.value > 0); + assertTrue(tdc.totalHits.value() > 0); + assertTrue(tdc2.totalHits.value() > 0); CheckHits.checkEqual(query, tdc.scoreDocs, tdc2.scoreDocs); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java index c936c4fe0bb6..1d5b346c29db 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java @@ 
-51,13 +51,13 @@ public ShardSearcher(LeafReaderContext ctx, IndexReaderContext parent) { } public void search(Weight weight, Collector collector) throws IOException { - searchLeaf(ctx, weight, collector); + searchLeaf(ctx, 0, DocIdSetIterator.NO_MORE_DOCS, weight, collector); } public TopDocs search(Weight weight, int topN) throws IOException { TopScoreDocCollector collector = - new TopScoreDocCollectorManager(topN, null, Integer.MAX_VALUE, false).newCollector(); - searchLeaf(ctx, weight, collector); + new TopScoreDocCollectorManager(topN, null, Integer.MAX_VALUE).newCollector(); + searchLeaf(ctx, 0, DocIdSetIterator.NO_MORE_DOCS, weight, collector); return collector.topDocs(); } @@ -321,7 +321,7 @@ void testSort(boolean useFrom) throws Exception { } System.out.println( " top search: " - + topHits.totalHits.value + + topHits.totalHits.value() + " totalHits; hits=" + (topHits.scoreDocs == null ? "null" : topHits.scoreDocs.length)); if (topHits.scoreDocs != null) { @@ -348,8 +348,7 @@ void testSort(boolean useFrom) throws Exception { subHits = subSearcher.search(w, numHits); } else { final TopFieldCollector c = - new TopFieldCollectorManager(sort, numHits, null, Integer.MAX_VALUE, false) - .newCollector(); + new TopFieldCollectorManager(sort, numHits, null, Integer.MAX_VALUE).newCollector(); subSearcher.search(w, c); subHits = c.topDocs(0, numHits); } @@ -364,7 +363,7 @@ void testSort(boolean useFrom) throws Exception { " shard=" + shardIDX + " " - + subHits.totalHits.value + + subHits.totalHits.value() + " totalHits hits=" + (subHits.scoreDocs == null ? "null" : subHits.scoreDocs.length)); if (subHits.scoreDocs != null) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java index 6662cf0c2220..4393ace2c265 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java @@ -76,8 +76,7 @@ private static TopDocs doSearchWithThreshold( throws IOException { IndexSearcher searcher = newSearcher(indexReader); TopFieldCollectorManager manager = - new TopFieldCollectorManager( - sort, numResults, null, thresHold, searcher.getSlices().length > 1); + new TopFieldCollectorManager(sort, numResults, null, thresHold); return searcher.search(q, manager); } @@ -87,8 +86,7 @@ private static TopDocs doConcurrentSearchWithThreshold( IndexSearcher searcher = newSearcher(indexReader, true, true, true); TopFieldCollectorManager collectorManager = - new TopFieldCollectorManager( - sort, numResults, null, threshold, searcher.getSlices().length > 1); + new TopFieldCollectorManager(sort, numResults, null, threshold); TopDocs topDoc = searcher.search(q, collectorManager); @@ -122,7 +120,7 @@ public void testSort() throws Exception { for (int i = 0; i < sort.length; i++) { Query q = new MatchAllDocsQuery(); TopFieldCollectorManager tdc = - new TopFieldCollectorManager(sort[i], 10, null, Integer.MAX_VALUE, false); + new TopFieldCollectorManager(sort[i], 10, null, Integer.MAX_VALUE); TopDocs td = is.search(q, tdc); ScoreDoc[] sd = td.scoreDocs; for (int j = 0; j < sd.length; j++) { @@ -384,9 +382,9 @@ public void testSortNoResults() throws Exception { Sort[] sort = new Sort[] {new Sort(SortField.FIELD_DOC), new Sort()}; for (int i = 0; i < sort.length; i++) { TopDocsCollector tdc = - new TopFieldCollectorManager(sort[i], 10, null, Integer.MAX_VALUE, false).newCollector(); + new TopFieldCollectorManager(sort[i], 10, 
null, Integer.MAX_VALUE).newCollector(); TopDocs td = tdc.topDocs(); - assertEquals(0, td.totalHits.value); + assertEquals(0, td.totalHits.value()); } } @@ -577,47 +575,45 @@ public void testConcurrentMinScore() throws Exception { scorer.score = 3; leafCollector.collect(0); - assertNull(minValueChecker.get()); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); assertNull(scorer.minCompetitiveScore); scorer2.score = 6; leafCollector2.collect(0); - assertNull(minValueChecker.get()); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); assertNull(scorer2.minCompetitiveScore); scorer.score = 2; leafCollector.collect(1); - assertEquals(2f, minValueChecker.get().score, 0f); - assertEquals(2f, scorer.minCompetitiveScore, 0f); - assertNull(scorer2.minCompetitiveScore); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); + assertNull(scorer.minCompetitiveScore); scorer2.score = 9; leafCollector2.collect(1); - assertEquals(6f, minValueChecker.get().score, 0f); - assertEquals(2f, scorer.minCompetitiveScore, 0f); - assertEquals(6f, scorer2.minCompetitiveScore, 0f); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); + assertNull(scorer2.minCompetitiveScore); scorer2.score = 7; leafCollector2.collect(2); - assertEquals(7f, minValueChecker.get().score, 0f); - assertEquals(2f, scorer.minCompetitiveScore, 0f); + assertEquals(7f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); + assertNull(scorer.minCompetitiveScore); assertEquals(7f, scorer2.minCompetitiveScore, 0f); scorer2.score = 1; leafCollector2.collect(3); - assertEquals(7f, minValueChecker.get().score, 0f); - assertEquals(2f, scorer.minCompetitiveScore, 0f); + assertEquals(7f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); + assertNull(scorer.minCompetitiveScore); assertEquals(7f, scorer2.minCompetitiveScore, 0f); scorer.score = 10; leafCollector.collect(2); - assertEquals(7f, minValueChecker.get().score, 0f); + assertEquals(7f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(7f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); scorer.score = 11; leafCollector.collect(3); - assertEquals(10f, minValueChecker.get().score, 0f); + assertEquals(10f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(10f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); @@ -629,25 +625,25 @@ public void testConcurrentMinScore() throws Exception { scorer3.score = 1f; leafCollector3.collect(0); - assertEquals(10f, minValueChecker.get().score, 0f); + assertEquals(10f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(10f, scorer3.minCompetitiveScore, 0f); scorer.score = 11; leafCollector.collect(4); - assertEquals(11f, minValueChecker.get().score, 0f); + assertEquals(11f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(11f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); assertEquals(10f, scorer3.minCompetitiveScore, 0f); scorer3.score = 2f; leafCollector3.collect(1); - assertEquals(11f, minValueChecker.get().score, 0f); + assertEquals(11f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(11f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); assertEquals(11f, scorer3.minCompetitiveScore, 0f); TopFieldDocs topDocs = manager.reduce(Arrays.asList(collector, collector2, collector3)); - assertEquals(11, topDocs.totalHits.value); + assertEquals(11, topDocs.totalHits.value()); 
assertEquals(new TotalHits(11, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), topDocs.totalHits); leafCollector.setScorer(scorer); @@ -695,8 +691,8 @@ public void testRandomMinCompetitiveScore() throws Exception { TopDocs tdc = doConcurrentSearchWithThreshold(5, 0, query, sort, indexReader); TopDocs tdc2 = doSearchWithThreshold(5, 0, query, sort, indexReader); - assertTrue(tdc.totalHits.value > 0); - assertTrue(tdc2.totalHits.value > 0); + assertTrue(tdc.totalHits.value() > 0); + assertTrue(tdc2.totalHits.value() > 0); CheckHits.checkEqual(query, tdc.scoreDocs, tdc2.scoreDocs); } @@ -718,21 +714,20 @@ public void testRelationVsTopDocsCount() throws Exception { try (IndexReader reader = DirectoryReader.open(w)) { IndexSearcher searcher = new IndexSearcher(reader); - TopFieldCollectorManager collectorManager = - new TopFieldCollectorManager(sort, 2, null, 10, true); + TopFieldCollectorManager collectorManager = new TopFieldCollectorManager(sort, 2, null, 10); TopDocs topDocs = searcher.search(new TermQuery(new Term("f", "foo")), collectorManager); - assertEquals(10, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation); + assertEquals(10, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation()); - collectorManager = new TopFieldCollectorManager(sort, 2, null, 2, true); + collectorManager = new TopFieldCollectorManager(sort, 2, null, 2); topDocs = searcher.search(new TermQuery(new Term("f", "foo")), collectorManager); - assertTrue(10 >= topDocs.totalHits.value); - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation); + assertTrue(10 >= topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation()); - collectorManager = new TopFieldCollectorManager(sort, 10, null, 2, true); + collectorManager = new TopFieldCollectorManager(sort, 10, null, 2); topDocs = searcher.search(new TermQuery(new Term("f", "foo")), collectorManager); - assertEquals(10, topDocs.totalHits.value); - assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation); + assertEquals(10, topDocs.totalHits.value()); + assertEquals(TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation()); } } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollectorEarlyTermination.java b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollectorEarlyTermination.java index 4c788492becf..52f313b3dfd9 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollectorEarlyTermination.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollectorEarlyTermination.java @@ -29,9 +29,10 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher.LeafReaderContextPartition; +import org.apache.lucene.search.IndexSearcher.LeafSlice; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.MockRandomMergePolicy; @@ -39,6 +40,7 @@ import org.apache.lucene.tests.search.CheckHits; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.Bits; public class TestTopFieldCollectorEarlyTermination extends 
LuceneTestCase { @@ -119,12 +121,22 @@ private void doTestEarlyTermination(boolean paging) throws IOException { final int iters = atLeast(1); for (int i = 0; i < iters; ++i) { createRandomIndex(false); - int maxSegmentSize = 0; - for (LeafReaderContext ctx : reader.leaves()) { - maxSegmentSize = Math.max(ctx.reader().numDocs(), maxSegmentSize); - } for (int j = 0; j < iters; ++j) { final IndexSearcher searcher = newSearcher(reader); + int maxSliceSize = 0; + for (LeafSlice slice : searcher.getSlices()) { + int numDocs = 0; // number of live docs in the slice + for (LeafReaderContextPartition partition : slice.partitions) { + Bits liveDocs = partition.ctx.reader().getLiveDocs(); + int maxDoc = Math.min(partition.maxDocId, partition.ctx.reader().maxDoc()); + for (int doc = partition.minDocId; doc < maxDoc; ++doc) { + if (liveDocs == null || liveDocs.get(doc)) { + numDocs++; + } + } + } + maxSliceSize = Math.max(maxSliceSize, numDocs); + } final int numHits = TestUtil.nextInt(random(), 1, numDocs); FieldDoc after; if (paging) { @@ -148,16 +160,16 @@ private void doTestEarlyTermination(boolean paging) throws IOException { TopDocs td1 = searcher.search(query, manager1); TopDocs td2 = searcher.search(query, manager2); - assertNotEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, td1.totalHits.relation); - if (paging == false && maxSegmentSize > numHits && query instanceof MatchAllDocsQuery) { + assertNotEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, td1.totalHits.relation()); + if (paging == false && maxSliceSize > numHits && query instanceof MatchAllDocsQuery) { // Make sure that we sometimes early terminate - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, td2.totalHits.relation); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, td2.totalHits.relation()); } - if (td2.totalHits.relation == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO) { - assertTrue(td2.totalHits.value >= td1.scoreDocs.length); - assertTrue(td2.totalHits.value <= reader.maxDoc()); + if (td2.totalHits.relation() == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO) { + assertTrue(td2.totalHits.value() >= td1.scoreDocs.length); + assertTrue(td2.totalHits.value() <= reader.maxDoc()); } else { - assertEquals(td2.totalHits.value, td1.totalHits.value); + assertEquals(td2.totalHits.value(), td1.totalHits.value()); } CheckHits.checkEqual(query, td1.scoreDocs, td2.scoreDocs); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java index eb2afb58e34c..7ad1d2bacc21 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.search; +import com.carrotsearch.randomizedtesting.RandomizedTest; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; @@ -40,8 +41,10 @@ public void testBasics() throws Exception { IndexReader reader = writer.getReader(); writer.close(); - IndexSearcher searcher = newSearcher(reader, true, true, random().nextBoolean()); - TotalHitCountCollectorManager collectorManager = new TotalHitCountCollectorManager(); + Concurrency concurrency = RandomizedTest.randomFrom(Concurrency.values()); + IndexSearcher searcher = newSearcher(reader, true, true, concurrency); + final TotalHitCountCollectorManager collectorManager = + new 
TotalHitCountCollectorManager(searcher.getSlices()); int totalHits = searcher.search(new MatchAllDocsQuery(), collectorManager); assertEquals(5, totalHits); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTotalHits.java b/lucene/core/src/test/org/apache/lucene/search/TestTotalHits.java index b27d551a46d8..f404cc72ebc1 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTotalHits.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTotalHits.java @@ -26,17 +26,18 @@ public class TestTotalHits extends LuceneTestCase { public void testEqualsAndHashcode() { TotalHits totalHits1 = randomTotalHits(); assertFalse(totalHits1.equals(null)); - assertFalse(totalHits1.equals(totalHits1.value)); + assertFalse(totalHits1.equals(totalHits1.value())); assertEquals(totalHits1, totalHits1); assertEquals(totalHits1.hashCode(), totalHits1.hashCode()); - TotalHits totalHits2 = new TotalHits(totalHits1.value, totalHits1.relation); + TotalHits totalHits2 = new TotalHits(totalHits1.value(), totalHits1.relation()); assertEquals(totalHits1, totalHits2); assertEquals(totalHits2, totalHits1); assertEquals(totalHits1.hashCode(), totalHits2.hashCode()); TotalHits totalHits4 = randomTotalHits(); - if (totalHits4.value == totalHits1.value && totalHits4.relation == totalHits1.relation) { + if (totalHits4.value() == totalHits1.value() + && totalHits4.relation() == totalHits1.relation()) { assertEquals(totalHits1, totalHits4); assertEquals(totalHits2, totalHits4); assertEquals(totalHits1.hashCode(), totalHits4.hashCode()); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java index 04807eb19ae7..4dfca8101fdb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java @@ -21,6 +21,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; @@ -418,4 +419,42 @@ public void testLarge() throws IOException { reader.close(); dir.close(); } + + public void testCostEstimate() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + for (int i = 0; i < 1000; i++) { + Document doc = new Document(); + doc.add(newStringField("body", "foo bar", Field.Store.NO)); + writer.addDocument(doc); + doc = new Document(); + doc.add(newStringField("body", "foo wuzzle", Field.Store.NO)); + writer.addDocument(doc); + doc = new Document(); + doc.add(newStringField("body", "bar " + i, Field.Store.NO)); + writer.addDocument(doc); + } + writer.flush(); + writer.forceMerge(1); + writer.close(); + + IndexReader reader = DirectoryReader.open(dir); + IndexSearcher searcher = newSearcher(reader); + LeafReaderContext lrc = reader.leaves().get(0); + + WildcardQuery query = new WildcardQuery(new Term("body", "foo*")); + Query rewritten = searcher.rewrite(query); + Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f); + ScorerSupplier supplier = weight.scorerSupplier(lrc); + assertEquals(2000, supplier.cost()); // Sum the terms doc freqs + + query = new WildcardQuery(new Term("body", "bar*")); + rewritten = searcher.rewrite(query); + weight = rewritten.createWeight(searcher, 
ScoreMode.COMPLETE_NO_SCORES, 1.0f); + supplier = weight.scorerSupplier(lrc); + assertEquals(3000, supplier.cost()); // Too many terms, assume worst-case all terms match + + reader.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWildcardRandom.java b/lucene/core/src/test/org/apache/lucene/search/TestWildcardRandom.java index 36e0515f70cd..96134a2295d8 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestWildcardRandom.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestWildcardRandom.java @@ -94,7 +94,7 @@ private void assertPatternHits(String pattern, int numHits) throws Exception { } Query wq = new WildcardQuery(new Term("field", filledPattern)); TopDocs docs = searcher.search(wq, 25); - assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits.value); + assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits.value()); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java index a40e476a3b6d..cbe2c7f3bdc9 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java @@ -56,16 +56,16 @@ public void testTermScoreIsEqualToBoost() throws IOException { IndexSearcher searcher = newSearcher(reader); searcher.setSimilarity(new BooleanSimilarity()); TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "bar")), 2); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); assertEquals(1f, topDocs.scoreDocs[0].score, 0f); assertEquals(1f, topDocs.scoreDocs[1].score, 0f); topDocs = searcher.search(new TermQuery(new Term("foo", "baz")), 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1f, topDocs.scoreDocs[0].score, 0f); topDocs = searcher.search(new BoostQuery(new TermQuery(new Term("foo", "baz")), 3f), 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(3f, topDocs.scoreDocs[0].score, 0f); reader.close(); @@ -89,11 +89,11 @@ public void testPhraseScoreIsEqualToBoost() throws IOException { PhraseQuery query = new PhraseQuery(2, "foo", "bar", "quux"); TopDocs topDocs = searcher.search(query, 2); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1f, topDocs.scoreDocs[0].score, 0f); topDocs = searcher.search(new BoostQuery(query, 7), 2); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(7f, topDocs.scoreDocs[0].score, 0f); reader.close(); diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java index 2e41b758c3e7..3ec08d75f850 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java @@ -70,7 +70,7 @@ public void tearDown() throws Exception { public void testHit() throws IOException { Query query = new TermQuery(new Term("test", "hit")); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1, 
topDocs.scoreDocs.length); assertTrue(topDocs.scoreDocs[0].score != 0); } @@ -78,13 +78,13 @@ public void testHit() throws IOException { public void testMiss() throws IOException { Query query = new TermQuery(new Term("test", "miss")); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } public void testEmpty() throws IOException { Query query = new TermQuery(new Term("empty", "miss")); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } public void testBQHit() throws IOException { @@ -93,7 +93,7 @@ public void testBQHit() throws IOException { .add(new TermQuery(new Term("test", "hit")), Occur.SHOULD) .build(); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1, topDocs.scoreDocs.length); assertTrue(topDocs.scoreDocs[0].score != 0); } @@ -105,7 +105,7 @@ public void testBQHitOrMiss() throws IOException { .add(new TermQuery(new Term("test", "miss")), Occur.SHOULD) .build(); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1, topDocs.scoreDocs.length); assertTrue(topDocs.scoreDocs[0].score != 0); } @@ -117,7 +117,7 @@ public void testBQHitOrEmpty() throws IOException { .add(new TermQuery(new Term("empty", "miss")), Occur.SHOULD) .build(); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1, topDocs.scoreDocs.length); assertTrue(topDocs.scoreDocs[0].score != 0); } @@ -125,7 +125,7 @@ public void testBQHitOrEmpty() throws IOException { public void testDMQHit() throws IOException { Query query = new DisjunctionMaxQuery(Arrays.asList(new TermQuery(new Term("test", "hit"))), 0); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1, topDocs.scoreDocs.length); assertTrue(topDocs.scoreDocs[0].score != 0); } @@ -137,7 +137,7 @@ public void testDMQHitOrMiss() throws IOException { new TermQuery(new Term("test", "hit")), new TermQuery(new Term("test", "miss"))), 0); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1, topDocs.scoreDocs.length); assertTrue(topDocs.scoreDocs[0].score != 0); } @@ -149,7 +149,7 @@ public void testDMQHitOrEmpty() throws IOException { new TermQuery(new Term("test", "hit")), new TermQuery(new Term("empty", "miss"))), 0); TopDocs topDocs = indexSearcher.search(query, 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(1, topDocs.scoreDocs.length); assertTrue(topDocs.scoreDocs[0].score != 0); } diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestRawTFSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestRawTFSimilarity.java new file mode 100644 index 000000000000..2631058834eb --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestRawTFSimilarity.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import java.io.IOException; +import java.util.Random; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.search.similarities.BaseSimilarityTestCase; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.IOUtils; + +public class TestRawTFSimilarity extends BaseSimilarityTestCase { + + private Directory directory; + private IndexReader indexReader; + private IndexSearcher indexSearcher; + + @Override + protected Similarity getSimilarity(Random random) { + return new RawTFSimilarity(); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + directory = newDirectory(); + try (IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig())) { + final Document document1 = new Document(); + final Document document2 = new Document(); + final Document document3 = new Document(); + document1.add(LuceneTestCase.newTextField("test", "one", Field.Store.YES)); + document2.add(LuceneTestCase.newTextField("test", "two two", Field.Store.YES)); + document3.add(LuceneTestCase.newTextField("test", "three three three", Field.Store.YES)); + indexWriter.addDocument(document1); + indexWriter.addDocument(document2); + indexWriter.addDocument(document3); + indexWriter.commit(); + } + indexReader = DirectoryReader.open(directory); + indexSearcher = newSearcher(indexReader); + indexSearcher.setSimilarity(new RawTFSimilarity()); + } + + @Override + public void tearDown() throws Exception { + IOUtils.close(indexReader, directory); + super.tearDown(); + } + + public void testOne() throws IOException { + implTest("one", 1f); + } + + public void testTwo() throws IOException { + implTest("two", 2f); + } + + public void testThree() throws IOException { + implTest("three", 3f); + } + + private void implTest(String text, float expectedScore) throws IOException { + Query query = new TermQuery(new Term("test", text)); + TopDocs topDocs = indexSearcher.search(query, 1); + assertEquals(1, topDocs.totalHits.value()); + assertEquals(1, topDocs.scoreDocs.length); + assertEquals(expectedScore, topDocs.scoreDocs[0].score, 0.0); + } + + public void testBoostQuery() throws IOException { + Query query = new TermQuery(new Term("test", "three")); + float boost = 14f; + TopDocs topDocs = indexSearcher.search(new BoostQuery(query, boost), 1); 
+ assertEquals(1, topDocs.totalHits.value()); + assertEquals(1, topDocs.scoreDocs.length); + assertEquals(42f, topDocs.scoreDocs[0].score, 0.0); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java index e75b82ca9a44..1430385c708c 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java @@ -90,7 +90,7 @@ public void testEmptyIndex() throws Exception { for (Similarity sim : sims) { is.setSimilarity(sim); - assertEquals(0, is.search(new TermQuery(new Term("foo", "bar")), 10).totalHits.value); + assertEquals(0, is.search(new TermQuery(new Term("foo", "bar")), 10).totalHits.value()); } ir.close(); dir.close(); @@ -112,7 +112,7 @@ public void testEmptyField() throws Exception { BooleanQuery.Builder query = new BooleanQuery.Builder(); query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); query.add(new TermQuery(new Term("bar", "baz")), BooleanClause.Occur.SHOULD); - assertEquals(1, is.search(query.build(), 10).totalHits.value); + assertEquals(1, is.search(query.build(), 10).totalHits.value()); } ir.close(); dir.close(); @@ -136,7 +136,7 @@ public void testEmptyTerm() throws Exception { BooleanQuery.Builder query = new BooleanQuery.Builder(); query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); query.add(new TermQuery(new Term("foo", "baz")), BooleanClause.Occur.SHOULD); - assertEquals(1, is.search(query.build(), 10).totalHits.value); + assertEquals(1, is.search(query.build(), 10).totalHits.value()); } ir.close(); dir.close(); @@ -160,7 +160,7 @@ public void testNoNorms() throws Exception { is.setSimilarity(sim); BooleanQuery.Builder query = new BooleanQuery.Builder(); query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); - assertEquals(1, is.search(query.build(), 10).totalHits.value); + assertEquals(1, is.search(query.build(), 10).totalHits.value()); } ir.close(); dir.close(); @@ -234,7 +234,7 @@ public void testOmitTF() throws Exception { is.setSimilarity(sim); BooleanQuery.Builder query = new BooleanQuery.Builder(); query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); - assertEquals(1, is.search(query.build(), 10).totalHits.value); + assertEquals(1, is.search(query.build(), 10).totalHits.value()); } ir.close(); dir.close(); @@ -260,7 +260,7 @@ public void testOmitTFAndNorms() throws Exception { is.setSimilarity(sim); BooleanQuery.Builder query = new BooleanQuery.Builder(); query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); - assertEquals(1, is.search(query.build(), 10).totalHits.value); + assertEquals(1, is.search(query.build(), 10).totalHits.value()); } ir.close(); dir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java index 6bb581d108ad..b24bd9c5afb1 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -499,7 +499,7 @@ public void testHeartList() throws IOException { for (SimilarityBase sim : sims) { searcher.setSimilarity(sim); TopDocs topDocs = searcher.search(q, 1000); - assertEquals("Failed: " + sim.toString(), 3, topDocs.totalHits.value); + 
assertEquals("Failed: " + sim.toString(), 3, topDocs.totalHits.value()); } } @@ -526,17 +526,17 @@ public void tearDown() throws Exception { // LUCENE-5221 public void testDiscountOverlapsBoost() throws IOException { - BM25Similarity expected = new BM25Similarity(false); - SimilarityBase actual = - new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2()); - actual.setDiscountOverlaps(false); + final BM25Similarity expected0 = new BM25Similarity(false); + final SimilarityBase actual0 = + new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), false); FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS); state.setLength(5); state.setNumOverlap(2); - assertEquals(expected.computeNorm(state), actual.computeNorm(state)); - expected = new BM25Similarity(); - actual.setDiscountOverlaps(true); - assertEquals(expected.computeNorm(state), actual.computeNorm(state)); + assertEquals(expected0.computeNorm(state), actual0.computeNorm(state)); + final BM25Similarity expected1 = new BM25Similarity(true); + final SimilarityBase actual1 = + new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), true); + assertEquals(expected1.computeNorm(state), actual1.computeNorm(state)); } } diff --git a/lucene/core/src/test/org/apache/lucene/util/BaseSortTestCase.java b/lucene/core/src/test/org/apache/lucene/util/BaseSortTestCase.java index ba4ed6fa82be..5a0382fe3d82 100644 --- a/lucene/core/src/test/org/apache/lucene/util/BaseSortTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/BaseSortTestCase.java @@ -23,15 +23,7 @@ public abstract class BaseSortTestCase extends LuceneTestCase { - public static class Entry implements java.lang.Comparable { - - public final int value; - public final int ord; - - public Entry(int value, int ord) { - this.value = value; - this.ord = ord; - } + public record Entry(int value, int ord) implements Comparable { @Override public int compareTo(Entry other) { diff --git a/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java index 972fd0c3c848..41f320ba0071 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java @@ -197,13 +197,7 @@ public void testTimSort() { } } - static class Item implements Comparable { - final int val, order; - - Item(int val, int order) { - this.val = val; - this.order = order; - } + record Item(int val, int order) implements Comparable { @Override public int compareTo(Item other) { diff --git a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java index e26aed408b74..67f0918f46b7 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java @@ -98,26 +98,7 @@ void doPrevSetBit(java.util.BitSet a, FixedBitSet b) { } // test interleaving different FixedBitSetIterator.next()/skipTo() - void doIterate(java.util.BitSet a, FixedBitSet b, int mode) throws IOException { - if (mode == 1) doIterate1(a, b); - if (mode == 2) doIterate2(a, b); - } - - void doIterate1(java.util.BitSet a, FixedBitSet b) throws IOException { - assertEquals(a.cardinality(), b.cardinality()); - int aa = -1, bb = -1; - DocIdSetIterator iterator = new BitSetIterator(b, 0); - do { - aa = a.nextSetBit(aa + 1); - bb = - (bb < b.length() && 
random().nextBoolean()) - ? iterator.nextDoc() - : iterator.advance(bb + 1); - assertEquals(aa == -1 ? DocIdSetIterator.NO_MORE_DOCS : aa, bb); - } while (aa >= 0); - } - - void doIterate2(java.util.BitSet a, FixedBitSet b) throws IOException { + void doIterate(java.util.BitSet a, FixedBitSet b) throws IOException { assertEquals(a.cardinality(), b.cardinality()); int aa = -1, bb = -1; DocIdSetIterator iterator = new BitSetIterator(b, 0); @@ -128,7 +109,7 @@ void doIterate2(java.util.BitSet a, FixedBitSet b) throws IOException { } while (aa >= 0); } - void doRandomSets(int maxSize, int iter, int mode) throws IOException { + void doRandomSets(int maxSize, int iter) throws IOException { java.util.BitSet a0 = null; FixedBitSet b0 = null; @@ -181,7 +162,7 @@ void doRandomSets(int maxSize, int iter, int mode) throws IOException { FixedBitSet bb = b.clone(); bb.flip(fromIndex, toIndex); - doIterate(aa, bb, mode); // a problem here is from flip or doIterate + doIterate(aa, bb); // a problem here is from flip or doIterate fromIndex = random().nextInt(sz / 2); toIndex = fromIndex + random().nextInt(sz - fromIndex); @@ -230,10 +211,10 @@ void doRandomSets(int maxSize, int iter, int mode) throws IOException { assertEquals(a0.cardinality(), b0.cardinality()); assertEquals(a_or.cardinality(), b_or.cardinality()); - doIterate(a_and, b_and, mode); - doIterate(a_or, b_or, mode); - doIterate(a_andn, b_andn, mode); - doIterate(a_xor, b_xor, mode); + doIterate(a_and, b_and); + doIterate(a_or, b_or); + doIterate(a_andn, b_andn); + doIterate(a_xor, b_xor); assertEquals(a_and.cardinality(), b_and.cardinality()); assertEquals(a_or.cardinality(), b_or.cardinality()); @@ -250,8 +231,7 @@ void doRandomSets(int maxSize, int iter, int mode) throws IOException { // larger testsuite. public void testSmall() throws IOException { final int iters = TEST_NIGHTLY ? atLeast(1000) : 100; - doRandomSets(atLeast(1200), iters, 1); - doRandomSets(atLeast(1200), iters, 2); + doRandomSets(atLeast(1200), iters); } // uncomment to run a bigger test (~2 minutes). 
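The TestFixedBitSet change above folds doIterate1/doIterate2 into a single doIterate that duels a java.util.BitSet reference against a FixedBitSet through its DocIdSetIterator, randomly interleaving nextDoc() and advance() (see the retained "test interleaving" comment). A minimal, self-contained sketch of that duel pattern follows, assuming lucene-core on the classpath; the FixedBitSetDuel class and duel() method names are illustrative only, not part of this patch.

import java.io.IOException;
import java.util.Random;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;

class FixedBitSetDuel {
  // Walk both set-bit sequences in lockstep: the reference java.util.BitSet via nextSetBit(),
  // the FixedBitSet via its DocIdSetIterator, randomly mixing nextDoc() and advance().
  static void duel(java.util.BitSet expected, FixedBitSet actual, Random random) throws IOException {
    DocIdSetIterator it = new BitSetIterator(actual, 0 /* cost hint, not relevant to this sketch */);
    int exp = -1;
    int act = -1;
    do {
      exp = expected.nextSetBit(exp + 1);
      act = (act < actual.length() && random.nextBoolean()) ? it.nextDoc() : it.advance(act + 1);
      if ((exp == -1 ? DocIdSetIterator.NO_MORE_DOCS : exp) != act) {
        throw new AssertionError("expected " + exp + " but iterator returned " + act);
      }
    } while (exp >= 0);
  }
}

Interleaving nextDoc() with advance() in a single walk exercises both iterator entry points, and the transitions between them, in one pass.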
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java index 00577c3db524..6e449a550028 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java @@ -353,4 +353,35 @@ private static int xorBitCount(byte[] a, byte[] b) { } return res; } + + public void testFindNextGEQ() { + int padding = TestUtil.nextInt(random(), 0, 5); + int[] values = new int[128 + padding]; + int v = 0; + for (int i = 0; i < 128; ++i) { + v += TestUtil.nextInt(random(), 1, 1000); + values[i] = v; + } + + // Now duel with slowFindNextGEQ + for (int iter = 0; iter < 1_000; ++iter) { + int from = TestUtil.nextInt(random(), 0, 127); + int target = + TestUtil.nextInt(random(), values[from], Math.max(values[from], values[127])) + + random().nextInt(10) + - 5; + assertEquals( + slowFindNextGEQ(values, 128, target, from), + VectorUtil.findNextGEQ(values, target, from, 128)); + } + } + + private static int slowFindNextGEQ(int[] buffer, int length, int target, int from) { + for (int i = from; i < length; ++i) { + if (buffer[i] >= target) { + return i; + } + } + return length; + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestVersion.java b/lucene/core/src/test/org/apache/lucene/util/TestVersion.java index d34ee2f78db3..b3a69b48fa46 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestVersion.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestVersion.java @@ -33,24 +33,27 @@ public void testOnOrAfter() throws Exception { assertTrue("LATEST must be always onOrAfter(" + v + ")", Version.LATEST.onOrAfter(v)); } } - assertTrue(Version.LUCENE_10_0_0.onOrAfter(Version.LUCENE_9_0_0)); + assertTrue(Version.LUCENE_11_0_0.onOrAfter(Version.fromBits(9, 0, 0))); + assertTrue(Version.LUCENE_11_0_0.onOrAfter(Version.LUCENE_10_0_0)); + assertTrue(Version.LUCENE_11_0_0.onOrAfter(Version.LUCENE_10_1_0)); } public void testToString() { - assertEquals("9.0.0", Version.LUCENE_9_0_0.toString()); + assertEquals("9.0.0", Version.fromBits(9, 0, 0).toString()); assertEquals("10.0.0", Version.LUCENE_10_0_0.toString()); + assertEquals("10.1.0", Version.LUCENE_10_1_0.toString()); + assertEquals("11.0.0", Version.LUCENE_11_0_0.toString()); } public void testParseLeniently() throws Exception { + assertEquals(Version.LUCENE_11_0_0, Version.parseLeniently("11.0")); + assertEquals(Version.LUCENE_11_0_0, Version.parseLeniently("11.0.0")); + assertEquals(Version.LUCENE_11_0_0, Version.parseLeniently("LUCENE_11_0")); + assertEquals(Version.LUCENE_11_0_0, Version.parseLeniently("LUCENE_11_0_0")); assertEquals(Version.LUCENE_10_0_0, Version.parseLeniently("10.0")); assertEquals(Version.LUCENE_10_0_0, Version.parseLeniently("10.0.0")); assertEquals(Version.LUCENE_10_0_0, Version.parseLeniently("LUCENE_10_0")); assertEquals(Version.LUCENE_10_0_0, Version.parseLeniently("LUCENE_10_0_0")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("9.0")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("9.0.0")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("LUCENE_90")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("LUCENE_9_0")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("LUCENE_9_0_0")); assertEquals(Version.LATEST, Version.parseLeniently("LATEST")); assertEquals(Version.LATEST, Version.parseLeniently("latest")); @@ -108,7 +111,7 @@ public void testParseLenientlyOnAllConstants() throws Exception {
public void testParse() throws Exception { assertEquals(Version.LUCENE_10_0_0, Version.parse("10.0.0")); - assertEquals(Version.LUCENE_9_0_0, Version.parse("9.0.0")); + assertEquals(Version.LUCENE_11_0_0, Version.parse("11.0.0")); // Version does not pass judgement on the major version: assertEquals(1, Version.parse("1.0").major); @@ -116,7 +119,9 @@ public void testParse() throws Exception { } public void testForwardsCompatibility() throws Exception { - assertTrue(Version.parse("9.10.20").onOrAfter(Version.LUCENE_9_0_0)); + assertTrue(Version.parse("11.10.20").onOrAfter(Version.LUCENE_11_0_0)); + assertTrue(Version.parse("10.10.20").onOrAfter(Version.LUCENE_10_0_0)); + assertTrue(Version.parse("9.10.20").onOrAfter(Version.fromBits(9, 0, 0))); } public void testParseExceptions() { diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java index 820ae1881c64..ecf0beaa6180 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java @@ -262,15 +262,7 @@ public static Automaton minimize(Automaton a, int determinizeWorkLimit) { return Operations.removeDeadStates(result); } - static final class IntPair { - - final int n1, n2; - - IntPair(int n1, int n2) { - this.n1 = n1; - this.n2 = n2; - } - } + record IntPair(int n1, int n2) {} static final class StateList { diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java index e4dd739ef78d..3c7d6eea198a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java @@ -87,7 +87,7 @@ public void testSameLanguage() throws Exception { Automaton a2 = Operations.removeDeadStates( Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar"))); - assertTrue(Operations.sameLanguage(a1, a2)); + assertTrue(AutomatonTestUtil.sameLanguage(a1, a2)); } public void testCommonPrefixString() throws Exception { @@ -257,7 +257,7 @@ public void testMinimizeSimple() throws Exception { Automaton a = Automata.makeString("foobar"); Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, aMin)); + assertTrue(AutomatonTestUtil.sameLanguage(a, aMin)); } public void testMinimize2() throws Exception { @@ -266,7 +266,7 @@ public void testMinimize2() throws Exception { Arrays.asList(Automata.makeString("foobar"), Automata.makeString("boobar"))); Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(Operations.removeDeadStates(a), DEFAULT_DETERMINIZE_WORK_LIMIT), aMin)); } @@ -276,7 +276,7 @@ public void testReverse() throws Exception { Automaton ra = Operations.reverse(a); Automaton a2 = Operations.determinize(Operations.reverse(ra), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, a2)); + assertTrue(AutomatonTestUtil.sameLanguage(a, a2)); } public void testOptional() throws Exception { @@ -401,7 +401,7 @@ public void testReverseRandom1() throws Exception { Automaton ra = Operations.reverse(a); Automaton rra = Operations.reverse(ra); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( 
Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE), Operations.determinize(Operations.removeDeadStates(rra), Integer.MAX_VALUE))); } @@ -502,7 +502,7 @@ public void testBuilderRandom() throws Exception { } assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE), Operations.determinize( Operations.removeDeadStates(builder.finish()), Integer.MAX_VALUE))); @@ -735,7 +735,8 @@ public void testSameLanguage1() throws Exception { a2.addTransition(0, state, 'a'); a2.finishState(); assertTrue( - Operations.sameLanguage(Operations.removeDeadStates(a), Operations.removeDeadStates(a2))); + AutomatonTestUtil.sameLanguage( + Operations.removeDeadStates(a), Operations.removeDeadStates(a2))); } private Automaton randomNoOp(Automaton a) { @@ -1288,7 +1289,7 @@ private void assertSame(Collection terms, Automaton a) { Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms), Integer.MAX_VALUE)); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( a2, Operations.removeDeadStates(Operations.determinize(a, Integer.MAX_VALUE)))); // Do same check, in UTF8 space @@ -1613,7 +1614,7 @@ public void testMakeBinaryIntervalOpenBoth() throws Exception { public void testAcceptAllEmptyStringMin() throws Exception { Automaton a = Automata.makeBinaryInterval(newBytesRef(), true, null, true); - assertTrue(Operations.sameLanguage(Automata.makeAnyBinary(), a)); + assertTrue(AutomatonTestUtil.sameLanguage(Automata.makeAnyBinary(), a)); } private static IntsRef toIntsRef(String s) { diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java index 65616fa55b99..e69568d38739 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java @@ -41,7 +41,7 @@ public void testAgainstSimple() throws Exception { a = AutomatonTestUtil.determinizeSimple(a); Automaton b = Operations.determinize(a, Integer.MAX_VALUE); // TODO: more verifications possible? 
- assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); } } @@ -53,20 +53,20 @@ private static void assertAutomaton(Automaton a) { Operations.complement( Operations.complement(a, DEFAULT_DETERMINIZE_WORK_LIMIT), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a union a = a equivalent = Operations.determinize( Operations.removeDeadStates(Operations.union(a, a)), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a intersect a = a equivalent = Operations.determinize( Operations.removeDeadStates(Operations.intersection(a, a)), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a minus a = empty Automaton empty = Operations.minus(a, a, DEFAULT_DETERMINIZE_WORK_LIMIT); @@ -81,7 +81,7 @@ private static void assertAutomaton(Automaton a) { equivalent = Operations.minus(optional, Automata.makeEmptyString(), DEFAULT_DETERMINIZE_WORK_LIMIT); // System.out.println("equiv " + equivalent); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java index c8adb8751b90..bc6d268c15e9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java @@ -81,44 +81,46 @@ private void assertLev(String s, int maxDistance) { // check that the dfa for n-1 accepts a subset of the dfa for n if (n > 0) { assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n - 1]), Operations.removeDeadStates(automata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n - 1]), Operations.removeDeadStates(tautomata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(tautomata[n - 1]), Operations.removeDeadStates(automata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(tautomata[n - 1]), Operations.removeDeadStates(tautomata[n]))); assertNotSame(automata[n - 1], automata[n]); } // check that Lev(N) is a subset of LevT(N) assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n]), Operations.removeDeadStates(tautomata[n]))); // special checks for specific n switch (n) { case 0: // easy, matches the string itself assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Automata.makeString(s), Operations.removeDeadStates(automata[0]))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Automata.makeString(s), Operations.removeDeadStates(tautomata[0]))); break; case 1: // generate a lev1 naively, and check the accepted lang is the same. 
assertTrue( - Operations.sameLanguage(naiveLev1(s), Operations.removeDeadStates(automata[1]))); + AutomatonTestUtil.sameLanguage( + naiveLev1(s), Operations.removeDeadStates(automata[1]))); assertTrue( - Operations.sameLanguage(naiveLev1T(s), Operations.removeDeadStates(tautomata[1]))); + AutomatonTestUtil.sameLanguage( + naiveLev1T(s), Operations.removeDeadStates(tautomata[1]))); break; default: assertBruteForce(s, automata[n], n); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java index a43cc8ae8b13..92be1e4d5697 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java @@ -28,7 +28,7 @@ public void testBasic() { Automaton a = AutomatonTestUtil.randomAutomaton(random()); Automaton la = Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE); Automaton lb = MinimizationOperations.minimize(a, Integer.MAX_VALUE); - assertTrue(Operations.sameLanguage(la, lb)); + assertTrue(AutomatonTestUtil.sameLanguage(la, lb)); } } @@ -42,7 +42,7 @@ public void testAgainstBrzozowski() { Automaton a = AutomatonTestUtil.randomAutomaton(random()); a = AutomatonTestUtil.minimizeSimple(a); Automaton b = MinimizationOperations.minimize(a, Integer.MAX_VALUE); - assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); assertEquals(a.getNumStates(), b.getNumStates()); int numStates = a.getNumStates(); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java index 3ae55ac46d2f..c577c1de8de7 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java @@ -32,14 +32,25 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.RamUsageTester; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.IntsRef; +import org.junit.Assert; public class TestNFARunAutomaton extends LuceneTestCase { private static final String FIELD = "field"; + public void testRamUsageEstimation() { + RegExp regExp = new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE); + Automaton nfa = regExp.toAutomaton(); + NFARunAutomaton runAutomaton = new NFARunAutomaton(nfa); + long estimation = runAutomaton.ramBytesUsed(); + long actual = RamUsageTester.ramUsed(runAutomaton); + Assert.assertEquals((double) actual, (double) estimation, (double) actual * 0.3); + } + @SuppressWarnings("unused") public void testWithRandomRegex() { for (int i = 0; i < 100; i++) { diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index ec38eafe0ced..3de263030d64 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -50,7 +50,7 @@ public void testStringUnion() { assertTrue(naiveUnion.isDeterministic()); assertFalse(Operations.hasDeadStatesFromInitial(naiveUnion)); - assertTrue(Operations.sameLanguage(union, naiveUnion)); + 
assertTrue(AutomatonTestUtil.sameLanguage(union, naiveUnion)); } private static Automaton naiveUnion(List strings) { @@ -116,13 +116,13 @@ public void testEmptySingletonNFAConcatenate() { Automaton concat2 = Operations.concatenate(singleton, nfa); assertFalse(concat2.isDeterministic()); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(concat1, 100), Operations.determinize(concat2, 100))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(nfa, 100), Operations.determinize(concat1, 100))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(nfa, 100), Operations.determinize(concat2, 100))); } @@ -173,6 +173,42 @@ public void testIsFiniteEatsStack() { assertTrue(exc.getMessage().contains("input automaton is too large")); } + public void testIsTotal() { + // minimal + assertFalse(Operations.isTotal(Automata.makeEmpty())); + assertFalse(Operations.isTotal(Automata.makeEmptyString())); + assertTrue(Operations.isTotal(Automata.makeAnyString())); + assertTrue(Operations.isTotal(Automata.makeAnyBinary(), 0, 255)); + assertFalse(Operations.isTotal(Automata.makeNonEmptyBinary(), 0, 255)); + // deterministic, but not minimal + assertTrue(Operations.isTotal(Operations.repeat(Automata.makeAnyChar()))); + Automaton tricky = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertTrue(Operations.isTotal(tricky)); + // not total, but close + Automaton tricky2 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT + 1, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertFalse(Operations.isTotal(tricky2)); + Automaton tricky3 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 99), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertFalse(Operations.isTotal(tricky3)); + Automaton tricky4 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT - 1))); + assertFalse(Operations.isTotal(tricky4)); + } + /** * Returns the set of all accepted strings. 
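A recurring change across these automaton tests: language-equivalence and containment checks are now called on the test framework's AutomatonTestUtil (org.apache.lucene.tests.util.automaton.AutomatonTestUtil) rather than Operations. A minimal sketch of the new call pattern (illustrative only; makeString produces deterministic automata, matching how the tests determinize inputs before these checks):

    import static org.junit.Assert.assertTrue;

    import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
    import org.apache.lucene.util.automaton.Automata;
    import org.apache.lucene.util.automaton.Automaton;

    public class SameLanguageSketch {
      public static void main(String[] args) {
        Automaton a = Automata.makeString("foo");
        Automaton b = Automata.makeString("foo");
        assertTrue(AutomatonTestUtil.sameLanguage(a, b)); // equivalence, formerly Operations.sameLanguage
        assertTrue(AutomatonTestUtil.subsetOf(a, b));     // containment, formerly Operations.subsetOf
      }
    }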
* @@ -254,4 +290,192 @@ private Automaton generateRandomAutomaton(boolean hasCycle) { a.finishState(); return a; } + + public void testRepeat() { + Automaton emptyLanguage = Automata.makeEmpty(); + assertSame(emptyLanguage, Operations.repeat(emptyLanguage)); + + Automaton emptyString = Automata.makeEmptyString(); + assertSame(emptyString, Operations.repeat(emptyString)); + + Automaton a = Automata.makeChar('a'); + Automaton as = new Automaton(); + as.createState(); + as.setAccept(0, true); + as.addTransition(0, 0, 'a'); + as.finishState(); + assertTrue(AutomatonTestUtil.sameLanguage(as, Operations.repeat(a))); + assertSame(as, Operations.repeat(as)); + + Automaton aOrEmpty = new Automaton(); + aOrEmpty.createState(); + aOrEmpty.setAccept(0, true); + aOrEmpty.createState(); + aOrEmpty.setAccept(1, true); + aOrEmpty.addTransition(0, 1, 'a'); + assertTrue(AutomatonTestUtil.sameLanguage(as, Operations.repeat(aOrEmpty))); + + Automaton ab = Automata.makeString("ab"); + Automaton abs = new Automaton(); + abs.createState(); + abs.createState(); + abs.setAccept(0, true); + abs.addTransition(0, 1, 'a'); + abs.finishState(); + abs.addTransition(1, 0, 'b'); + abs.finishState(); + assertTrue(AutomatonTestUtil.sameLanguage(abs, Operations.repeat(ab))); + assertSame(abs, Operations.repeat(abs)); + + Automaton absThenC = Operations.concatenate(abs, Automata.makeChar('c')); + Automaton absThenCs = new Automaton(); + absThenCs.createState(); + absThenCs.createState(); + absThenCs.createState(); + absThenCs.setAccept(0, true); + absThenCs.addTransition(0, 1, 'a'); + absThenCs.addTransition(0, 0, 'c'); + absThenCs.finishState(); + absThenCs.addTransition(1, 2, 'b'); + absThenCs.finishState(); + absThenCs.addTransition(2, 1, 'a'); + absThenCs.addTransition(2, 0, 'c'); + absThenCs.finishState(); + assertTrue(AutomatonTestUtil.sameLanguage(absThenCs, Operations.repeat(absThenC))); + assertSame(absThenCs, Operations.repeat(absThenCs)); + + Automaton aOrAb = new Automaton(); + aOrAb.createState(); + aOrAb.createState(); + aOrAb.createState(); + aOrAb.setAccept(1, true); + aOrAb.setAccept(2, true); + aOrAb.addTransition(0, 1, 'a'); + aOrAb.finishState(); + aOrAb.addTransition(1, 2, 'b'); + aOrAb.finishState(); + Automaton aOrAbs = new Automaton(); + aOrAbs.createState(); + aOrAbs.createState(); + aOrAbs.setAccept(0, true); + aOrAbs.addTransition(0, 0, 'a'); + aOrAbs.addTransition(0, 1, 'a'); + aOrAbs.finishState(); + aOrAbs.addTransition(1, 0, 'b'); + aOrAbs.finishState(); + assertTrue( + AutomatonTestUtil.sameLanguage( + Operations.determinize(aOrAbs, Integer.MAX_VALUE), + Operations.determinize(Operations.repeat(aOrAb), Integer.MAX_VALUE))); + } + + public void testDuelRepeat() { + final int iters = atLeast(1_000); + for (int iter = 0; iter < iters; ++iter) { + Automaton a = AutomatonTestUtil.randomAutomaton(random()); + Automaton repeat1 = Operations.determinize(Operations.repeat(a), Integer.MAX_VALUE); + Automaton repeat2 = Operations.determinize(naiveRepeat(a), Integer.MAX_VALUE); + assertTrue(AutomatonTestUtil.sameLanguage(repeat1, repeat2)); + } + } + + // This is the original implementation of Operations#repeat, before we improved it to generate + // simpler automata in some common cases. 
+ private static Automaton naiveRepeat(Automaton a) { + if (a.getNumStates() == 0) { + return a; + } + + Automaton.Builder builder = new Automaton.Builder(); + // Create the initial state, which is accepted + builder.createState(); + builder.setAccept(0, true); + builder.copy(a); + + Transition t = new Transition(); + int count = a.initTransition(0, t); + for (int i = 0; i < count; i++) { + a.getNextTransition(t); + builder.addTransition(0, t.dest + 1, t.min, t.max); + } + + int numStates = a.getNumStates(); + for (int s = 0; s < numStates; s++) { + if (a.isAccept(s)) { + count = a.initTransition(0, t); + for (int i = 0; i < count; i++) { + a.getNextTransition(t); + builder.addTransition(s + 1, t.dest + 1, t.min, t.max); + } + } + } + + return builder.finish(); + } + + public void testOptional() { + Automaton a = Automata.makeChar('a'); + + Automaton optionalA = new Automaton(); + optionalA.createState(); + optionalA.setAccept(0, true); + optionalA.finishState(); + optionalA.createState(); + optionalA.setAccept(1, true); + optionalA.addTransition(0, 1, 'a'); + optionalA.finishState(); + + assertTrue(AutomatonTestUtil.sameLanguage(Operations.optional(a), optionalA)); + assertSame(optionalA, Operations.optional(optionalA)); + + // Now test an automaton that has a transition to state 0. a(ba)* + a = new Automaton(); + a.createState(); + a.createState(); + a.setAccept(1, true); + a.addTransition(0, 1, 'a'); + a.finishState(); + a.addTransition(1, 0, 'b'); + a.finishState(); + + optionalA = new Automaton(); + optionalA.createState(); + optionalA.setAccept(0, true); + optionalA.createState(); + optionalA.createState(); + optionalA.setAccept(2, true); + optionalA.addTransition(0, 2, 'a'); + optionalA.finishState(); + optionalA.addTransition(1, 2, 'a'); + optionalA.finishState(); + optionalA.addTransition(2, 1, 'b'); + optionalA.finishState(); + + assertTrue(AutomatonTestUtil.sameLanguage(Operations.optional(a), optionalA)); + assertSame(optionalA, Operations.optional(optionalA)); + } + + public void testDuelOptional() { + final int iters = atLeast(1_000); + for (int iter = 0; iter < iters; ++iter) { + Automaton a = AutomatonTestUtil.randomAutomaton(random()); + Automaton repeat1 = Operations.determinize(Operations.optional(a), Integer.MAX_VALUE); + Automaton repeat2 = Operations.determinize(naiveOptional(a), Integer.MAX_VALUE); + assertTrue(AutomatonTestUtil.sameLanguage(repeat1, repeat2)); + } + } + + // This is the original implementation of Operations#optional, before we improved it to generate + // simpler automata in some common cases. 
+ private static Automaton naiveOptional(Automaton a) { + Automaton result = new Automaton(); + result.createState(); + result.setAccept(0, true); + if (a.getNumStates() > 0) { + result.copy(a); + result.addEpsilon(0, 1); + } + result.finishState(); + return result; + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java index c934108115d8..1747926295eb 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java @@ -19,6 +19,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.BytesRef; public class TestRegExp extends LuceneTestCase { @@ -259,4 +260,19 @@ protected String checkRandomExpression(String docValue) { public void testRegExpNoStackOverflow() { new RegExp("(a)|".repeat(50000) + "(a)"); } + + /** + * Tests the deprecated complement flag. Keep the simple test only, no random tests to let it cause + * us pain. + * + * @deprecated Remove in Lucene 11 + */ + @Deprecated + public void testDeprecatedComplement() { + Automaton expected = + Operations.complement( + Automata.makeString("abcd"), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + Automaton actual = new RegExp("~(abcd)", RegExp.DEPRECATED_COMPLEMENT).toAutomaton(); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java index 7d0f062f36bc..74fb08cb7188 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java @@ -20,6 +20,7 @@ import java.util.Map; import java.util.Set; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; /** * Simple unit tests for RegExp parsing.
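The testDeprecatedComplement addition above shows that the old ~ complement operator is now opt-in via a RegExp syntax flag. A compact sketch of the equivalence the test asserts, using only calls that appear in this patch (illustrative only):

    import org.apache.lucene.util.automaton.Automata;
    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.Operations;
    import org.apache.lucene.util.automaton.RegExp;

    public class DeprecatedComplementSketch {
      public static void main(String[] args) {
        // "~(abcd)" only parses as a complement when the DEPRECATED_COMPLEMENT flag is passed.
        Automaton viaRegExp = new RegExp("~(abcd)", RegExp.DEPRECATED_COMPLEMENT).toAutomaton();
        // The expected language: every string except "abcd".
        Automaton viaOperations =
            Operations.complement(
                Automata.makeString("abcd"), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
        System.out.println(viaRegExp.getNumStates() + " " + viaOperations.getNumStates());
      }
    }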
@@ -698,7 +699,7 @@ public void testIllegalMatchFlags() { private void assertSameLanguage(Automaton expected, Automaton actual) { expected = Operations.determinize(expected, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); actual = Operations.determinize(actual, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - boolean result = Operations.sameLanguage(expected, actual); + boolean result = AutomatonTestUtil.sameLanguage(expected, actual); if (result == false) { System.out.println(expected.toDot()); System.out.println(actual.toDot()); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index 0e5a3f9fc30d..efaa451258bb 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -28,6 +28,7 @@ import java.util.Set; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -158,7 +159,7 @@ private void checkMinimized(Automaton a) { private static void assertSameAutomaton(Automaton a, Automaton b) { assertEquals(a.getNumStates(), b.getNumStates()); assertEquals(a.getNumTransitions(), b.getNumTransitions()); - assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); } private List basicTerms() { diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java index 2b88c8d05418..557b9afdfdaa 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java @@ -954,22 +954,22 @@ public void visit(int docID) { @Override public void visit(int docID, byte[] packedValue) { // System.out.println("visit check docID=" + docID); - for (int dim = 0; dim < config.numIndexDims; dim++) { + for (int dim = 0; dim < config.numIndexDims(); dim++) { if (Arrays.compareUnsigned( packedValue, - dim * config.bytesPerDim, - dim * config.bytesPerDim + config.bytesPerDim, + dim * config.bytesPerDim(), + dim * config.bytesPerDim() + config.bytesPerDim(), queryMin[dim], 0, - config.bytesPerDim) + config.bytesPerDim()) < 0 || Arrays.compareUnsigned( packedValue, - dim * config.bytesPerDim, - dim * config.bytesPerDim + config.bytesPerDim, + dim * config.bytesPerDim(), + dim * config.bytesPerDim() + config.bytesPerDim(), queryMax[dim], 0, - config.bytesPerDim) + config.bytesPerDim()) > 0) { // System.out.println(" no"); return; @@ -1005,39 +1005,39 @@ public void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOExcept @Override public Relation compare(byte[] minPacked, byte[] maxPacked) { boolean crosses = false; - for (int dim = 0; dim < config.numIndexDims; dim++) { + for (int dim = 0; dim < config.numIndexDims(); dim++) { if (Arrays.compareUnsigned( maxPacked, - dim * config.bytesPerDim, - dim * config.bytesPerDim + config.bytesPerDim, + dim * config.bytesPerDim(), + dim * config.bytesPerDim() + config.bytesPerDim(), queryMin[dim], 0, - config.bytesPerDim) + config.bytesPerDim()) < 0 || Arrays.compareUnsigned( minPacked, - dim * config.bytesPerDim, - dim * config.bytesPerDim + config.bytesPerDim, + dim * config.bytesPerDim(), + dim * config.bytesPerDim() + 
config.bytesPerDim(), queryMax[dim], 0, - config.bytesPerDim) + config.bytesPerDim()) > 0) { return Relation.CELL_OUTSIDE_QUERY; } else if (Arrays.compareUnsigned( minPacked, - dim * config.bytesPerDim, - dim * config.bytesPerDim + config.bytesPerDim, + dim * config.bytesPerDim(), + dim * config.bytesPerDim() + config.bytesPerDim(), queryMin[dim], 0, - config.bytesPerDim) + config.bytesPerDim()) < 0 || Arrays.compareUnsigned( maxPacked, - dim * config.bytesPerDim, - dim * config.bytesPerDim + config.bytesPerDim, + dim * config.bytesPerDim(), + dim * config.bytesPerDim() + config.bytesPerDim(), queryMax[dim], 0, - config.bytesPerDim) + config.bytesPerDim()) > 0) { crosses = true; } diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDConfig.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDConfig.java new file mode 100644 index 000000000000..6a137009ca7e --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDConfig.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.bkd; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.ArrayUtil; + +public class TestBKDConfig extends LuceneTestCase { + + public void testInvalidNumDims() { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> new BKDConfig(0, 0, 8, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE)); + assertTrue(ex.getMessage().contains("numDims must be 1 .. " + BKDConfig.MAX_DIMS)); + } + + public void testInvalidNumIndexedDims() { + { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> new BKDConfig(1, 0, 8, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE)); + assertTrue(ex.getMessage().contains("numIndexDims must be 1 .. 
" + BKDConfig.MAX_INDEX_DIMS)); + } + { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> new BKDConfig(1, 2, 8, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE)); + assertTrue(ex.getMessage().contains("numIndexDims cannot exceed numDims")); + } + } + + public void testInvalidBytesPerDim() { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> new BKDConfig(1, 1, 0, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE)); + assertTrue(ex.getMessage().contains("bytesPerDim must be > 0")); + } + + public void testInvalidMaxPointsPerLeafNode() { + { + IllegalArgumentException ex = + expectThrows(IllegalArgumentException.class, () -> new BKDConfig(1, 1, 8, -1)); + assertTrue(ex.getMessage().contains("maxPointsInLeafNode must be > 0")); + } + { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> new BKDConfig(1, 1, 8, ArrayUtil.MAX_ARRAY_LENGTH + 1)); + assertTrue( + ex.getMessage().contains("maxPointsInLeafNode must be <= ArrayUtil.MAX_ARRAY_LENGTH")); + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java index 807daef88d42..977e68b7156a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java @@ -37,7 +37,7 @@ public void testBasic() throws IOException { new BKDConfig( dimensions, dimensions, bytesPerDimensions, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE); PointWriter points = getRandomPointWriter(config, dir, values); - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; NumericUtils.intToSortableBytes(1, value, 0); points.append(value, 0); NumericUtils.intToSortableBytes(2, value, 0); @@ -81,7 +81,7 @@ private void doTestRandomBinary(int count) throws IOException { int partitionPoint = TestUtil.nextInt(random(), start + 1, end - 1); int sortedOnHeap = random().nextInt(5000); PointWriter points = getRandomPointWriter(config, dir, values); - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; for (int i = 0; i < values; i++) { random().nextBytes(value); points.append(value, i); @@ -102,7 +102,7 @@ public void testRandomAllDimensionsEquals() throws IOException { int partitionPoint = random().nextInt(values); int sortedOnHeap = random().nextInt(5000); PointWriter points = getRandomPointWriter(config, dir, values); - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; random().nextBytes(value); for (int i = 0; i < values; i++) { if (random().nextBoolean()) { @@ -123,7 +123,7 @@ public void testRandomLastByteTwoValues() throws IOException { int sortedOnHeap = random().nextInt(5000); BKDConfig config = getRandomConfig(); PointWriter points = getRandomPointWriter(config, dir, values); - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; random().nextBytes(value); for (int i = 0; i < values; i++) { if (random().nextBoolean()) { @@ -144,7 +144,7 @@ public void testRandomAllDocsEquals() throws IOException { int sortedOnHeap = random().nextInt(5000); BKDConfig config = getRandomConfig(); PointWriter points = getRandomPointWriter(config, dir, values); - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; 
random().nextBytes(value); for (int i = 0; i < values; i++) { points.append(value, 0); @@ -162,7 +162,7 @@ public void testRandomFewDifferentValues() throws IOException { int sortedOnHeap = random().nextInt(5000); PointWriter points = getRandomPointWriter(config, dir, values); int numberValues = random().nextInt(8) + 2; - byte[][] differentValues = new byte[numberValues][config.packedBytesLength]; + byte[][] differentValues = new byte[numberValues][config.packedBytesLength()]; for (int i = 0; i < numberValues; i++) { random().nextBytes(differentValues[i]); } @@ -181,9 +181,9 @@ public void testRandomDataDimDiffValues() throws IOException { int partitionPoint = random().nextInt(values); int sortedOnHeap = random().nextInt(5000); PointWriter points = getRandomPointWriter(config, dir, values); - byte[] value = new byte[config.packedBytesLength]; - int dataOnlyDims = config.numDims - config.numIndexDims; - byte[] dataValue = new byte[dataOnlyDims * config.bytesPerDim]; + byte[] value = new byte[config.packedBytesLength()]; + int dataOnlyDims = config.numDims() - config.numIndexDims(); + byte[] dataValue = new byte[dataOnlyDims * config.bytesPerDim()]; random().nextBytes(value); for (int i = 0; i < values; i++) { random().nextBytes(dataValue); @@ -191,8 +191,8 @@ public void testRandomDataDimDiffValues() throws IOException { dataValue, 0, value, - config.numIndexDims * config.bytesPerDim, - dataOnlyDims * config.bytesPerDim); + config.numIndexDims() * config.bytesPerDim(), + dataOnlyDims * config.bytesPerDim()); points.append(value, i); } points.close(); @@ -210,9 +210,9 @@ private void verify( int sortedOnHeap) throws IOException { BKDRadixSelector radixSelector = new BKDRadixSelector(config, sortedOnHeap, dir, "test"); - int dataOnlyDims = config.numDims - config.numIndexDims; + int dataOnlyDims = config.numDims() - config.numIndexDims(); // we only split by indexed dimension so we check for each only those dimension - for (int splitDim = 0; splitDim < config.numIndexDims; splitDim++) { + for (int splitDim = 0; splitDim < config.numIndexDims(); splitDim++) { // We need to make a copy of the data as it is deleted in the process BKDRadixSelector.PathSlice inputSlice = new BKDRadixSelector.PathSlice(copyPoints(config, dir, points), 0, points.count()); @@ -221,12 +221,12 @@ private void verify( byte[] partitionPoint = radixSelector.select( inputSlice, slices, start, end, middle, splitDim, commonPrefixLengthInput); - assertEquals(middle - start, slices[0].count); - assertEquals(end - middle, slices[1].count); + assertEquals(middle - start, slices[0].count()); + assertEquals(end - middle, slices[1].count()); // check that left and right slices contain the correct points byte[] max = getMax(config, slices[0], splitDim); byte[] min = getMin(config, slices[1], splitDim); - int cmp = Arrays.compareUnsigned(max, 0, config.bytesPerDim, min, 0, config.bytesPerDim); + int cmp = Arrays.compareUnsigned(max, 0, config.bytesPerDim(), min, 0, config.bytesPerDim()); assertTrue(cmp <= 0); if (cmp == 0) { byte[] maxDataDim = getMaxDataDimension(config, slices[0], max, splitDim); @@ -235,10 +235,10 @@ private void verify( Arrays.compareUnsigned( maxDataDim, 0, - dataOnlyDims * config.bytesPerDim, + dataOnlyDims * config.bytesPerDim(), minDataDim, 0, - dataOnlyDims * config.bytesPerDim); + dataOnlyDims * config.bytesPerDim()); assertTrue(cmp <= 0); if (cmp == 0) { int maxDocID = getMaxDocId(config, slices[0], splitDim, partitionPoint, maxDataDim); @@ -247,8 +247,8 @@ private void verify( } } 
assertTrue(Arrays.equals(partitionPoint, min)); - slices[0].writer.destroy(); - slices[1].writer.destroy(); + slices[0].writer().destroy(); + slices[1].writer().destroy(); } points.destroy(); } @@ -270,9 +270,9 @@ private int getRandomCommonPrefix( byte[] pointsMax = getMax(config, inputSlice, splitDim); byte[] pointsMin = getMin(config, inputSlice, splitDim); int commonPrefixLength = - Arrays.mismatch(pointsMin, 0, config.bytesPerDim, pointsMax, 0, config.bytesPerDim); + Arrays.mismatch(pointsMin, 0, config.bytesPerDim(), pointsMax, 0, config.bytesPerDim()); if (commonPrefixLength == -1) { - commonPrefixLength = config.bytesPerDim; + commonPrefixLength = config.bytesPerDim(); } return (random().nextBoolean()) ? commonPrefixLength @@ -300,22 +300,23 @@ private Directory getDirectory(int numPoints) { private byte[] getMin(BKDConfig config, BKDRadixSelector.PathSlice pathSlice, int dimension) throws IOException { - byte[] min = new byte[config.bytesPerDim]; + byte[] min = new byte[config.bytesPerDim()]; Arrays.fill(min, (byte) 0xff); - try (PointReader reader = pathSlice.writer.getReader(pathSlice.start, pathSlice.count)) { - byte[] value = new byte[config.bytesPerDim]; + try (PointReader reader = pathSlice.writer().getReader(pathSlice.start(), pathSlice.count())) { + byte[] value = new byte[config.bytesPerDim()]; while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); System.arraycopy( packedValue.bytes, - packedValue.offset + dimension * config.bytesPerDim, + packedValue.offset + dimension * config.bytesPerDim(), value, 0, - config.bytesPerDim); - if (Arrays.compareUnsigned(min, 0, config.bytesPerDim, value, 0, config.bytesPerDim) > 0) { - System.arraycopy(value, 0, min, 0, config.bytesPerDim); + config.bytesPerDim()); + if (Arrays.compareUnsigned(min, 0, config.bytesPerDim(), value, 0, config.bytesPerDim()) + > 0) { + System.arraycopy(value, 0, min, 0, config.bytesPerDim()); } } } @@ -330,20 +331,20 @@ private int getMinDocId( byte[] dataDim) throws IOException { int docID = Integer.MAX_VALUE; - try (PointReader reader = p.writer.getReader(p.start, p.count)) { + try (PointReader reader = p.writer().getReader(p.start(), p.count())) { while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); - int offset = dimension * config.bytesPerDim; - int dataOffset = config.packedIndexBytesLength; - int dataLength = (config.numDims - config.numIndexDims) * config.bytesPerDim; + int offset = dimension * config.bytesPerDim(); + int dataOffset = config.packedIndexBytesLength(); + int dataLength = (config.numDims() - config.numIndexDims()) * config.bytesPerDim(); if (Arrays.compareUnsigned( packedValue.bytes, packedValue.offset + offset, - packedValue.offset + offset + config.bytesPerDim, + packedValue.offset + offset + config.bytesPerDim(), partitionPoint, 0, - config.bytesPerDim) + config.bytesPerDim()) == 0 && Arrays.compareUnsigned( packedValue.bytes, @@ -366,38 +367,38 @@ private int getMinDocId( private byte[] getMinDataDimension( BKDConfig config, BKDRadixSelector.PathSlice p, byte[] minDim, int splitDim) throws IOException { - final int numDataDims = config.numDims - config.numIndexDims; - byte[] min = new byte[numDataDims * config.bytesPerDim]; + final int numDataDims = config.numDims() - config.numIndexDims(); + byte[] min = new byte[numDataDims * config.bytesPerDim()]; Arrays.fill(min, (byte) 0xff); - int offset = splitDim * config.bytesPerDim; - try (PointReader reader = 
p.writer.getReader(p.start, p.count)) { - byte[] value = new byte[numDataDims * config.bytesPerDim]; + int offset = splitDim * config.bytesPerDim(); + try (PointReader reader = p.writer().getReader(p.start(), p.count())) { + byte[] value = new byte[numDataDims * config.bytesPerDim()]; while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); if (Arrays.mismatch( minDim, 0, - config.bytesPerDim, + config.bytesPerDim(), packedValue.bytes, packedValue.offset + offset, - packedValue.offset + offset + config.bytesPerDim) + packedValue.offset + offset + config.bytesPerDim()) == -1) { System.arraycopy( packedValue.bytes, - packedValue.offset + config.numIndexDims * config.bytesPerDim, + packedValue.offset + config.numIndexDims() * config.bytesPerDim(), value, 0, - numDataDims * config.bytesPerDim); + numDataDims * config.bytesPerDim()); if (Arrays.compareUnsigned( min, 0, - numDataDims * config.bytesPerDim, + numDataDims * config.bytesPerDim(), value, 0, - numDataDims * config.bytesPerDim) + numDataDims * config.bytesPerDim()) > 0) { - System.arraycopy(value, 0, min, 0, numDataDims * config.bytesPerDim); + System.arraycopy(value, 0, min, 0, numDataDims * config.bytesPerDim()); } } } @@ -407,21 +408,22 @@ private byte[] getMinDataDimension( private byte[] getMax(BKDConfig config, BKDRadixSelector.PathSlice p, int dimension) throws IOException { - byte[] max = new byte[config.bytesPerDim]; + byte[] max = new byte[config.bytesPerDim()]; Arrays.fill(max, (byte) 0); - try (PointReader reader = p.writer.getReader(p.start, p.count)) { - byte[] value = new byte[config.bytesPerDim]; + try (PointReader reader = p.writer().getReader(p.start(), p.count())) { + byte[] value = new byte[config.bytesPerDim()]; while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); System.arraycopy( packedValue.bytes, - packedValue.offset + dimension * config.bytesPerDim, + packedValue.offset + dimension * config.bytesPerDim(), value, 0, - config.bytesPerDim); - if (Arrays.compareUnsigned(max, 0, config.bytesPerDim, value, 0, config.bytesPerDim) < 0) { - System.arraycopy(value, 0, max, 0, config.bytesPerDim); + config.bytesPerDim()); + if (Arrays.compareUnsigned(max, 0, config.bytesPerDim(), value, 0, config.bytesPerDim()) + < 0) { + System.arraycopy(value, 0, max, 0, config.bytesPerDim()); } } } @@ -431,38 +433,38 @@ private byte[] getMax(BKDConfig config, BKDRadixSelector.PathSlice p, int dimens private byte[] getMaxDataDimension( BKDConfig config, BKDRadixSelector.PathSlice p, byte[] maxDim, int splitDim) throws IOException { - final int numDataDims = config.numDims - config.numIndexDims; - byte[] max = new byte[numDataDims * config.bytesPerDim]; + final int numDataDims = config.numDims() - config.numIndexDims(); + byte[] max = new byte[numDataDims * config.bytesPerDim()]; Arrays.fill(max, (byte) 0); - int offset = splitDim * config.bytesPerDim; - try (PointReader reader = p.writer.getReader(p.start, p.count)) { - byte[] value = new byte[numDataDims * config.bytesPerDim]; + int offset = splitDim * config.bytesPerDim(); + try (PointReader reader = p.writer().getReader(p.start(), p.count())) { + byte[] value = new byte[numDataDims * config.bytesPerDim()]; while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); if (Arrays.mismatch( maxDim, 0, - config.bytesPerDim, + config.bytesPerDim(), packedValue.bytes, packedValue.offset + offset, - 
packedValue.offset + offset + config.bytesPerDim) + packedValue.offset + offset + config.bytesPerDim()) == -1) { System.arraycopy( packedValue.bytes, - packedValue.offset + config.packedIndexBytesLength, + packedValue.offset + config.packedIndexBytesLength(), value, 0, - numDataDims * config.bytesPerDim); + numDataDims * config.bytesPerDim()); if (Arrays.compareUnsigned( max, 0, - numDataDims * config.bytesPerDim, + numDataDims * config.bytesPerDim(), value, 0, - numDataDims * config.bytesPerDim) + numDataDims * config.bytesPerDim()) < 0) { - System.arraycopy(value, 0, max, 0, numDataDims * config.bytesPerDim); + System.arraycopy(value, 0, max, 0, numDataDims * config.bytesPerDim()); } } } @@ -478,20 +480,20 @@ private int getMaxDocId( byte[] dataDim) throws IOException { int docID = Integer.MIN_VALUE; - try (PointReader reader = p.writer.getReader(p.start, p.count)) { + try (PointReader reader = p.writer().getReader(p.start(), p.count())) { while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); - int offset = dimension * config.bytesPerDim; - int dataOffset = config.packedIndexBytesLength; - int dataLength = (config.numDims - config.numIndexDims) * config.bytesPerDim; + int offset = dimension * config.bytesPerDim(); + int dataOffset = config.packedIndexBytesLength(); + int dataLength = (config.numDims() - config.numIndexDims()) * config.bytesPerDim(); if (Arrays.compareUnsigned( packedValue.bytes, packedValue.offset + offset, - packedValue.offset + offset + config.bytesPerDim, + packedValue.offset + offset + config.bytesPerDim(), partitionPoint, 0, - config.bytesPerDim) + config.bytesPerDim()) == 0 && Arrays.compareUnsigned( packedValue.bytes, diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java index 807f2304aaa2..bc62d69f0848 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java @@ -30,7 +30,7 @@ public void testRandom() throws IOException { BKDConfig config = getRandomConfig(); int numPoints = TestUtil.nextInt(random(), 1, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE); HeapPointWriter points = new HeapPointWriter(config, numPoints); - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; for (int i = 0; i < numPoints; i++) { random().nextBytes(value); points.append(value, i); @@ -42,7 +42,7 @@ public void testRandomAllEquals() throws IOException { BKDConfig config = getRandomConfig(); int numPoints = TestUtil.nextInt(random(), 1, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE); HeapPointWriter points = new HeapPointWriter(config, numPoints); - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; random().nextBytes(value); for (int i = 0; i < numPoints; i++) { points.append(value, random().nextInt(numPoints)); @@ -54,7 +54,7 @@ public void testRandomLastByteTwoValues() throws IOException { BKDConfig config = getRandomConfig(); int numPoints = TestUtil.nextInt(random(), 1, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE); HeapPointWriter points = new HeapPointWriter(config, numPoints); - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; random().nextBytes(value); for (int i = 0; i < numPoints; i++) { if (random().nextBoolean()) { @@ -71,7 +71,7 @@ public void 
testRandomFewDifferentValues() throws IOException { int numPoints = TestUtil.nextInt(random(), 1, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE); HeapPointWriter points = new HeapPointWriter(config, numPoints); int numberValues = random().nextInt(8) + 2; - byte[][] differentValues = new byte[numberValues][config.packedBytesLength]; + byte[][] differentValues = new byte[numberValues][config.packedBytesLength()]; for (int i = 0; i < numberValues; i++) { random().nextBytes(differentValues[i]); } @@ -85,9 +85,9 @@ public void testRandomDataDimDifferent() throws IOException { BKDConfig config = getRandomConfig(); int numPoints = TestUtil.nextInt(random(), 1, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE); HeapPointWriter points = new HeapPointWriter(config, numPoints); - byte[] value = new byte[config.packedBytesLength]; - int totalDataDimension = config.numDims - config.numIndexDims; - byte[] dataDimensionValues = new byte[totalDataDimension * config.bytesPerDim]; + byte[] value = new byte[config.packedBytesLength()]; + int totalDataDimension = config.numDims() - config.numIndexDims(); + byte[] dataDimensionValues = new byte[totalDataDimension * config.bytesPerDim()]; random().nextBytes(value); for (int i = 0; i < numPoints; i++) { random().nextBytes(dataDimensionValues); @@ -95,8 +95,8 @@ public void testRandomDataDimDifferent() throws IOException { dataDimensionValues, 0, value, - config.packedIndexBytesLength, - totalDataDimension * config.bytesPerDim); + config.packedIndexBytesLength(), + totalDataDimension * config.bytesPerDim()); points.append(value, random().nextInt(numPoints)); } verifySort(config, points, 0, numPoints); @@ -107,17 +107,17 @@ private void verifySort(BKDConfig config, HeapPointWriter points, int start, int Directory dir = newDirectory(); BKDRadixSelector radixSelector = new BKDRadixSelector(config, 1000, dir, "test"); // we check for each dimension - for (int splitDim = 0; splitDim < config.numDims; splitDim++) { + for (int splitDim = 0; splitDim < config.numDims(); splitDim++) { radixSelector.heapRadixSort( points, start, end, splitDim, getRandomCommonPrefix(config, points, start, end, splitDim)); - byte[] previous = new byte[config.packedBytesLength]; + byte[] previous = new byte[config.packedBytesLength()]; int previousDocId = -1; Arrays.fill(previous, (byte) 0); - int dimOffset = splitDim * config.bytesPerDim; + int dimOffset = splitDim * config.bytesPerDim(); for (int j = start; j < end; j++) { PointValue pointValue = points.getPackedValueSlice(j); BytesRef value = pointValue.packedValue(); @@ -125,27 +125,27 @@ private void verifySort(BKDConfig config, HeapPointWriter points, int start, int Arrays.compareUnsigned( value.bytes, value.offset + dimOffset, - value.offset + dimOffset + config.bytesPerDim, + value.offset + dimOffset + config.bytesPerDim(), previous, dimOffset, - dimOffset + config.bytesPerDim); + dimOffset + config.bytesPerDim()); assertTrue(cmp >= 0); if (cmp == 0) { - int dataOffset = config.numIndexDims * config.bytesPerDim; + int dataOffset = config.numIndexDims() * config.bytesPerDim(); cmp = Arrays.compareUnsigned( value.bytes, value.offset + dataOffset, - value.offset + config.packedBytesLength, + value.offset + config.packedBytesLength(), previous, dataOffset, - config.packedBytesLength); + config.packedBytesLength()); assertTrue(cmp >= 0); } if (cmp == 0) { assertTrue(pointValue.docID() >= previousDocId); } - System.arraycopy(value.bytes, value.offset, previous, 0, config.packedBytesLength); + System.arraycopy(value.bytes, value.offset, previous, 0, 
config.packedBytesLength()); previousDocId = pointValue.docID(); } } @@ -155,12 +155,12 @@ private void verifySort(BKDConfig config, HeapPointWriter points, int start, int /** returns a common prefix length equal or lower than the current one */ private int getRandomCommonPrefix( BKDConfig config, HeapPointWriter points, int start, int end, int sortDim) { - int commonPrefixLength = config.bytesPerDim; + int commonPrefixLength = config.bytesPerDim(); PointValue value = points.getPackedValueSlice(start); BytesRef bytesRef = value.packedValue(); - byte[] firstValue = new byte[config.bytesPerDim]; - int offset = sortDim * config.bytesPerDim; - System.arraycopy(bytesRef.bytes, bytesRef.offset + offset, firstValue, 0, config.bytesPerDim); + byte[] firstValue = new byte[config.bytesPerDim()]; + int offset = sortDim * config.bytesPerDim(); + System.arraycopy(bytesRef.bytes, bytesRef.offset + offset, firstValue, 0, config.bytesPerDim()); for (int i = start + 1; i < end; i++) { value = points.getPackedValueSlice(i); bytesRef = value.packedValue(); @@ -168,10 +168,10 @@ private int getRandomCommonPrefix( Arrays.mismatch( bytesRef.bytes, bytesRef.offset + offset, - bytesRef.offset + offset + config.bytesPerDim, + bytesRef.offset + offset + config.bytesPerDim(), firstValue, 0, - config.bytesPerDim); + config.bytesPerDim()); if (diff != -1 && commonPrefixLength > diff) { if (diff == 0) { return diff; diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointTreeReaderUtils.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointTreeReaderUtils.java index 8c468bf5478a..9dbe2ee210a8 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointTreeReaderUtils.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointTreeReaderUtils.java @@ -87,10 +87,10 @@ public void testSortByDim() { private void doTestSortByDim() { BKDConfig config = createRandomConfig(); final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30)); - int[] commonPrefixLengths = new int[config.numDims]; + int[] commonPrefixLengths = new int[config.numDims()]; Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths, false); DummyPointsReader reader = new DummyPointsReader(points); - final int sortedDim = random().nextInt(config.numIndexDims); + final int sortedDim = random().nextInt(config.numIndexDims()); MutablePointTreeReaderUtils.sortByDim( config, sortedDim, @@ -101,20 +101,20 @@ private void doTestSortByDim() { new BytesRef(), new BytesRef()); for (int i = 1; i < points.length; ++i) { - final int offset = sortedDim * config.bytesPerDim; + final int offset = sortedDim * config.bytesPerDim(); BytesRef previousValue = reader.points[i - 1].packedValue; BytesRef currentValue = reader.points[i].packedValue; int cmp = Arrays.compareUnsigned( previousValue.bytes, previousValue.offset + offset, - previousValue.offset + offset + config.bytesPerDim, + previousValue.offset + offset + config.bytesPerDim(), currentValue.bytes, currentValue.offset + offset, - currentValue.offset + offset + config.bytesPerDim); + currentValue.offset + offset + config.bytesPerDim()); if (cmp == 0) { - int dataDimOffset = config.packedIndexBytesLength; - int dataDimsLength = (config.numDims - config.numIndexDims) * config.bytesPerDim; + int dataDimOffset = config.packedIndexBytesLength(); + int dataDimsLength = (config.numDims() - config.numIndexDims()) * config.bytesPerDim(); cmp = Arrays.compareUnsigned( previousValue.bytes, @@ -139,10 +139,10 @@ public void 
testPartition() { private void doTestPartition() { BKDConfig config = createRandomConfig(); - int[] commonPrefixLengths = new int[config.numDims]; + int[] commonPrefixLengths = new int[config.numDims()]; final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30)); Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths, false); - final int splitDim = random().nextInt(config.numIndexDims); + final int splitDim = random().nextInt(config.numIndexDims()); DummyPointsReader reader = new DummyPointsReader(points); final int pivot = TestUtil.nextInt(random(), 0, points.length - 1); MutablePointTreeReaderUtils.partition( @@ -157,20 +157,20 @@ private void doTestPartition() { new BytesRef(), new BytesRef()); BytesRef pivotValue = reader.points[pivot].packedValue; - int offset = splitDim * config.bytesPerDim; + int offset = splitDim * config.bytesPerDim(); for (int i = 0; i < points.length; ++i) { BytesRef value = reader.points[i].packedValue; int cmp = Arrays.compareUnsigned( value.bytes, value.offset + offset, - value.offset + offset + config.bytesPerDim, + value.offset + offset + config.bytesPerDim(), pivotValue.bytes, pivotValue.offset + offset, - pivotValue.offset + offset + config.bytesPerDim); + pivotValue.offset + offset + config.bytesPerDim()); if (cmp == 0) { - int dataDimOffset = config.packedIndexBytesLength; - int dataDimsLength = (config.numDims - config.numIndexDims) * config.bytesPerDim; + int dataDimOffset = config.packedIndexBytesLength(); + int dataDimsLength = (config.numDims() - config.numIndexDims()) * config.bytesPerDim(); cmp = Arrays.compareUnsigned( value.bytes, @@ -203,24 +203,24 @@ private static BKDConfig createRandomConfig() { private static Point[] createRandomPoints( BKDConfig config, int maxDoc, int[] commonPrefixLengths, boolean isDocIdIncremental) { - assertTrue(commonPrefixLengths.length == config.numDims); + assertTrue(commonPrefixLengths.length == config.numDims()); final int numPoints = TestUtil.nextInt(random(), 1, 100000); Point[] points = new Point[numPoints]; if (random().nextInt(10) != 0) { for (int i = 0; i < numPoints; ++i) { - byte[] value = new byte[config.packedBytesLength]; + byte[] value = new byte[config.packedBytesLength()]; random().nextBytes(value); points[i] = new Point( value, isDocIdIncremental ? 
Math.min(i, maxDoc - 1) : random().nextInt(maxDoc)); } - for (int i = 0; i < config.numDims; ++i) { - commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, config.bytesPerDim); + for (int i = 0; i < config.numDims(); ++i) { + commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, config.bytesPerDim()); } BytesRef firstValue = points[0].packedValue; for (int i = 1; i < points.length; ++i) { - for (int dim = 0; dim < config.numDims; ++dim) { - int offset = dim * config.bytesPerDim; + for (int dim = 0; dim < config.numDims(); ++dim) { + int offset = dim * config.bytesPerDim(); BytesRef packedValue = points[i].packedValue; System.arraycopy( firstValue.bytes, @@ -232,30 +232,34 @@ private static Point[] createRandomPoints( } } else { // index dim are equal, data dims different - int numDataDims = config.numDims - config.numIndexDims; - byte[] indexDims = new byte[config.packedIndexBytesLength]; + int numDataDims = config.numDims() - config.numIndexDims(); + byte[] indexDims = new byte[config.packedIndexBytesLength()]; random().nextBytes(indexDims); - byte[] dataDims = new byte[numDataDims * config.bytesPerDim]; + byte[] dataDims = new byte[numDataDims * config.bytesPerDim()]; for (int i = 0; i < numPoints; ++i) { - byte[] value = new byte[config.packedBytesLength]; - System.arraycopy(indexDims, 0, value, 0, config.packedIndexBytesLength); + byte[] value = new byte[config.packedBytesLength()]; + System.arraycopy(indexDims, 0, value, 0, config.packedIndexBytesLength()); random().nextBytes(dataDims); System.arraycopy( - dataDims, 0, value, config.packedIndexBytesLength, numDataDims * config.bytesPerDim); + dataDims, + 0, + value, + config.packedIndexBytesLength(), + numDataDims * config.bytesPerDim()); points[i] = new Point( value, isDocIdIncremental ? Math.min(i, maxDoc - 1) : random().nextInt(maxDoc)); } - for (int i = 0; i < config.numIndexDims; ++i) { - commonPrefixLengths[i] = config.bytesPerDim; + for (int i = 0; i < config.numIndexDims(); ++i) { + commonPrefixLengths[i] = config.bytesPerDim(); } - for (int i = config.numIndexDims; i < config.numDims; ++i) { - commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, config.bytesPerDim); + for (int i = config.numIndexDims(); i < config.numDims(); ++i) { + commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, config.bytesPerDim()); } BytesRef firstValue = points[0].packedValue; for (int i = 1; i < points.length; ++i) { - for (int dim = config.numIndexDims; dim < config.numDims; ++dim) { - int offset = dim * config.bytesPerDim; + for (int dim = config.numIndexDims(); dim < config.numDims(); ++dim) { + int offset = dim * config.bytesPerDim(); BytesRef packedValue = points[i].packedValue; System.arraycopy( firstValue.bytes, diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestNodeHash.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTSuffixNodeCache.java similarity index 84% rename from lucene/core/src/test/org/apache/lucene/util/fst/TestNodeHash.java rename to lucene/core/src/test/org/apache/lucene/util/fst/TestFSTSuffixNodeCache.java index 8319f20efea3..2fb93c7f5a46 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestNodeHash.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTSuffixNodeCache.java @@ -19,14 +19,16 @@ import com.carrotsearch.randomizedtesting.generators.RandomBytes; import org.apache.lucene.tests.util.LuceneTestCase; -public class TestNodeHash extends LuceneTestCase { +public class TestFSTSuffixNodeCache extends LuceneTestCase { public void testCopyFallbackNodeBytes() { // we 
don't need the FSTCompiler in this test - NodeHash nodeHash = new NodeHash<>(null, 1); + FSTSuffixNodeCache suffixCache = new FSTSuffixNodeCache<>(null, 1); - NodeHash.PagedGrowableHash primaryHashTable = nodeHash.new PagedGrowableHash(); - NodeHash.PagedGrowableHash fallbackHashTable = nodeHash.new PagedGrowableHash(); + FSTSuffixNodeCache.PagedGrowableHash primaryHashTable = + suffixCache.new PagedGrowableHash(); + FSTSuffixNodeCache.PagedGrowableHash fallbackHashTable = + suffixCache.new PagedGrowableHash(); int nodeLength = atLeast(500); long fallbackHashSlot = 1; byte[] fallbackBytes = RandomBytes.randomBytesOfLength(random(), nodeLength); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 9d09f075b726..e83e4f1ec047 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -1350,14 +1350,14 @@ public void testShortestPaths() throws Exception { true); assertTrue(res.isComplete); assertEquals(3, res.topN.size()); - assertEquals(Util.toIntsRef(newBytesRef("aac"), scratch), res.topN.get(0).input); - assertEquals(7L, res.topN.get(0).output.longValue()); + assertEquals(Util.toIntsRef(newBytesRef("aac"), scratch), res.topN.get(0).input()); + assertEquals(7L, res.topN.get(0).output().longValue()); - assertEquals(Util.toIntsRef(newBytesRef("ax"), scratch), res.topN.get(1).input); - assertEquals(17L, res.topN.get(1).output.longValue()); + assertEquals(Util.toIntsRef(newBytesRef("ax"), scratch), res.topN.get(1).input()); + assertEquals(17L, res.topN.get(1).output().longValue()); - assertEquals(Util.toIntsRef(newBytesRef("aab"), scratch), res.topN.get(2).input); - assertEquals(22L, res.topN.get(2).output.longValue()); + assertEquals(Util.toIntsRef(newBytesRef("aab"), scratch), res.topN.get(2).input()); + assertEquals(22L, res.topN.get(2).output().longValue()); } public void testRejectNoLimits() throws IOException { @@ -1393,8 +1393,8 @@ protected boolean acceptResult(IntsRef input, Long output) { assertTrue(res.isComplete); // rejected(4) + topN(2) <= maxQueueSize(6) assertEquals(1, res.topN.size()); - assertEquals(Util.toIntsRef(newBytesRef("aac"), scratch), res.topN.get(0).input); - assertEquals(7L, res.topN.get(0).output.longValue()); + assertEquals(Util.toIntsRef(newBytesRef("aac"), scratch), res.topN.get(0).input()); + assertEquals(7L, res.topN.get(0).output().longValue()); rejectCount.set(0); searcher = new Util.TopNSearcher<>(fst, 2, 5, minLongComparator) { @@ -1452,17 +1452,17 @@ public void testShortestPathsWFST() throws Exception { assertTrue(res.isComplete); assertEquals(3, res.topN.size()); - assertEquals(Util.toIntsRef(newBytesRef("aac"), scratch), res.topN.get(0).input); - assertEquals(7L, res.topN.get(0).output.output1.longValue()); // weight - assertEquals(36L, res.topN.get(0).output.output2.longValue()); // output + assertEquals(Util.toIntsRef(newBytesRef("aac"), scratch), res.topN.get(0).input()); + assertEquals(7L, res.topN.get(0).output().output1.longValue()); // weight + assertEquals(36L, res.topN.get(0).output().output2.longValue()); // output - assertEquals(Util.toIntsRef(newBytesRef("ax"), scratch), res.topN.get(1).input); - assertEquals(17L, res.topN.get(1).output.output1.longValue()); // weight - assertEquals(85L, res.topN.get(1).output.output2.longValue()); // output + assertEquals(Util.toIntsRef(newBytesRef("ax"), scratch), res.topN.get(1).input()); + assertEquals(17L, 
res.topN.get(1).output().output1.longValue()); // weight + assertEquals(85L, res.topN.get(1).output().output2.longValue()); // output - assertEquals(Util.toIntsRef(newBytesRef("aab"), scratch), res.topN.get(2).input); - assertEquals(22L, res.topN.get(2).output.output1.longValue()); // weight - assertEquals(57L, res.topN.get(2).output.output2.longValue()); // output + assertEquals(Util.toIntsRef(newBytesRef("aab"), scratch), res.topN.get(2).input()); + assertEquals(22L, res.topN.get(2).output().output1.longValue()); // weight + assertEquals(57L, res.topN.get(2).output().output2.longValue()); // output } public void testShortestPathsRandom() throws Exception { @@ -1548,24 +1548,20 @@ public void testShortestPathsRandom() throws Exception { for (int hit = 0; hit < r.topN.size(); hit++) { // System.out.println(" check hit " + hit); - assertEquals(matches.get(hit).input, r.topN.get(hit).input); - assertEquals(matches.get(hit).output, r.topN.get(hit).output); + assertEquals(matches.get(hit).input(), r.topN.get(hit).input()); + assertEquals(matches.get(hit).output(), r.topN.get(hit).output()); } } } - private static class TieBreakByInputComparator implements Comparator> { - private final Comparator comparator; - - public TieBreakByInputComparator(Comparator comparator) { - this.comparator = comparator; - } + private record TieBreakByInputComparator(Comparator comparator) + implements Comparator> { @Override public int compare(Result a, Result b) { - int cmp = comparator.compare(a.output, b.output); + int cmp = comparator.compare(a.output(), b.output()); if (cmp == 0) { - return a.input.compareTo(b.input); + return a.input().compareTo(b.input()); } else { return cmp; } @@ -1678,8 +1674,8 @@ public void testShortestPathsWFSTRandom() throws Exception { for (int hit = 0; hit < r.topN.size(); hit++) { // System.out.println(" check hit " + hit); - assertEquals(matches.get(hit).input, r.topN.get(hit).input); - assertEquals(matches.get(hit).output, r.topN.get(hit).output); + assertEquals(matches.get(hit).input(), r.topN.get(hit).input()); + assertEquals(matches.get(hit).output(), r.topN.get(hit).output()); } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/AbstractMockVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/AbstractMockVectorValues.java deleted file mode 100644 index 54de3919b516..000000000000 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/AbstractMockVectorValues.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
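A minimal sketch of the data-holder-to-record conversion driving the call-site changes above (res.topN.get(0).input becoming input(), output becoming output(), and TieBreakByInputComparator turning into a record). The Pair name below is hypothetical, not the real Util.Result type:

    // Before: a hand-written holder with public final fields; callers read p.input / p.output.
    final class Pair<T> {
      public final String input;
      public final T output;
      public Pair(String input, T output) {
        this.input = input;
        this.output = output;
      }
    }

    // After: the record generates a constructor, equals/hashCode/toString and accessor
    // *methods* named after the components, so every caller switches to p.input() / p.output().
    record Pair<T>(String input, T output) {}

The same mechanics let TieBreakByInputComparator shrink to a record: its only state is the delegate comparator, and the generated constructor and accessor replace the boilerplate field and constructor.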
- */ - -package org.apache.lucene.util.hnsw; - -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - -import java.io.IOException; -import org.apache.lucene.util.BytesRef; - -abstract class AbstractMockVectorValues implements RandomAccessVectorValues { - - protected final int dimension; - protected final T[] denseValues; - protected final T[] values; - protected final int numVectors; - protected final BytesRef binaryValue; - - protected int pos = -1; - - AbstractMockVectorValues(T[] values, int dimension, T[] denseValues, int numVectors) { - this.dimension = dimension; - this.values = values; - this.denseValues = denseValues; - // used by tests that build a graph from bytes rather than floats - binaryValue = new BytesRef(dimension); - binaryValue.length = dimension; - this.numVectors = numVectors; - } - - @Override - public int size() { - return numVectors; - } - - @Override - public int dimension() { - return dimension; - } - - public T vectorValue(int targetOrd) { - return denseValues[targetOrd]; - } - - @Override - public abstract AbstractMockVectorValues copy(); - - public abstract T vectorValue() throws IOException; - - private boolean seek(int target) { - if (target >= 0 && target < values.length && values[target] != null) { - pos = target; - return true; - } else { - return false; - } - } - - public int docID() { - return pos; - } - - public int nextDoc() { - return advance(pos + 1); - } - - public int advance(int target) { - while (++pos < values.length) { - if (seek(pos)) { - return pos; - } - } - return NO_MORE_DOCS; - } -} diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java index 09d7721fec98..1da8c8169a98 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java @@ -38,8 +38,6 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.stream.Collectors; -import org.apache.lucene.codecs.FilterCodec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; @@ -56,6 +54,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.StoredFields; @@ -97,43 +96,39 @@ abstract class HnswGraphTestCase extends LuceneTestCase { abstract T randomVector(int dim); - abstract AbstractMockVectorValues vectorValues(int size, int dimension); + abstract KnnVectorValues vectorValues(int size, int dimension); - abstract AbstractMockVectorValues vectorValues(float[][] values); + abstract KnnVectorValues vectorValues(float[][] values); - abstract AbstractMockVectorValues vectorValues(LeafReader reader, String fieldName) - throws IOException; + abstract KnnVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException; - abstract AbstractMockVectorValues vectorValues( - int size, - int dimension, - AbstractMockVectorValues pregeneratedVectorValues, - int pregeneratedOffset); + abstract KnnVectorValues vectorValues( + int size, int dimension, KnnVectorValues 
pregeneratedVectorValues, int pregeneratedOffset); abstract Field knnVectorField(String name, T vector, VectorSimilarityFunction similarityFunction); - abstract RandomAccessVectorValues circularVectorValues(int nDoc); + abstract KnnVectorValues circularVectorValues(int nDoc); abstract T getTargetVector(); - protected RandomVectorScorerSupplier buildScorerSupplier(RandomAccessVectorValues vectors) + protected RandomVectorScorerSupplier buildScorerSupplier(KnnVectorValues vectors) throws IOException { return flatVectorScorer.getRandomVectorScorerSupplier(similarityFunction, vectors); } - protected RandomVectorScorer buildScorer(RandomAccessVectorValues vectors, T query) - throws IOException { - RandomAccessVectorValues vectorsCopy = vectors.copy(); + protected RandomVectorScorer buildScorer(KnnVectorValues vectors, T query) throws IOException { + KnnVectorValues vectorsCopy = vectors.copy(); return switch (getVectorEncoding()) { - case BYTE -> flatVectorScorer.getRandomVectorScorer( - similarityFunction, vectorsCopy, (byte[]) query); - case FLOAT32 -> flatVectorScorer.getRandomVectorScorer( - similarityFunction, vectorsCopy, (float[]) query); + case BYTE -> + flatVectorScorer.getRandomVectorScorer(similarityFunction, vectorsCopy, (byte[]) query); + case FLOAT32 -> + flatVectorScorer.getRandomVectorScorer(similarityFunction, vectorsCopy, (float[]) query); }; } // Tests writing segments of various sizes and merging to ensure there are no errors // in the HNSW graph merging logic. + @SuppressWarnings("unchecked") public void testRandomReadWriteAndMerge() throws IOException { int dim = random().nextInt(100) + 1; int[] segmentSizes = @@ -148,32 +143,20 @@ public void testRandomReadWriteAndMerge() throws IOException { int M = random().nextInt(4) + 2; int beamWidth = random().nextInt(10) + 5; long seed = random().nextLong(); - AbstractMockVectorValues vectors = vectorValues(numVectors, dim); + KnnVectorValues vectors = vectorValues(numVectors, dim); HnswGraphBuilder.randSeed = seed; try (Directory dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig() .setCodec( - new FilterCodec( - TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { - - @Override - public KnnVectorsFormat knnVectorsFormat() { - return new PerFieldKnnVectorsFormat() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99HnswVectorsFormat(M, beamWidth); - } - }; - } - }) + TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat(M, beamWidth))) // set a random merge policy .setMergePolicy(newMergePolicy(random())); try (IndexWriter iw = new IndexWriter(dir, iwc)) { for (int i = 0; i < segmentSizes.length; i++) { int size = segmentSizes[i]; - while (vectors.nextDoc() < size) { + for (int ord = 0; ord < size; ord++) { if (isSparse[i] && random().nextBoolean()) { int d = random().nextInt(10) + 1; for (int j = 0; j < d; j++) { @@ -182,8 +165,24 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } } Document doc = new Document(); - doc.add(knnVectorField("field", vectors.vectorValue(), similarityFunction)); - doc.add(new StringField("id", Integer.toString(vectors.docID()), Field.Store.NO)); + switch (vectors.getEncoding()) { + case BYTE -> { + doc.add( + knnVectorField( + "field", + (T) ((ByteVectorValues) vectors).vectorValue(ord), + similarityFunction)); + } + case FLOAT32 -> { + doc.add( + knnVectorField( + "field", + (T) ((FloatVectorValues) vectors).vectorValue(ord), + similarityFunction)); + } + } + ; + doc.add(new 
StringField("id", Integer.toString(vectors.ordToDoc(ord)), Field.Store.NO)); iw.addDocument(doc); } iw.commit(); @@ -199,13 +198,26 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } try (IndexReader reader = DirectoryReader.open(dir)) { for (LeafReaderContext ctx : reader.leaves()) { - AbstractMockVectorValues values = vectorValues(ctx.reader(), "field"); + KnnVectorValues values = vectorValues(ctx.reader(), "field"); assertEquals(dim, values.dimension()); } } } } + @SuppressWarnings("unchecked") + private T vectorValue(KnnVectorValues vectors, int ord) throws IOException { + switch (vectors.getEncoding()) { + case BYTE -> { + return (T) ((ByteVectorValues) vectors).vectorValue(ord); + } + case FLOAT32 -> { + return (T) ((FloatVectorValues) vectors).vectorValue(ord); + } + } + throw new AssertionError("unknown encoding " + vectors.getEncoding()); + } + // test writing out and reading in a graph gives the expected graph public void testReadWrite() throws IOException { int dim = random().nextInt(100) + 1; @@ -213,8 +225,8 @@ public void testReadWrite() throws IOException { int M = random().nextInt(4) + 2; int beamWidth = random().nextInt(10) + 5; long seed = random().nextLong(); - AbstractMockVectorValues vectors = vectorValues(nDoc, dim); - AbstractMockVectorValues v2 = vectors.copy(), v3 = vectors.copy(); + KnnVectorValues vectors = vectorValues(nDoc, dim); + KnnVectorValues v2 = vectors.copy(), v3 = vectors.copy(); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, seed); HnswGraph hnsw = builder.build(vectors.size()); @@ -229,28 +241,18 @@ public void testReadWrite() throws IOException { IndexWriterConfig iwc = new IndexWriterConfig() .setCodec( - new FilterCodec( - TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { - @Override - public KnnVectorsFormat knnVectorsFormat() { - return new PerFieldKnnVectorsFormat() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99HnswVectorsFormat(M, beamWidth); - } - }; - } - }); + TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat(M, beamWidth))); try (IndexWriter iw = new IndexWriter(dir, iwc)) { - while (v2.nextDoc() != NO_MORE_DOCS) { - while (indexedDoc < v2.docID()) { + KnnVectorValues.DocIndexIterator it2 = v2.iterator(); + while (it2.nextDoc() != NO_MORE_DOCS) { + while (indexedDoc < it2.docID()) { // increment docId in the index by adding empty documents iw.addDocument(new Document()); indexedDoc++; } Document doc = new Document(); - doc.add(knnVectorField("field", v2.vectorValue(), similarityFunction)); - doc.add(new StoredField("id", v2.docID())); + doc.add(knnVectorField("field", vectorValue(v2, it2.index()), similarityFunction)); + doc.add(new StoredField("id", it2.docID())); iw.addDocument(doc); nVec++; indexedDoc++; @@ -258,7 +260,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } try (IndexReader reader = DirectoryReader.open(dir)) { for (LeafReaderContext ctx : reader.leaves()) { - AbstractMockVectorValues values = vectorValues(ctx.reader(), "field"); + KnnVectorValues values = vectorValues(ctx.reader(), "field"); assertEquals(dim, values.dimension()); assertEquals(nVec, values.size()); assertEquals(indexedDoc, ctx.reader().maxDoc()); @@ -280,7 +282,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { public void testSortedAndUnsortedIndicesReturnSameResults() throws IOException { int dim 
= random().nextInt(10) + 3; int nDoc = random().nextInt(200) + 100; - AbstractMockVectorValues vectors = vectorValues(nDoc, dim); + KnnVectorValues vectors = vectorValues(nDoc, dim); int M = random().nextInt(10) + 5; int beamWidth = random().nextInt(10) + 10; @@ -290,32 +292,10 @@ public void testSortedAndUnsortedIndicesReturnSameResults() throws IOException { HnswGraphBuilder.randSeed = seed; IndexWriterConfig iwc = new IndexWriterConfig() - .setCodec( - new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { - @Override - public KnnVectorsFormat knnVectorsFormat() { - return new PerFieldKnnVectorsFormat() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99HnswVectorsFormat(M, beamWidth); - } - }; - } - }); + .setCodec(TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat(M, beamWidth))); IndexWriterConfig iwc2 = new IndexWriterConfig() - .setCodec( - new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { - @Override - public KnnVectorsFormat knnVectorsFormat() { - return new PerFieldKnnVectorsFormat() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene99HnswVectorsFormat(M, beamWidth); - } - }; - } - }) + .setCodec(TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat(M, beamWidth))) .setIndexSort(new Sort(new SortField("sortkey", SortField.Type.LONG))); try (Directory dir = newDirectory(); @@ -323,15 +303,15 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { int indexedDoc = 0; try (IndexWriter iw = new IndexWriter(dir, iwc); IndexWriter iw2 = new IndexWriter(dir2, iwc2)) { - while (vectors.nextDoc() != NO_MORE_DOCS) { - while (indexedDoc < vectors.docID()) { + for (int ord = 0; ord < vectors.size(); ord++) { + while (indexedDoc < vectors.ordToDoc(ord)) { // increment docId in the index by adding empty documents iw.addDocument(new Document()); indexedDoc++; } Document doc = new Document(); - doc.add(knnVectorField("vector", vectors.vectorValue(), similarityFunction)); - doc.add(new StoredField("id", vectors.docID())); + doc.add(knnVectorField("vector", vectorValue(vectors, ord), similarityFunction)); + doc.add(new StoredField("id", vectors.ordToDoc(ord))); doc.add(new NumericDocValuesField("sortkey", random().nextLong())); iw.addDocument(doc); iw2.addDocument(doc); @@ -461,7 +441,7 @@ void assertGraphEqual(HnswGraph g, HnswGraph h) throws IOException { public void testAknnDiverse() throws IOException { int nDoc = 100; similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - RandomAccessVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectors = circularVectorValues(nDoc); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 100, random().nextInt()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); @@ -493,7 +473,7 @@ public void testAknnDiverse() throws IOException { @SuppressWarnings("unchecked") public void testSearchWithAcceptOrds() throws IOException { int nDoc = 100; - RandomAccessVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectors = circularVectorValues(nDoc); similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); @@ -518,7 +498,7 @@ public void testSearchWithAcceptOrds() 
throws IOException { @SuppressWarnings("unchecked") public void testSearchWithSelectiveAcceptOrds() throws IOException { int nDoc = 100; - RandomAccessVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectors = circularVectorValues(nDoc); similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); @@ -552,13 +532,13 @@ public void testHnswGraphBuilderInitializationFromGraph_withOffsetZero() throws int dim = atLeast(10); long seed = random().nextLong(); - AbstractMockVectorValues initializerVectors = vectorValues(initializerSize, dim); + KnnVectorValues initializerVectors = vectorValues(initializerSize, dim); RandomVectorScorerSupplier initialscorerSupplier = buildScorerSupplier(initializerVectors); HnswGraphBuilder initializerBuilder = HnswGraphBuilder.create(initialscorerSupplier, 10, 30, seed); OnHeapHnswGraph initializerGraph = initializerBuilder.build(initializerVectors.size()); - AbstractMockVectorValues finalVectorValues = + KnnVectorValues finalVectorValues = vectorValues(totalSize, dim, initializerVectors, docIdOffset); int[] initializerOrdMap = createOffsetOrdinalMap(initializerSize, finalVectorValues, docIdOffset); @@ -598,13 +578,13 @@ public void testHnswGraphBuilderInitializationFromGraph_withNonZeroOffset() thro int dim = atLeast(10); long seed = random().nextLong(); - AbstractMockVectorValues initializerVectors = vectorValues(initializerSize, dim); + KnnVectorValues initializerVectors = vectorValues(initializerSize, dim); RandomVectorScorerSupplier initialscorerSupplier = buildScorerSupplier(initializerVectors); HnswGraphBuilder initializerBuilder = HnswGraphBuilder.create(initialscorerSupplier, 10, 30, seed); OnHeapHnswGraph initializerGraph = initializerBuilder.build(initializerVectors.size()); - AbstractMockVectorValues finalVectorValues = + KnnVectorValues finalVectorValues = vectorValues(totalSize, dim, initializerVectors.copy(), docIdOffset); int[] initializerOrdMap = createOffsetOrdinalMap(initializerSize, finalVectorValues, docIdOffset); @@ -688,19 +668,17 @@ private int[] mapArrayAndSort(int[] arr, int[] offset) { } private int[] createOffsetOrdinalMap( - int docIdSize, AbstractMockVectorValues totalVectorValues, int docIdOffset) { + int docIdSize, KnnVectorValues totalVectorValues, int docIdOffset) throws IOException { // Compute the offset for the ordinal map to be the number of non-null vectors in the total - // vector values - // before the docIdOffset + // vector values before the docIdOffset int ordinalOffset = 0; - while (totalVectorValues.nextDoc() < docIdOffset) { + KnnVectorValues.DocIndexIterator it = totalVectorValues.iterator(); + while (it.nextDoc() < docIdOffset) { ordinalOffset++; } int[] offsetOrdinalMap = new int[docIdSize]; - for (int curr = 0; - totalVectorValues.docID() < docIdOffset + docIdSize; - totalVectorValues.nextDoc()) { + for (int curr = 0; it.docID() < docIdOffset + docIdSize; it.nextDoc()) { offsetOrdinalMap[curr] = ordinalOffset + curr++; } @@ -711,7 +689,7 @@ private int[] createOffsetOrdinalMap( public void testVisitedLimit() throws IOException { int nDoc = 500; similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - RandomAccessVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectors = circularVectorValues(nDoc); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = 
HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); @@ -746,7 +724,7 @@ public void testRamUsageEstimate() throws IOException { int M = randomIntBetween(4, 96); similarityFunction = RandomizedTest.randomFrom(VectorSimilarityFunction.values()); - RandomAccessVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectors = vectorValues(size, dim); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = @@ -771,7 +749,7 @@ public void testDiversity() throws IOException { unitVector2d(0.77), unitVector2d(0.6) }; - AbstractMockVectorValues vectors = vectorValues(values); + KnnVectorValues vectors = vectorValues(values); // First add nodes until everybody gets a full neighbor list RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 2, 10, random().nextInt()); @@ -825,7 +803,7 @@ public void testDiversityFallback() throws IOException { {10, 0, 0}, {0, 4, 0} }; - AbstractMockVectorValues vectors = vectorValues(values); + KnnVectorValues vectors = vectorValues(values); // First add nodes until everybody gets a full neighbor list RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 1, 10, random().nextInt()); @@ -855,7 +833,7 @@ public void testDiversity3d() throws IOException { {0, 0, 20}, {0, 9, 0} }; - AbstractMockVectorValues vectors = vectorValues(values); + KnnVectorValues vectors = vectorValues(values); // First add nodes until everybody gets a full neighbor list RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 1, 10, random().nextInt()); @@ -891,7 +869,7 @@ private void assertLevel0Neighbors(OnHeapHnswGraph graph, int node, int... 
expec public void testRandom() throws IOException { int size = atLeast(100); int dim = atLeast(10); - AbstractMockVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectors = vectorValues(size, dim); int topK = 5; RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 30, random().nextLong()); @@ -908,15 +886,13 @@ public void testRandom() throws IOException { TopDocs topDocs = actual.topDocs(); NeighborQueue expected = new NeighborQueue(topK, false); for (int j = 0; j < size; j++) { - if (vectors.vectorValue(j) != null && (acceptOrds == null || acceptOrds.get(j))) { + if (vectorValue(vectors, j) != null && (acceptOrds == null || acceptOrds.get(j))) { if (getVectorEncoding() == VectorEncoding.BYTE) { - assert query instanceof byte[]; expected.add( - j, similarityFunction.compare((byte[]) query, (byte[]) vectors.vectorValue(j))); + j, similarityFunction.compare((byte[]) query, (byte[]) vectorValue(vectors, j))); } else { - assert query instanceof float[]; expected.add( - j, similarityFunction.compare((float[]) query, (float[]) vectors.vectorValue(j))); + j, similarityFunction.compare((float[]) query, (float[]) vectorValue(vectors, j))); } if (expected.size() > topK) { expected.pop(); @@ -940,7 +916,7 @@ public void testOnHeapHnswGraphSearch() throws IOException, ExecutionException, InterruptedException, TimeoutException { int size = atLeast(100); int dim = atLeast(10); - AbstractMockVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectors = vectorValues(size, dim); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 30, random().nextLong()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); @@ -1004,7 +980,7 @@ public void testOnHeapHnswGraphSearch() public void testConcurrentMergeBuilder() throws IOException { int size = atLeast(1000); int dim = atLeast(10); - AbstractMockVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectors = vectorValues(size, dim); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); ExecutorService exec = Executors.newFixedThreadPool(4, new NamedThreadFactory("hnswMerge")); TaskExecutor taskExecutor = new TaskExecutor(exec); @@ -1033,7 +1009,7 @@ public void testAllNodesVisitedInSingleLevel() throws IOException { // Search for a large number of results int topK = size - 1; - AbstractMockVectorValues docVectors = vectorValues(size, dim); + KnnVectorValues docVectors = vectorValues(size, dim); HnswGraph graph = HnswGraphBuilder.create(buildScorerSupplier(docVectors), 10, 30, random().nextLong()) .build(size); @@ -1047,8 +1023,8 @@ public int numLevels() { } }; - AbstractMockVectorValues queryVectors = vectorValues(1, dim); - RandomVectorScorer queryScorer = buildScorer(docVectors, queryVectors.vectorValue(0)); + KnnVectorValues queryVectors = vectorValues(1, dim); + RandomVectorScorer queryScorer = buildScorer(docVectors, vectorValue(queryVectors, 0)); KnnCollector collector = new TopKnnCollector(topK, Integer.MAX_VALUE); HnswGraphSearcher.search(queryScorer, collector, singleLevelGraph, null); @@ -1076,8 +1052,7 @@ private int computeOverlap(int[] a, int[] b) { } /** Returns vectors evenly distributed around the upper unit semicircle. 
*/ - static class CircularFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { + static class CircularFloatVectorValues extends FloatVectorValues { private final int size; private final float[] value; @@ -1103,22 +1078,18 @@ public int size() { return size; } - @Override public float[] vectorValue() { return vectorValue(doc); } - @Override public int docID() { return doc; } - @Override public int nextDoc() { return advance(doc + 1); } - @Override public int advance(int target) { if (target >= 0 && target < size) { doc = target; @@ -1140,8 +1111,7 @@ public VectorScorer scorer(float[] target) { } /** Returns vectors evenly distributed around the upper unit semicircle. */ - static class CircularByteVectorValues extends ByteVectorValues - implements RandomAccessVectorValues.Bytes { + static class CircularByteVectorValues extends ByteVectorValues { private final int size; private final float[] value; private final byte[] bValue; @@ -1169,22 +1139,18 @@ public int size() { return size; } - @Override public byte[] vectorValue() { return vectorValue(doc); } - @Override public int docID() { return doc; } - @Override public int nextDoc() { return advance(doc + 1); } - @Override public int advance(int target) { if (target >= 0 && target < size) { doc = target; @@ -1227,28 +1193,28 @@ private Set getNeighborNodes(HnswGraph g) throws IOException { return neighbors; } - void assertVectorsEqual(AbstractMockVectorValues u, AbstractMockVectorValues v) - throws IOException { + void assertVectorsEqual(KnnVectorValues u, KnnVectorValues v) throws IOException { int uDoc, vDoc; - while (true) { - uDoc = u.nextDoc(); - vDoc = v.nextDoc(); + assertEquals(u.size(), v.size()); + for (int ord = 0; ord < u.size(); ord++) { + uDoc = u.ordToDoc(ord); + vDoc = v.ordToDoc(ord); assertEquals(uDoc, vDoc); - if (uDoc == NO_MORE_DOCS) { - break; - } + assertNotEquals(NO_MORE_DOCS, uDoc); switch (getVectorEncoding()) { - case BYTE -> assertArrayEquals( - "vectors do not match for doc=" + uDoc, - (byte[]) u.vectorValue(), - (byte[]) v.vectorValue()); - case FLOAT32 -> assertArrayEquals( - "vectors do not match for doc=" + uDoc, - (float[]) u.vectorValue(), - (float[]) v.vectorValue(), - 1e-4f); - default -> throw new IllegalArgumentException( - "unknown vector encoding: " + getVectorEncoding()); + case BYTE -> + assertArrayEquals( + "vectors do not match for doc=" + uDoc, + (byte[]) vectorValue(u, ord), + (byte[]) vectorValue(v, ord)); + case FLOAT32 -> + assertArrayEquals( + "vectors do not match for doc=" + uDoc, + (float[]) vectorValue(u, ord), + (float[]) vectorValue(v, ord), + 1e-4f); + default -> + throw new IllegalArgumentException("unknown vector encoding: " + getVectorEncoding()); } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java index a3b17b9a621e..4ab86c707816 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java @@ -17,11 +17,17 @@ package org.apache.lucene.util.hnsw; +import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; -class MockByteVectorValues extends AbstractMockVectorValues - implements RandomAccessVectorValues.Bytes { +class MockByteVectorValues extends ByteVectorValues { + private final int dimension; + 
private final byte[][] denseValues; + protected final byte[][] values; + private final int numVectors; + private final BytesRef binaryValue; private final byte[] scratch; static MockByteVectorValues fromValues(byte[][] values) { @@ -43,10 +49,26 @@ static MockByteVectorValues fromValues(byte[][] values) { } MockByteVectorValues(byte[][] values, int dimension, byte[][] denseValues, int numVectors) { - super(values, dimension, denseValues, numVectors); + this.dimension = dimension; + this.values = values; + this.denseValues = denseValues; + this.numVectors = numVectors; + // used by tests that build a graph from bytes rather than floats + binaryValue = new BytesRef(dimension); + binaryValue.length = dimension; scratch = new byte[dimension]; } + @Override + public int size() { + return values.length; + } + + @Override + public int dimension() { + return dimension; + } + @Override public MockByteVectorValues copy() { return new MockByteVectorValues( @@ -55,20 +77,20 @@ public MockByteVectorValues copy() { @Override public byte[] vectorValue(int ord) { - return values[ord]; - } - - @Override - public byte[] vectorValue() { if (LuceneTestCase.random().nextBoolean()) { - return values[pos]; + return values[ord]; } else { // Sometimes use the same scratch array repeatedly, mimicing what the codec will do. // This should help us catch cases of aliasing where the same ByteVectorValues source is used // twice in a // single computation. - System.arraycopy(values[pos], 0, scratch, 0, dimension); + System.arraycopy(values[ord], 0, scratch, 0, dimension); return scratch; } } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java index f183f6c99a67..5411f2418de3 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java @@ -17,11 +17,15 @@ package org.apache.lucene.util.hnsw; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.ArrayUtil; -class MockVectorValues extends AbstractMockVectorValues - implements RandomAccessVectorValues.Floats { +class MockVectorValues extends FloatVectorValues { + private final int dimension; + private final float[][] denseValues; + protected final float[][] values; + private final int numVectors; private final float[] scratch; static MockVectorValues fromValues(float[][] values) { @@ -43,10 +47,23 @@ static MockVectorValues fromValues(float[][] values) { } MockVectorValues(float[][] values, int dimension, float[][] denseValues, int numVectors) { - super(values, dimension, denseValues, numVectors); + this.dimension = dimension; + this.values = values; + this.denseValues = denseValues; + this.numVectors = numVectors; this.scratch = new float[dimension]; } + @Override + public int size() { + return values.length; + } + + @Override + public int dimension() { + return dimension; + } + @Override public MockVectorValues copy() { return new MockVectorValues( @@ -54,20 +71,20 @@ public MockVectorValues copy() { } @Override - public float[] vectorValue() { + public float[] vectorValue(int ord) { if (LuceneTestCase.random().nextBoolean()) { - return values[pos]; + return values[ord]; } else { // Sometimes use the same scratch array repeatedly, mimicing what the codec will do. 
// This should help us catch cases of aliasing where the same vector values source is used // twice in a single computation. - System.arraycopy(values[pos], 0, scratch, 0, dimension); + System.arraycopy(values[ord], 0, scratch, 0, dimension); return scratch; } } @Override - public float[] vectorValue(int targetOrd) { - return denseValues[targetOrd]; + public DocIndexIterator iterator() { + return createDenseIterator(); } } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java index 649bc1a64519..f0e6745211c6 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java @@ -17,13 +17,12 @@ package org.apache.lucene.util.hnsw; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - import com.carrotsearch.randomizedtesting.RandomizedTest; import java.io.IOException; import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnByteVectorField; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; @@ -56,7 +55,7 @@ byte[] randomVector(int dim) { } @Override - AbstractMockVectorValues vectorValues(int size, int dimension) { + MockByteVectorValues vectorValues(int size, int dimension) { return MockByteVectorValues.fromValues(createRandomByteVectors(size, dimension, random())); } @@ -65,7 +64,7 @@ static boolean fitsInByte(float v) { } @Override - AbstractMockVectorValues vectorValues(float[][] values) { + MockByteVectorValues vectorValues(float[][] values) { byte[][] bValues = new byte[values.length][]; // The case when all floats fit within a byte already. 
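The mock rewrites above drop the old cursor-style access (nextDoc() plus a no-arg vectorValue()) in favor of ordinal-based access with an explicit iterator. A short consumption sketch, assuming a FloatVectorValues named values and eliding IOException handling, of how the two views fit together:

    // Walk the documents that actually carry a vector; index() is the dense ordinal
    // that vectorValue(ord) and ordToDoc(ord) operate on.
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      int ord = it.index();                      // dense ordinal in [0, values.size())
      float[] vector = values.vectorValue(ord);  // fetch by ordinal, not by doc id
      assert values.ordToDoc(ord) == doc;        // the ordinal maps back to its doc id
    }

    // Fully dense tests can skip the iterator and loop ordinals directly, which is what
    // the rewritten loops above do (for (int ord = 0; ord < vectors.size(); ord++) ...).

Returning a shared scratch array from vectorValue(ord) some of the time, as MockVectorValues does, deliberately breaks callers that hold on to the returned array instead of copying it, which is how these mocks catch aliasing bugs.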
boolean scaleSimple = fitsInByte(values[0][0]); @@ -86,42 +85,35 @@ AbstractMockVectorValues vectorValues(float[][] values) { } @Override - AbstractMockVectorValues vectorValues( - int size, - int dimension, - AbstractMockVectorValues pregeneratedVectorValues, - int pregeneratedOffset) { + MockByteVectorValues vectorValues( + int size, int dimension, KnnVectorValues pregeneratedVectorValues, int pregeneratedOffset) { + + MockByteVectorValues pvv = (MockByteVectorValues) pregeneratedVectorValues; byte[][] vectors = new byte[size][]; - byte[][] randomVectors = - createRandomByteVectors(size - pregeneratedVectorValues.values.length, dimension, random()); + byte[][] randomVectors = createRandomByteVectors(size - pvv.values.length, dimension, random()); for (int i = 0; i < pregeneratedOffset; i++) { vectors[i] = randomVectors[i]; } - int currentDoc; - while ((currentDoc = pregeneratedVectorValues.nextDoc()) != NO_MORE_DOCS) { - vectors[pregeneratedOffset + currentDoc] = pregeneratedVectorValues.values[currentDoc]; + for (int currentOrd = 0; currentOrd < pvv.size(); currentOrd++) { + vectors[pregeneratedOffset + currentOrd] = pvv.values[currentOrd]; } - for (int i = pregeneratedOffset + pregeneratedVectorValues.values.length; - i < vectors.length; - i++) { - vectors[i] = randomVectors[i - pregeneratedVectorValues.values.length]; + for (int i = pregeneratedOffset + pvv.values.length; i < vectors.length; i++) { + vectors[i] = randomVectors[i - pvv.values.length]; } return MockByteVectorValues.fromValues(vectors); } @Override - AbstractMockVectorValues vectorValues(LeafReader reader, String fieldName) - throws IOException { + MockByteVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { ByteVectorValues vectorValues = reader.getByteVectorValues(fieldName); byte[][] vectors = new byte[reader.maxDoc()][]; - while (vectorValues.nextDoc() != NO_MORE_DOCS) { - vectors[vectorValues.docID()] = - ArrayUtil.copyOfSubArray( - vectorValues.vectorValue(), 0, vectorValues.vectorValue().length); + for (int i = 0; i < vectorValues.size(); i++) { + vectors[vectorValues.ordToDoc(i)] = + ArrayUtil.copyOfSubArray(vectorValues.vectorValue(i), 0, vectorValues.dimension()); } return MockByteVectorValues.fromValues(vectors); } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java index 5621edc4b35e..52d1da3dfa83 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java @@ -17,13 +17,12 @@ package org.apache.lucene.util.hnsw; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - import com.carrotsearch.randomizedtesting.RandomizedTest; import java.io.IOException; import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; @@ -60,52 +59,44 @@ float[] randomVector(int dim) { } @Override - AbstractMockVectorValues vectorValues(int size, int dimension) { + MockVectorValues vectorValues(int size, int dimension) { return MockVectorValues.fromValues(createRandomFloatVectors(size, dimension, random())); } @Override - AbstractMockVectorValues vectorValues(float[][] values) 
{ + MockVectorValues vectorValues(float[][] values) { return MockVectorValues.fromValues(values); } @Override - AbstractMockVectorValues vectorValues(LeafReader reader, String fieldName) - throws IOException { + MockVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { FloatVectorValues vectorValues = reader.getFloatVectorValues(fieldName); float[][] vectors = new float[reader.maxDoc()][]; - while (vectorValues.nextDoc() != NO_MORE_DOCS) { - vectors[vectorValues.docID()] = - ArrayUtil.copyOfSubArray( - vectorValues.vectorValue(), 0, vectorValues.vectorValue().length); + for (int i = 0; i < vectorValues.size(); i++) { + vectors[vectorValues.ordToDoc(i)] = + ArrayUtil.copyOfSubArray(vectorValues.vectorValue(i), 0, vectorValues.dimension()); } return MockVectorValues.fromValues(vectors); } @Override - AbstractMockVectorValues vectorValues( - int size, - int dimension, - AbstractMockVectorValues pregeneratedVectorValues, - int pregeneratedOffset) { + MockVectorValues vectorValues( + int size, int dimension, KnnVectorValues pregeneratedVectorValues, int pregeneratedOffset) { + MockVectorValues pvv = (MockVectorValues) pregeneratedVectorValues; float[][] vectors = new float[size][]; float[][] randomVectors = - createRandomFloatVectors( - size - pregeneratedVectorValues.values.length, dimension, random()); + createRandomFloatVectors(size - pvv.values.length, dimension, random()); for (int i = 0; i < pregeneratedOffset; i++) { vectors[i] = randomVectors[i]; } - int currentDoc; - while ((currentDoc = pregeneratedVectorValues.nextDoc()) != NO_MORE_DOCS) { - vectors[pregeneratedOffset + currentDoc] = pregeneratedVectorValues.values[currentDoc]; + for (int currentOrd = 0; currentOrd < pvv.size(); currentOrd++) { + vectors[pregeneratedOffset + currentOrd] = pvv.values[currentOrd]; } - for (int i = pregeneratedOffset + pregeneratedVectorValues.values.length; - i < vectors.length; - i++) { - vectors[i] = randomVectors[i - pregeneratedVectorValues.values.length]; + for (int i = pregeneratedOffset + pvv.values.length; i < vectors.length; i++) { + vectors[i] = randomVectors[i - pvv.values.length]; } return MockVectorValues.fromValues(vectors); @@ -129,7 +120,7 @@ float[] getTargetVector() { public void testSearchWithSkewedAcceptOrds() throws IOException { int nDoc = 1000; similarityFunction = VectorSimilarityFunction.EUCLIDEAN; - RandomAccessVectorValues.Floats vectors = circularVectorValues(nDoc); + FloatVectorValues vectors = circularVectorValues(nDoc); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswUtil.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswUtil.java new file mode 100644 index 000000000000..316afff5ee25 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswUtil.java @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.hnsw; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import java.util.ArrayDeque; +import java.util.Arrays; +import java.util.List; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.FixedBitSet; + +public class TestHnswUtil extends LuceneTestCase { + + public void testTreeWithCycle() throws Exception { + // test a graph that is a tree - this is rooted from its root node, not rooted + // from any other node, and not strongly connected + int[][][] nodes = { + { + {1, 2}, // node 0 + {3, 4}, // node 1 + {5, 6}, // node 2 + {}, {}, {}, {0} + } + }; + HnswGraph graph = new MockGraph(nodes); + assertTrue(HnswUtil.isRooted(graph)); + assertEquals(List.of(7), HnswUtil.componentSizes(graph)); + } + + public void testBackLinking() throws Exception { + // test a graph that is a tree - this is rooted from its root node, not rooted + // from any other node, and not strongly connected + int[][][] nodes = { + { + {1, 2}, // node 0 + {3, 4}, // node 1 + {0}, // node 2 + {1}, {1}, {1}, {1} + } + }; + HnswGraph graph = new MockGraph(nodes); + assertFalse(HnswUtil.isRooted(graph)); + // [ {0, 1, 2, 3, 4}, {5}, {6} + assertEquals(List.of(5, 1, 1), HnswUtil.componentSizes(graph)); + } + + public void testChain() throws Exception { + // test a graph that is a chain - this is rooted from every node, thus strongly connected + int[][][] nodes = {{{1}, {2}, {3}, {0}}}; + HnswGraph graph = new MockGraph(nodes); + assertTrue(HnswUtil.isRooted(graph)); + assertEquals(List.of(4), HnswUtil.componentSizes(graph)); + } + + public void testTwoChains() throws Exception { + // test a graph that is two chains + int[][][] nodes = {{{2}, {3}, {0}, {1}}}; + HnswGraph graph = new MockGraph(nodes); + assertFalse(HnswUtil.isRooted(graph)); + assertEquals(List.of(2, 2), HnswUtil.componentSizes(graph)); + } + + public void testLevels() throws Exception { + // test a graph that has three levels + int[][][] nodes = { + {{1, 2}, {3}, {0}, {0}}, + {{2}, null, {0}, null}, + {{}, null, null, null} + }; + HnswGraph graph = new MockGraph(nodes); + // System.out.println(graph.toString()); + assertTrue(HnswUtil.isRooted(graph)); + assertEquals(List.of(4), HnswUtil.componentSizes(graph)); + } + + public void testLevelsNotRooted() throws Exception { + // test a graph that has two levels with an orphaned node + int[][][] nodes = { + {{1}, {0}, {0}}, + {{}, null, null} + }; + HnswGraph graph = new MockGraph(nodes); + assertFalse(HnswUtil.isRooted(graph)); + assertEquals(List.of(2, 1), HnswUtil.componentSizes(graph)); + } + + public void testRandom() throws Exception { + for (int i = 0; i < atLeast(10); i++) { + // test on a random directed graph comparing against a brute force algorithm + int numNodes = random().nextInt(1, 100); + int numLevels = (int) Math.ceil(Math.log(numNodes)); + int[][][] nodes = new int[numLevels][][]; + for (int level = numLevels - 1; level >= 0; level--) { + nodes[level] = new int[numNodes][]; + for (int node = 0; node < numNodes; node++) { + if (level > 0) { + if ((level == numLevels - 1 && node > 0) + || 
(level < numLevels - 1 && nodes[level + 1][node] == null)) { + if (random().nextFloat() > Math.pow(Math.E, -level)) { + // skip some nodes, more on higher levels while ensuring every node present on a + // given level is present on all lower levels. Also ensure node 0 is always present. + continue; + } + } + } + int numNbrs = random().nextInt((numNodes + 7) / 8); + if (level == 0) { + numNbrs *= 2; + } + nodes[level][node] = new int[numNbrs]; + for (int nbr = 0; nbr < numNbrs; nbr++) { + while (true) { + int randomNbr = random().nextInt(numNodes); + if (nodes[level][randomNbr] != null) { + // allow self-linking; this doesn't arise in HNSW but it's valid more generally + nodes[level][node][nbr] = randomNbr; + break; + } + // nbr not on this level, try again + } + } + } + } + MockGraph graph = new MockGraph(nodes); + assertEquals(isRooted(nodes), HnswUtil.isRooted(graph)); + } + } + + private boolean isRooted(int[][][] nodes) { + for (int level = nodes.length - 1; level >= 0; level--) { + if (isRooted(nodes, level) == false) { + return false; + } + } + return true; + } + + private boolean isRooted(int[][][] nodes, int level) { + // check that the graph is rooted in the union of the entry nodes' trees + // System.out.println("isRooted level=" + level); + int[][] entryPoints; + if (level == nodes.length - 1) { + // entry into the top level is from a single entry point, fixed at 0 + entryPoints = new int[][] {nodes[level][0]}; + } else { + entryPoints = nodes[level + 1]; + } + FixedBitSet connected = new FixedBitSet(nodes[level].length); + int count = 0; + for (int entryPoint = 0; entryPoint < entryPoints.length; entryPoint++) { + if (entryPoints[entryPoint] == null) { + // use nodes present on next higher level (or this level if top level) as entry points + continue; + } + // System.out.println(" isRooted level=" + level + " entryPoint=" + entryPoint); + ArrayDeque stack = new ArrayDeque<>(); + stack.push(entryPoint); + while (!stack.isEmpty()) { + int node = stack.pop(); + if (connected.get(node)) { + continue; + } + // System.out.println(" connected node=" + node); + connected.set(node); + count++; + for (int nbr : nodes[level][node]) { + stack.push(nbr); + } + } + } + return count == levelSize(nodes[level]); + } + + static int levelSize(int[][] nodes) { + int count = 0; + for (int[] node : nodes) { + if (node != null) { + ++count; + } + } + return count; + } + + /** Empty graph value */ + static class MockGraph extends HnswGraph { + + private final int[][][] nodes; + + private int currentLevel; + private int currentNode; + private int currentNeighbor; + + MockGraph(int[][][] nodes) { + this.nodes = nodes; + } + + @Override + public int nextNeighbor() { + if (currentNeighbor >= nodes[currentLevel][currentNode].length) { + return NO_MORE_DOCS; + } else { + return nodes[currentLevel][currentNode][currentNeighbor++]; + } + } + + @Override + public void seek(int level, int target) { + assert level >= 0 && level < nodes.length; + assert target >= 0 && target < nodes[level].length + : "target out of range: " + + target + + " for level " + + level + + "; should be less than " + + nodes[level].length; + assert nodes[level][target] != null : "target " + target + " not on level " + level; + currentLevel = level; + currentNode = target; + currentNeighbor = 0; + } + + @Override + public int size() { + return nodes[0].length; + } + + @Override + public int numLevels() { + return nodes.length; + } + + @Override + public int entryNode() { + return 0; + } + + @Override + public String toString() { + 
StringBuilder buf = new StringBuilder(); + for (int level = nodes.length - 1; level >= 0; level--) { + buf.append("\nLEVEL ").append(level).append("\n"); + for (int node = 0; node < nodes[level].length; node++) { + if (nodes[level][node] != null) { + buf.append(" ") + .append(node) + .append(':') + .append(Arrays.toString(nodes[level][node])) + .append("\n"); + } + } + } + return buf.toString(); + } + + @Override + public NodesIterator getNodesOnLevel(int level) { + + int count = 0; + for (int i = 0; i < nodes[level].length; i++) { + if (nodes[level][i] != null) { + count++; + } + } + final int finalCount = count; + + return new NodesIterator(finalCount) { + int cur = -1; + int curCount = 0; + + @Override + public boolean hasNext() { + return curCount < finalCount; + } + + @Override + public int nextInt() { + while (curCount < finalCount) { + if (nodes[level][++cur] != null) { + curCount++; + return cur; + } + } + throw new IllegalStateException("exhausted"); + } + + @Override + public int consume(int[] dest) { + throw new UnsupportedOperationException(); + } + }; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java index bdba822d4eca..f2cc3ac35c05 100644 --- a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java @@ -59,8 +59,7 @@ public void testToEuclidean() throws IOException { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors( - floatVectorValues, confidenceInterval, floats.length, (byte) 7); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.EUCLIDEAN); @@ -92,8 +91,7 @@ public void testToCosine() throws IOException { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); FloatVectorValues floatVectorValues = fromFloatsNormalized(floats, null); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors( - floatVectorValues, confidenceInterval, floats.length, (byte) 7); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectorsNormalized( @@ -129,8 +127,7 @@ public void testToDotProduct() throws IOException { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors( - floatVectorValues, confidenceInterval, floats.length, (byte) 7); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.DOT_PRODUCT); @@ -162,8 +159,7 @@ public void testToMaxInnerProduct() throws IOException { float error = Math.max((100 - confidenceInterval) * 0.5f, 0.5f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors( - floatVectorValues, confidenceInterval, floats.length, (byte) 7); + 
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors( @@ -242,11 +238,8 @@ private static FloatVectorValues fromFloatsNormalized( float[][] floats, Set deletedVectors) { return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats, deletedVectors) { @Override - public float[] vectorValue() throws IOException { - if (curDoc == -1 || curDoc >= floats.length) { - throw new IOException("Current doc not set or too many iterations"); - } - float[] v = ArrayUtil.copyArray(floats[curDoc]); + public float[] vectorValue(int ord) throws IOException { + float[] v = ArrayUtil.copyArray(floats[ordToDoc[ord]]); VectorUtil.l2normalize(v); return v; } diff --git a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java index 48eb7ce651c6..7f56688b7999 100644 --- a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java +++ b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java @@ -272,14 +272,27 @@ static TestSimpleFloatVectorValues fromFloatsWithRandomDeletions( static class TestSimpleFloatVectorValues extends FloatVectorValues { protected final float[][] floats; protected final Set deletedVectors; + protected final int[] ordToDoc; protected final int numLiveVectors; - protected int curDoc = -1; TestSimpleFloatVectorValues(float[][] values, Set deletedVectors) { this.floats = values; this.deletedVectors = deletedVectors; - this.numLiveVectors = + numLiveVectors = deletedVectors == null ? values.length : values.length - deletedVectors.size(); + ordToDoc = new int[numLiveVectors]; + if (deletedVectors == null) { + for (int i = 0; i < numLiveVectors; i++) { + ordToDoc[i] = i; + } + } else { + int ord = 0; + for (int doc = 0; doc < values.length; doc++) { + if (!deletedVectors.contains(doc)) { + ordToDoc[ord++] = doc; + } + } + } } @Override @@ -293,40 +306,64 @@ public int size() { } @Override - public float[] vectorValue() throws IOException { - if (curDoc == -1 || curDoc >= floats.length) { - throw new IOException("Current doc not set or too many iterations"); - } - return floats[curDoc]; + public float[] vectorValue(int ord) throws IOException { + return floats[ordToDoc(ord)]; } @Override - public int docID() { - if (curDoc >= floats.length) { - return NO_MORE_DOCS; - } - return curDoc; + public int ordToDoc(int ord) { + return ordToDoc[ord]; } @Override - public int nextDoc() throws IOException { - while (++curDoc < floats.length) { - if (deletedVectors == null || !deletedVectors.contains(curDoc)) { - return curDoc; + public DocIndexIterator iterator() { + return new DocIndexIterator() { + + int ord = -1; + int doc = -1; + + @Override + public int docID() { + return doc; } - } - return docID(); - } - @Override - public int advance(int target) throws IOException { - curDoc = target - 1; - return nextDoc(); + @Override + public int nextDoc() throws IOException { + while (doc < floats.length - 1) { + ++doc; + if (deletedVectors == null || !deletedVectors.contains(doc)) { + ++ord; + return doc; + } + } + return doc = NO_MORE_DOCS; + } + + @Override + public int index() { + return ord; + } + + @Override + public long cost() { + return floats.length - deletedVectors.size(); + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + }; } @Override public VectorScorer 
scorer(float[] target) { throw new UnsupportedOperationException(); } + + @Override + public TestSimpleFloatVectorValues copy() { + return this; + } } } diff --git a/lucene/demo/src/java/module-info.java b/lucene/demo/src/java/module-info.java index 13549a56385d..9dfc3dcb72d7 100644 --- a/lucene/demo/src/java/module-info.java +++ b/lucene/demo/src/java/module-info.java @@ -23,6 +23,7 @@ requires org.apache.lucene.queries; requires org.apache.lucene.queryparser; requires org.apache.lucene.expressions; + requires org.apache.lucene.sandbox; exports org.apache.lucene.demo; exports org.apache.lucene.demo.facet; diff --git a/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java b/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java index 17c2d6b1e6e6..51d00d24c938 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java @@ -179,7 +179,7 @@ public static void doPagingSearch( TopDocs results = searcher.search(query, 5 * hitsPerPage); ScoreDoc[] hits = results.scoreDocs; - int numTotalHits = Math.toIntExact(results.totalHits.value); + int numTotalHits = Math.toIntExact(results.totalHits.value()); System.out.println(numTotalHits + " total matching documents"); int start = 0; diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/AssociationsFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/AssociationsFacetsExample.java index f3982d0b5176..cd1084acb316 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/AssociationsFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/AssociationsFacetsExample.java @@ -25,6 +25,7 @@ import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.taxonomy.AssociationAggregationFunction; @@ -97,12 +98,13 @@ private List sumAssociations() throws IOException { IndexSearcher searcher = new IndexSearcher(indexReader); TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); - FacetsCollector fc = new FacetsCollector(); - // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); + FacetsCollectorManager.FacetsResult facetsResult = + FacetsCollectorManager.search( + searcher, new MatchAllDocsQuery(), 10, new FacetsCollectorManager()); + FacetsCollector fc = facetsResult.facetsCollector(); Facets tags = new TaxonomyFacetIntAssociations( @@ -133,8 +135,8 @@ private FacetResult drillDown() throws IOException { // Now user drills down on Publish Date/2010: q.add("tags", "solr"); - FacetsCollector fc = new FacetsCollector(); - FacetsCollector.search(searcher, q, 10, fc); + FacetsCollectorManager fcm = new FacetsCollectorManager(); + FacetsCollector fc = FacetsCollectorManager.search(searcher, q, 10, fcm).facetsCollector(); // Retrieve results Facets facets = diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/CustomFacetSetExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/CustomFacetSetExample.java index d3000b460370..ba157c6c8f9a 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/CustomFacetSetExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/CustomFacetSetExample.java 
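The demo updates above replace the deprecated pattern of passing a FacetsCollector into the static FacetsCollector.search(...) with FacetsCollectorManager.search(...), which returns a result object carrying the collected facet data. A condensed sketch of the new shape; FastTaxonomyFacetCounts and the "Author" dimension are illustrative stand-ins, and reader/taxonomy setup is omitted:

    FacetsCollectorManager fcm = new FacetsCollectorManager();
    // search(...) runs the query and bundles the outcome; facetsCollector() is what the
    // Facets implementations consume afterwards.
    FacetsCollector fc =
        FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm)
            .facetsCollector();
    Facets counts = new FastTaxonomyFacetCounts(taxoReader, config, fc);
    FacetResult topAuthors = counts.getTopChildren(10, "Author");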
@@ -379,7 +379,7 @@ protected TemperatureOnlyFacetSetMatcher(String label, DimRange temperatureRange @Override public boolean matches(long[] dimValues) { - return temperatureRange.min <= dimValues[1] && temperatureRange.max >= dimValues[1]; + return temperatureRange.min() <= dimValues[1] && temperatureRange.max() >= dimValues[1]; } } } diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/DynamicRangeFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/DynamicRangeFacetsExample.java new file mode 100644 index 000000000000..5b188d429275 --- /dev/null +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/DynamicRangeFacetsExample.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.demo.facet; + +import java.io.IOException; +import java.util.List; +import java.util.Locale; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.range.DynamicRangeUtil; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LongValuesSource; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.NamedThreadFactory; + +/** + * Demo dynamic range faceting. + * + *

    The results look like so: min: 63 max: 75 centroid: 69.000000 count: 2 weight: 137 min: 79 + * max: 96 centroid: 86.000000 count: 3 weight: 83 + * + *

    We've computed dynamic ranges over popularity weighted by number of books. We can read the + * results as follows: There are 137 books written by authors in the 63 to 75 popularity range. + * + *

    How it works: We collect all the values (popularity) and their weights (book counts). We sort + * the values and find the approximate weight per range. In this case the total weight is 220 (total + * books by all authors) and we want 2 ranges, so we're aiming for 110 books in each range. We add + * Chesterton to the first range, since he is the least popular author. He's written a lot of books, + * the range's weight is 90. We add Tolstoy to the first range, since he is next in line of + * popularity. He's written another 47 books, which brings the total weight to 137. We're over the + * 110 target weight, so we stop and add everyone left to the second range. + */ +public class DynamicRangeFacetsExample { + + private final Directory indexDir = new ByteBuffersDirectory(); + private final FacetsConfig config = new FacetsConfig(); + + /** Empty constructor */ + public DynamicRangeFacetsExample() {} + + /** Build the example index. */ + private void index() throws IOException { + IndexWriter indexWriter = + new IndexWriter( + indexDir, + new IndexWriterConfig(new WhitespaceAnalyzer()) + .setOpenMode(IndexWriterConfig.OpenMode.CREATE)); + + Document doc = new Document(); + doc.add(new StringField("Author", "J. R. R. Tolkien", Field.Store.NO)); + doc.add(new NumericDocValuesField("Popularity", 96)); + doc.add(new NumericDocValuesField("Books", 24)); + indexWriter.addDocument(config.build(doc)); + + doc = new Document(); + doc.add(new StringField("Author", "C. S. Lewis", Field.Store.NO)); + doc.add(new NumericDocValuesField("Popularity", 83)); + doc.add(new NumericDocValuesField("Books", 48)); + indexWriter.addDocument(config.build(doc)); + + doc = new Document(); + doc.add(new StringField("Author", "G. K. Chesterton", Field.Store.NO)); + doc.add(new NumericDocValuesField("Popularity", 63)); + doc.add(new NumericDocValuesField("Books", 90)); + indexWriter.addDocument(config.build(doc)); + indexWriter.commit(); + + doc = new Document(); + doc.add(new StringField("Author", "Fyodor Dostoevsky", Field.Store.NO)); + doc.add(new NumericDocValuesField("Popularity", 79)); + doc.add(new NumericDocValuesField("Books", 11)); + indexWriter.addDocument(config.build(doc)); + + doc = new Document(); + doc.add(new StringField("Author", "Leo Tolstoy", Field.Store.NO)); + doc.add(new NumericDocValuesField("Popularity", 75)); + doc.add(new NumericDocValuesField("Books", 47)); + indexWriter.addDocument(config.build(doc)); + + indexWriter.close(); + } + + /** User runs a query and counts facets. */ + private List search() throws IOException { + DirectoryReader indexReader = DirectoryReader.open(indexDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + + LongValuesSource valuesSource = LongValuesSource.fromLongField("Popularity"); + LongValuesSource weightsSource = LongValuesSource.fromLongField("Books"); + + // Aggregates the facet counts + FacetsCollectorManager fcm = new FacetsCollectorManager(); + + // MatchAllDocsQuery is for "browsing" (counts facets + // for all non-deleted docs in the index); normally + // you'd use a "normal" query: + FacetsCollector fc = + FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm).facetsCollector(); + + try (ExecutorService executor = + Executors.newFixedThreadPool(2, new NamedThreadFactory("dynamic-ranges"))) { + // We ask for 2 ranges over popularity weighted by book count + return DynamicRangeUtil.computeDynamicRanges( + "Books", weightsSource, valuesSource, fc, 2, executor); + } + } + + /** Runs the search example. 
*/ + public List runSearch() throws IOException { + index(); + return search(); + } + + /** Runs the search example and prints the results. */ + public static void main(String[] args) throws Exception { + System.out.println("Dynamic range facets example:"); + System.out.println("-----------------------"); + DynamicRangeFacetsExample example = new DynamicRangeFacetsExample(); + List results = example.runSearch(); + for (DynamicRangeUtil.DynamicRangeInfo range : results) { + System.out.printf( + Locale.ROOT, + "min: %d max: %d centroid: %f count: %d weight: %d%n", + range.min(), + range.max(), + range.centroid(), + range.count(), + range.weight()); + } + } +} diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/ExpressionAggregationFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/ExpressionAggregationFacetsExample.java index 87a8400b3b75..059d5505ace1 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/ExpressionAggregationFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/ExpressionAggregationFacetsExample.java @@ -30,6 +30,7 @@ import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.taxonomy.AssociationAggregationFunction; import org.apache.lucene.facet.taxonomy.TaxonomyFacetFloatAssociations; @@ -97,12 +98,13 @@ private FacetResult search() throws IOException, ParseException { DoubleValuesSource.fromLongField("popularity")); // the value of the 'popularity' field // Aggregates the facet values - FacetsCollector fc = new FacetsCollector(true); + FacetsCollectorManager fcm = new FacetsCollectorManager(true); // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm).facetsCollector(); // Retrieve results Facets facets = diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/MultiCategoryListsFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/MultiCategoryListsFacetsExample.java index 964bd58ad00a..fd21cd809ae5 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/MultiCategoryListsFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/MultiCategoryListsFacetsExample.java @@ -25,6 +25,7 @@ import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts; import org.apache.lucene.facet.taxonomy.TaxonomyReader; @@ -97,12 +98,13 @@ private List search() throws IOException { IndexSearcher searcher = new IndexSearcher(indexReader); TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); - FacetsCollector fc = new FacetsCollector(); + FacetsCollectorManager fcm = new FacetsCollectorManager(); // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, 
fcm).facetsCollector(); // Retrieve results List results = new ArrayList<>(); diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/RangeFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/RangeFacetsExample.java index df87fd879616..b61096215e65 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/RangeFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/RangeFacetsExample.java @@ -30,6 +30,7 @@ import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.range.LongRange; import org.apache.lucene.facet.range.LongRangeFacetCounts; @@ -115,13 +116,13 @@ private FacetsConfig getConfig() { /** User runs a query and counts facets. */ public FacetResult search() throws IOException { - // Aggregates the facet counts - FacetsCollector fc = new FacetsCollector(); - // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search( + searcher, new MatchAllDocsQuery(), 10, new FacetsCollectorManager()) + .facetsCollector(); Facets facets = new LongRangeFacetCounts("timestamp", fc, PAST_HOUR, PAST_SIX_HOURS, PAST_DAY); return facets.getAllChildren("timestamp"); @@ -131,12 +132,13 @@ public FacetResult search() throws IOException { public FacetResult searchTopChildren() throws IOException { // Aggregates the facet counts - FacetsCollector fc = new FacetsCollector(); + FacetsCollectorManager fcm = new FacetsCollectorManager(); // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm).facetsCollector(); Facets facets = new LongRangeFacetCounts("error timestamp", fc, logTimestampRanges); return facets.getTopChildren(10, "error timestamp"); diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java new file mode 100644 index 000000000000..bddc41146463 --- /dev/null +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java @@ -0,0 +1,731 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.demo.facet; + +import static org.apache.lucene.facet.FacetsConfig.DEFAULT_INDEX_FIELD_NAME; +import static org.apache.lucene.sandbox.facet.ComparableUtils.byAggregatedValue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoubleDocValuesField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.facet.DrillDownQuery; +import org.apache.lucene.facet.DrillSideways; +import org.apache.lucene.facet.FacetField; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.MultiLongValuesSource; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.sandbox.facet.ComparableUtils; +import org.apache.lucene.sandbox.facet.FacetFieldCollectorManager; +import org.apache.lucene.sandbox.facet.cutters.TaxonomyFacetsCutter; +import org.apache.lucene.sandbox.facet.cutters.ranges.LongRangeFacetCutter; +import org.apache.lucene.sandbox.facet.iterators.ComparableSupplier; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.TaxonomyChildrenOrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.TopnOrdinalIterator; +import org.apache.lucene.sandbox.facet.labels.RangeOrdToLabel; +import org.apache.lucene.sandbox.facet.labels.TaxonomyOrdLabelBiMap; +import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.LongAggregationsFacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.MultiFacetsRecorder; +import org.apache.lucene.sandbox.facet.recorders.Reducer; +import org.apache.lucene.search.DoubleValuesSource; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LongValuesSource; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiCollectorManager; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopScoreDocCollectorManager; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; + +/** Demo for sandbox faceting. */ +public class SandboxFacetsExample { + + private final Directory indexDir = new ByteBuffersDirectory(); + private final Directory taxoDir = new ByteBuffersDirectory(); + private final FacetsConfig config = new FacetsConfig(); + + private SandboxFacetsExample() { + config.setHierarchical("Publish Date", true); + } + + /** Build the example index. 
*/ + void index() throws IOException { + IndexWriter indexWriter = + new IndexWriter( + indexDir, new IndexWriterConfig(new WhitespaceAnalyzer()).setOpenMode(OpenMode.CREATE)); + + // Writes facet ords to a separate directory from the main index + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); + + Document doc = new Document(); + doc.add(new FacetField("Author", "Bob")); + doc.add(new FacetField("Publish Date", "2010", "10", "15")); + doc.add(new NumericDocValuesField("Price", 10)); + doc.add(new NumericDocValuesField("Units", 9)); + doc.add(new DoubleDocValuesField("Popularity", 3.5d)); + indexWriter.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2010", "10", "20")); + doc.add(new NumericDocValuesField("Price", 4)); + doc.add(new NumericDocValuesField("Units", 2)); + doc.add(new DoubleDocValuesField("Popularity", 4.1D)); + indexWriter.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2012", "1", "1")); + doc.add(new NumericDocValuesField("Price", 3)); + doc.add(new NumericDocValuesField("Units", 5)); + doc.add(new DoubleDocValuesField("Popularity", 3.9D)); + indexWriter.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Susan")); + doc.add(new FacetField("Publish Date", "2012", "1", "7")); + doc.add(new NumericDocValuesField("Price", 8)); + doc.add(new NumericDocValuesField("Units", 7)); + doc.add(new DoubleDocValuesField("Popularity", 4D)); + indexWriter.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Frank")); + doc.add(new FacetField("Publish Date", "1999", "5", "5")); + doc.add(new NumericDocValuesField("Price", 9)); + doc.add(new NumericDocValuesField("Units", 6)); + doc.add(new DoubleDocValuesField("Popularity", 4.9D)); + indexWriter.addDocument(config.build(taxoWriter, doc)); + + IOUtils.close(indexWriter, taxoWriter); + } + + /** User runs a query and counts facets only without collecting the matching documents. */ + List facetsOnly() throws IOException { + //// (1) init readers and searcher + DirectoryReader indexReader = DirectoryReader.open(indexDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + + //// (2) init collector + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader); + CountFacetRecorder defaultRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, defaultRecorder); + + // (2.1) if we need to collect data using multiple different collectors, e.g. taxonomy and + // ranges, or even two taxonomy facets that use different Category List Field, we can + // use MultiCollectorManager, e.g.: + // + // TODO: add a demo for it. 
+ // TaxonomyFacetsCutter publishDateCutter = new + // TaxonomyFacetsCutter(config.getDimConfig("Publish Date"), taxoReader); + // CountFacetRecorder publishDateRecorder = new CountFacetRecorder(false); + // FacetFieldCollectorManager publishDateCollectorManager = new + // FacetFieldCollectorManager<>(publishDateCutter, publishDateRecorder); + // MultiCollectorManager drillDownCollectorManager = new + // MultiCollectorManager(authorCollectorManager, publishDateCollectorManager); + // Object[] results = searcher.search(new MatchAllDocsQuery(), drillDownCollectorManager); + + //// (3) search + // Search returns the same Recorder we created - so we can ignore results + searcher.search(new MatchAllDocsQuery(), collectorManager); + + //// (4) Get top 10 results by count for Author and Publish Date + // This object is used to get topN results by count + ComparableSupplier countComparable = + ComparableUtils.byCount(defaultRecorder); + // We don't actually need to use FacetResult, it is up to client what to do with the results. + // Here we just want to demo that we can still do FacetResult as well + List results = new ArrayList<>(2); + // This object provides labels for ordinals. + TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader); + for (String dimension : List.of("Author", "Publish Date")) { + //// (4.1) Chain two ordinal iterators to get top N children + int dimOrdinal = ordLabels.getOrd(new FacetLabel(dimension)); + OrdinalIterator childrenIterator = + new TaxonomyChildrenOrdinalIterator( + defaultRecorder.recordedOrds(), + taxoReader.getParallelTaxonomyArrays().parents(), + dimOrdinal); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(childrenIterator, countComparable, 10); + // Get array of final ordinals - we need to use all of them to get labels first, and then to + // get counts, + // but OrdinalIterator only allows reading ordinals once. 
+ int[] resultOrdinals = topByCountOrds.toArray(); + + //// (4.2) Use faceting results + FacetLabel[] labels = ordLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue( + labels[i].lastComponent(), defaultRecorder.getCount(resultOrdinals[i]))); + } + int dimensionValue = defaultRecorder.getCount(dimOrdinal); + results.add( + new FacetResult( + dimension, + new String[0], + dimensionValue, + labelsAndValues.toArray(new LabelAndValue[0]), + labelsAndValues.size())); + } + + IOUtils.close(indexReader, taxoReader); + return results; + } + + /** + * User runs a query and counts facets for exclusive ranges without collecting the matching + * documents + */ + List exclusiveRangesCountFacetsOnly() throws IOException { + DirectoryReader indexReader = DirectoryReader.open(indexDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("Price"); + + // Exclusive ranges example + LongRange[] inputRanges = new LongRange[2]; + inputRanges[0] = new LongRange("0-5", 0, true, 5, true); + inputRanges[1] = new LongRange("5-10", 5, false, 10, true); + + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + searcher.search(new MatchAllDocsQuery(), collectorManager); + RangeOrdToLabel ordToLabels = new RangeOrdToLabel(inputRanges); + + ComparableSupplier countComparable = + ComparableUtils.byCount(countRecorder); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(countRecorder.recordedOrds(), countComparable, 10); + + List results = new ArrayList<>(2); + + int[] resultOrdinals = topByCountOrds.toArray(); + FacetLabel[] labels = ordToLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue(labels[i].lastComponent(), countRecorder.getCount(resultOrdinals[i]))); + } + + results.add( + new FacetResult( + "Price", new String[0], 0, labelsAndValues.toArray(new LabelAndValue[0]), 0)); + + System.out.println("Computed counts"); + IOUtils.close(indexReader); + return results; + } + + List overlappingRangesCountFacetsOnly() throws IOException { + DirectoryReader indexReader = DirectoryReader.open(indexDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("Price"); + + // overlapping ranges example + LongRange[] inputRanges = new LongRange[2]; + inputRanges[0] = new LongRange("0-5", 0, true, 5, true); + inputRanges[1] = new LongRange("0-10", 0, true, 10, true); + + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + searcher.search(new MatchAllDocsQuery(), collectorManager); + RangeOrdToLabel ordToLabels = new RangeOrdToLabel(inputRanges); + + ComparableSupplier countComparable = + ComparableUtils.byCount(countRecorder); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(countRecorder.recordedOrds(), 
countComparable, 10); + + List results = new ArrayList<>(2); + + int[] resultOrdinals = topByCountOrds.toArray(); + FacetLabel[] labels = ordToLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue(labels[i].lastComponent(), countRecorder.getCount(resultOrdinals[i]))); + } + + results.add( + new FacetResult( + "Price", new String[0], 0, labelsAndValues.toArray(new LabelAndValue[0]), 0)); + + System.out.println("Computed counts"); + IOUtils.close(indexReader); + return results; + } + + List exclusiveRangesAggregationFacets() throws IOException { + DirectoryReader indexReader = DirectoryReader.open(indexDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("Price"); + + // Exclusive ranges example + LongRange[] inputRanges = new LongRange[2]; + inputRanges[0] = new LongRange("0-5", 0, true, 5, true); + inputRanges[1] = new LongRange("5-10", 5, false, 10, true); + + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + + // initialise the aggregations to be computed - a values source + reducer + LongValuesSource[] longValuesSources = new LongValuesSource[2]; + Reducer[] reducers = new Reducer[2]; + // popularity:max + longValuesSources[0] = DoubleValuesSource.fromDoubleField("Popularity").toLongValuesSource(); + reducers[0] = Reducer.MAX; + // units:sum + longValuesSources[1] = LongValuesSource.fromLongField("Units"); + reducers[1] = Reducer.SUM; + + LongAggregationsFacetRecorder longAggregationsFacetRecorder = + new LongAggregationsFacetRecorder(longValuesSources, reducers); + + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + // Compute both counts and aggregations + MultiFacetsRecorder multiFacetsRecorder = + new MultiFacetsRecorder(countRecorder, longAggregationsFacetRecorder); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, multiFacetsRecorder); + searcher.search(new MatchAllDocsQuery(), collectorManager); + RangeOrdToLabel ordToLabels = new RangeOrdToLabel(inputRanges); + + // Get recorded ords - use either count/aggregations recorder + OrdinalIterator recordedOrds = longAggregationsFacetRecorder.recordedOrds(); + + // We don't actually need to use FacetResult, it is up to client what to do with the results. 
+ // Here we just want to demo that we can still do FacetResult as well + List results = new ArrayList<>(2); + ComparableSupplier comparableSupplier; + OrdinalIterator topOrds; + int[] resultOrdinals; + FacetLabel[] labels; + List labelsAndValues; + + // Sort results by units:sum and tie-break by count + comparableSupplier = byAggregatedValue(countRecorder, longAggregationsFacetRecorder, 1); + topOrds = new TopnOrdinalIterator<>(recordedOrds, comparableSupplier, 10); + + resultOrdinals = topOrds.toArray(); + labels = ordToLabels.getLabels(resultOrdinals); + labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue( + labels[i].lastComponent(), + longAggregationsFacetRecorder.getRecordedValue(resultOrdinals[i], 1))); + } + results.add( + new FacetResult( + "Price", new String[0], 0, labelsAndValues.toArray(new LabelAndValue[0]), 0)); + + // note: previous ordinal iterator was exhausted + recordedOrds = longAggregationsFacetRecorder.recordedOrds(); + // Sort results by popularity:max and tie-break by count + comparableSupplier = byAggregatedValue(countRecorder, longAggregationsFacetRecorder, 0); + topOrds = new TopnOrdinalIterator<>(recordedOrds, comparableSupplier, 10); + resultOrdinals = topOrds.toArray(); + labels = ordToLabels.getLabels(resultOrdinals); + labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue( + labels[i].lastComponent(), + longAggregationsFacetRecorder.getRecordedValue(resultOrdinals[i], 0))); + } + results.add( + new FacetResult( + "Price", new String[0], 0, labelsAndValues.toArray(new LabelAndValue[0]), 0)); + + return results; + } + + /** User runs a query and counts facets. */ + private List facetsWithSearch() throws IOException { + //// (1) init readers and searcher + DirectoryReader indexReader = DirectoryReader.open(indexDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + + //// (2) init collectors + // Facet collectors + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader); + CountFacetRecorder defaultRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager taxoFacetsCollectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, defaultRecorder); + // Hits collector + TopScoreDocCollectorManager hitsCollectorManager = + new TopScoreDocCollectorManager(2, Integer.MAX_VALUE); + // Now wrap them with MultiCollectorManager to collect both hits and facets. + MultiCollectorManager collectorManager = + new MultiCollectorManager(hitsCollectorManager, taxoFacetsCollectorManager); + + //// (3) search + Object[] results = searcher.search(new MatchAllDocsQuery(), collectorManager); + TopDocs topDocs = (TopDocs) results[0]; + System.out.println( + "Search results: totalHits: " + + topDocs.totalHits + + ", collected hits: " + + topDocs.scoreDocs.length); + // FacetFieldCollectorManager returns the same Recorder it gets - so we can ignore read the + // results from original recorder + // and ignore this value. 
+ // CountFacetRecorder defaultRecorder = (CountFacetRecorder) results[1]; + + //// (4) Get top 10 results by count for Author and Publish Date + // This object is used to get topN results by count + ComparableSupplier countComparable = + ComparableUtils.byCount(defaultRecorder); + // We don't actually need to use FacetResult, it is up to client what to do with the results. + // Here we just want to demo that we can still do FacetResult as well + List facetResults = new ArrayList<>(2); + // This object provides labels for ordinals. + TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader); + for (String dimension : List.of("Author", "Publish Date")) { + int dimensionOrdinal = ordLabels.getOrd(new FacetLabel(dimension)); + //// (4.1) Chain two ordinal iterators to get top N children + OrdinalIterator childrenIterator = + new TaxonomyChildrenOrdinalIterator( + defaultRecorder.recordedOrds(), + taxoReader.getParallelTaxonomyArrays().parents(), + dimensionOrdinal); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(childrenIterator, countComparable, 10); + // Get array of final ordinals - we need to use all of them to get labels first, and then to + // get counts, + // but OrdinalIterator only allows reading ordinals once. + int[] resultOrdinals = topByCountOrds.toArray(); + + //// (4.2) Use faceting results + FacetLabel[] labels = ordLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue( + labels[i].lastComponent(), defaultRecorder.getCount(resultOrdinals[i]))); + } + int dimensionValue = defaultRecorder.getCount(dimensionOrdinal); + facetResults.add( + new FacetResult( + dimension, + new String[0], + dimensionValue, + labelsAndValues.toArray(new LabelAndValue[0]), + labelsAndValues.size())); + } + + IOUtils.close(indexReader, taxoReader); + return facetResults; + } + + /** User drills down on 'Publish Date/2010', and we return facets for 'Author' */ + FacetResult drillDown() throws IOException { + //// (1) init readers and searcher + DirectoryReader indexReader = DirectoryReader.open(indexDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + + //// (2) init collector + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader); + CountFacetRecorder defaultRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, defaultRecorder); + + DrillDownQuery q = new DrillDownQuery(config); + q.add("Publish Date", "2010"); + + //// (3) search + // Right now we return the same Recorder we created - so we can ignore results + searcher.search(q, collectorManager); + + //// (4) Get top 10 results by count for Author and Publish Date + // This object is used to get topN results by count + ComparableSupplier countComparable = + ComparableUtils.byCount(defaultRecorder); + + // This object provides labels for ordinals. 
+ TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader); + String dimension = "Author"; + //// (4.1) Chain two ordinal iterators to get top N children + int dimOrdinal = ordLabels.getOrd(new FacetLabel(dimension)); + OrdinalIterator childrenIterator = + new TaxonomyChildrenOrdinalIterator( + defaultRecorder.recordedOrds(), + taxoReader.getParallelTaxonomyArrays().parents(), + dimOrdinal); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(childrenIterator, countComparable, 10); + // Get array of final ordinals - we need to use all of them to get labels first, and then to get + // counts, + // but OrdinalIterator only allows reading ordinals once. + int[] resultOrdinals = topByCountOrds.toArray(); + + //// (4.2) Use faceting results + FacetLabel[] labels = ordLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue( + labels[i].lastComponent(), defaultRecorder.getCount(resultOrdinals[i]))); + } + + IOUtils.close(indexReader, taxoReader); + int dimensionValue = defaultRecorder.getCount(dimOrdinal); + // We don't actually need to use FacetResult, it is up to client what to do with the results. + // Here we just want to demo that we can still do FacetResult as well + return new FacetResult( + dimension, + new String[0], + dimensionValue, + labelsAndValues.toArray(new LabelAndValue[0]), + labelsAndValues.size()); + } + + /** + * User drills down on 'Publish Date/2010', and we return facets for both 'Publish Date' and + * 'Author', using DrillSideways. + */ + private List drillSideways() throws IOException { + //// (1) init readers and searcher + DirectoryReader indexReader = DirectoryReader.open(indexDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + + //// (2) init drill down query and collectors + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader); + CountFacetRecorder drillDownRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager drillDownCollectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, drillDownRecorder); + + DrillDownQuery q = new DrillDownQuery(config); + + //// (2.1) add query and collector dimensions + q.add("Publish Date", "2010"); + CountFacetRecorder publishDayDimensionRecorder = new CountFacetRecorder(); + // Note that it is safe to use the same FacetsCutter here because we create Leaf cutter for each + // leaf for each + // FacetFieldCollectorManager anyway, and leaf cutter are not merged or anything like that. + FacetFieldCollectorManager publishDayDimensionCollectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, publishDayDimensionRecorder); + List> drillSidewaysManagers = + List.of(publishDayDimensionCollectorManager); + + //// (3) search + // Right now we return the same Recorder we created - so we can ignore results + DrillSideways ds = new DrillSideways(searcher, config, taxoReader); + ds.search(q, drillDownCollectorManager, drillSidewaysManagers); + + //// (4) Get top 10 results by count for Author + List facetResults = new ArrayList<>(2); + // This object provides labels for ordinals. 
+ TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader); + // This object is used to get topN results by count + ComparableSupplier countComparable = + ComparableUtils.byCount(drillDownRecorder); + //// (4.1) Chain two ordinal iterators to get top N children + int dimOrdinal = ordLabels.getOrd(new FacetLabel("Author")); + OrdinalIterator childrenIterator = + new TaxonomyChildrenOrdinalIterator( + drillDownRecorder.recordedOrds(), + taxoReader.getParallelTaxonomyArrays().parents(), + dimOrdinal); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(childrenIterator, countComparable, 10); + // Get array of final ordinals - we need to use all of them to get labels first, and then to get + // counts, + // but OrdinalIterator only allows reading ordinals once. + int[] resultOrdinals = topByCountOrds.toArray(); + + //// (4.2) Use faceting results + FacetLabel[] labels = ordLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue( + labels[i].lastComponent(), drillDownRecorder.getCount(resultOrdinals[i]))); + } + int dimensionValue = drillDownRecorder.getCount(dimOrdinal); + facetResults.add( + new FacetResult( + "Author", + new String[0], + dimensionValue, + labelsAndValues.toArray(new LabelAndValue[0]), + labelsAndValues.size())); + + //// (5) Same process, but for Publish Date drill sideways dimension + countComparable = ComparableUtils.byCount(publishDayDimensionRecorder); + //// (4.1) Chain two ordinal iterators to get top N children + dimOrdinal = ordLabels.getOrd(new FacetLabel("Publish Date")); + childrenIterator = + new TaxonomyChildrenOrdinalIterator( + publishDayDimensionRecorder.recordedOrds(), + taxoReader.getParallelTaxonomyArrays().parents(), + dimOrdinal); + topByCountOrds = new TopnOrdinalIterator<>(childrenIterator, countComparable, 10); + // Get array of final ordinals - we need to use all of them to get labels first, and then to get + // counts, + // but OrdinalIterator only allows reading ordinals once. + resultOrdinals = topByCountOrds.toArray(); + + //// (4.2) Use faceting results + labels = ordLabels.getLabels(resultOrdinals); + labelsAndValues = new ArrayList<>(labels.length); + for (int i = 0; i < resultOrdinals.length; i++) { + labelsAndValues.add( + new LabelAndValue( + labels[i].lastComponent(), publishDayDimensionRecorder.getCount(resultOrdinals[i]))); + } + dimensionValue = publishDayDimensionRecorder.getCount(dimOrdinal); + facetResults.add( + new FacetResult( + "Publish Date", + new String[0], + dimensionValue, + labelsAndValues.toArray(new LabelAndValue[0]), + labelsAndValues.size())); + + IOUtils.close(indexReader, taxoReader); + return facetResults; + } + + /** Runs the search example. */ + public List runFacetOnly() throws IOException { + index(); + return facetsOnly(); + } + + /** Runs the search example. */ + public List runSearch() throws IOException { + index(); + return facetsWithSearch(); + } + + /** Runs the drill-down example. */ + public FacetResult runDrillDown() throws IOException { + index(); + return drillDown(); + } + + /** Runs the drill-sideways example. 
*/ + public List runDrillSideways() throws IOException { + index(); + return drillSideways(); + } + + /** Runs the example of non overlapping range facets */ + public List runNonOverlappingRangesCountFacetsOnly() throws IOException { + index(); + return exclusiveRangesCountFacetsOnly(); + } + + /** Runs the example of overlapping range facets */ + public List runOverlappingRangesCountFacetsOnly() throws IOException { + index(); + return overlappingRangesCountFacetsOnly(); + } + + /** Runs the example of collecting long aggregations for non overlapping range facets. */ + public List runNonOverlappingRangesAggregationFacets() throws IOException { + index(); + return exclusiveRangesAggregationFacets(); + } + + /** Runs the search and drill-down examples and prints the results. */ + public static void main(String[] args) throws Exception { + System.out.println("Facet counting example:"); + System.out.println("-----------------------"); + SandboxFacetsExample example = new SandboxFacetsExample(); + List results1 = example.runFacetOnly(); + System.out.println("Author: " + results1.get(0)); + System.out.println("Publish Date: " + results1.get(1)); + + System.out.println("Facet counting example (combined facets and search):"); + System.out.println("-----------------------"); + List results = example.runSearch(); + System.out.println("Author: " + results.get(0)); + System.out.println("Publish Date: " + results.get(1)); + + System.out.println("Facet drill-down example (Publish Date/2010):"); + System.out.println("---------------------------------------------"); + System.out.println("Author: " + example.runDrillDown()); + + System.out.println("Facet drill-sideways example (Publish Date/2010):"); + System.out.println("---------------------------------------------"); + for (FacetResult result : example.runDrillSideways()) { + System.out.println(result); + } + + System.out.println("Facet counting example with exclusive ranges:"); + System.out.println("---------------------------------------------"); + for (FacetResult result : example.runNonOverlappingRangesCountFacetsOnly()) { + System.out.println(result); + } + + System.out.println("Facet counting example with overlapping ranges:"); + System.out.println("---------------------------------------------"); + for (FacetResult result : example.runOverlappingRangesCountFacetsOnly()) { + System.out.println(result); + } + + System.out.println("Facet aggregation example with exclusive ranges:"); + System.out.println("---------------------------------------------"); + for (FacetResult result : example.runNonOverlappingRangesAggregationFacets()) { + System.out.println(result); + } + } +} diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleFacetsExample.java index 980bc64613d3..5585e431e06b 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleFacetsExample.java @@ -99,12 +99,13 @@ List facetsWithSearch() throws IOException { IndexSearcher searcher = new IndexSearcher(indexReader); TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); - FacetsCollector fc = new FacetsCollector(); + FacetsCollectorManager fcm = new FacetsCollectorManager(); // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); + FacetsCollector fc = + 
FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm).facetsCollector(); // Retrieve results List results = new ArrayList<>(); @@ -156,8 +157,8 @@ FacetResult drillDown() throws IOException { // Now user drills down on Publish Date/2010: q.add("Publish Date", "2010"); - FacetsCollector fc = new FacetsCollector(); - FacetsCollector.search(searcher, q, 10, fc); + FacetsCollectorManager fcm = new FacetsCollectorManager(); + FacetsCollector fc = FacetsCollectorManager.search(searcher, q, 10, fcm).facetsCollector(); // Retrieve results Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc); diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java index 2c7c20d35f3f..cdb64a822bb3 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java @@ -25,6 +25,7 @@ import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; @@ -92,12 +93,13 @@ private List search() throws IOException { new DefaultSortedSetDocValuesReaderState(indexReader, config); // Aggregates the facet counts - FacetsCollector fc = new FacetsCollector(); + FacetsCollectorManager fcm = new FacetsCollectorManager(); // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm).facetsCollector(); // Retrieve results Facets facets = new SortedSetDocValuesFacetCounts(state, fc); @@ -120,8 +122,8 @@ private FacetResult drillDown() throws IOException { // Now user drills down on Publish Year/2010: DrillDownQuery q = new DrillDownQuery(config); q.add("Publish Year", "2010"); - FacetsCollector fc = new FacetsCollector(); - FacetsCollector.search(searcher, q, 10, fc); + FacetsCollectorManager fcm = new FacetsCollectorManager(); + FacetsCollector fc = FacetsCollectorManager.search(searcher, q, 10, fcm).facetsCollector(); // Retrieve results Facets facets = new SortedSetDocValuesFacetCounts(state, fc); diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/StringValueFacetCountsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/StringValueFacetCountsExample.java index 9f2feb705e7e..cd848d24afb7 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/StringValueFacetCountsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/StringValueFacetCountsExample.java @@ -27,6 +27,7 @@ import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.StringDocValuesReaderState; import org.apache.lucene.facet.StringValueFacetCounts; @@ -96,12 +97,13 @@ private List search() throws IOException { new StringDocValuesReaderState(indexReader, "Publish Year"); // Aggregates the facet counts - 
FacetsCollector fc = new FacetsCollector(); + FacetsCollectorManager fcm = new FacetsCollectorManager(); // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm).facetsCollector(); // Retrieve results Facets authorFacets = new StringValueFacetCounts(authorState, fc); diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/package-info.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/package-info.java index e83398650b17..2fbf29999ce1 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/package-info.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/package-info.java @@ -198,9 +198,9 @@ * org.apache.lucene.search.Collector}, and as such can be passed to the search() method of Lucene's * {@link org.apache.lucene.search.IndexSearcher}. In case the application also needs to collect * documents (in addition to accumulating/collecting facets), you can use one of {@link - * org.apache.lucene.facet.FacetsCollector#search(org.apache.lucene.search.IndexSearcher, - * org.apache.lucene.search.Query, int, org.apache.lucene.search.Collector) - * FacetsCollector.search(...)} utility methods. + * org.apache.lucene.facet.FacetsCollectorManager#search(org.apache.lucene.search.IndexSearcher, + * org.apache.lucene.search.Query, int, org.apache.lucene.facet.FacetsCollectorManager) + * FacetsCollectorManager.search(...)} utility methods. * *

    There is a facets collecting code example in {@link * org.apache.lucene.demo.facet.SimpleFacetsExample#facetsWithSearch()}. Sampling support is implemented in {@link * org.apache.lucene.facet.RandomSamplingFacetsCollector}. * + *
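For readers updating their own code, here is a minimal sketch of the migration pattern applied throughout the demo changes in this patch: a FacetsCollectorManager is passed to the FacetsCollectorManager.search(...) utility, and the resulting FacetsCollector is handed to a Facets implementation. The helper name countAuthors, its parameters, and the "Author" dimension are illustrative assumptions taken from SimpleFacetsExample, not part of the patch itself.

  // Editor's sketch, not part of the patch. Assumes imports from org.apache.lucene.facet,
  // org.apache.lucene.facet.taxonomy and org.apache.lucene.search, plus an index and taxonomy
  // built as in SimpleFacetsExample.
  static FacetResult countAuthors(
      IndexSearcher searcher, TaxonomyReader taxoReader, FacetsConfig config) throws IOException {
    // One search pass collects both the top hits and the facet ordinals.
    FacetsCollectorManager fcm = new FacetsCollectorManager();
    FacetsCollector fc =
        FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm).facetsCollector();
    // Count taxonomy facets from the collected matches and return the top "Author" values.
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
    return facets.getTopChildren(10, "Author");
  }

The same .facetsCollector() accessor is used wherever a plain FacetsCollector is still needed downstream, as in the drill-down examples above.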

    Dynamic Range Facets

    + * + * We can build ranges over numeric fields and count the number of values falling in each range. The + * values can be weighted and the number of desired ranges can be specified. To see an example, + * check {@link org.apache.lucene.demo.facet.DynamicRangeFacetsExample}. + * * * *
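The call sequence behind this, condensed from the new DynamicRangeFacetsExample added by this patch: collect matching documents once, then pass the collected hits together with a value source and a weight source to DynamicRangeUtil.computeDynamicRanges. The helper name popularityRanges and the assumption that the searcher points at that example's "Popularity"/"Books" index are the editor's, not the patch's.

  // Editor's sketch, not part of the patch. Mirrors DynamicRangeFacetsExample: range values come
  // from the "Popularity" field, per-document weights from the "Books" field.
  static List<DynamicRangeUtil.DynamicRangeInfo> popularityRanges(IndexSearcher searcher)
      throws IOException {
    LongValuesSource values = LongValuesSource.fromLongField("Popularity"); // range dimension
    LongValuesSource weights = LongValuesSource.fromLongField("Books");     // per-doc weight
    FacetsCollector fc =
        FacetsCollectorManager.search(
                searcher, new MatchAllDocsQuery(), 10, new FacetsCollectorManager())
            .facetsCollector();
    try (ExecutorService executor =
        Executors.newFixedThreadPool(2, new NamedThreadFactory("dynamic-ranges"))) {
      // Asks for 2 ranges whose total weights are as close to equal as the data allows.
      return DynamicRangeUtil.computeDynamicRanges("Books", weights, values, fc, 2, executor);
    }
  }

On the five-author index built by that example, this yields the two ranges quoted in its class javadoc (counts 2 and 3, weights 137 and 83).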

    Sometimes, indexing is done once, and when the index is fully prepared, searching starts. diff --git a/lucene/demo/src/java/overview.html b/lucene/demo/src/java/overview.html index fd62e2b45f79..f904aca8380c 100644 --- a/lucene/demo/src/java/overview.html +++ b/lucene/demo/src/java/overview.html @@ -215,6 +215,17 @@

    Working with vector embeddings

    a more complete set of embeddings is needed to get reasonable results.

    +

    Working with facets

    +
    - diff --git a/lucene/demo/src/test/org/apache/lucene/demo/facet/TestDistanceFacetsExample.java b/lucene/demo/src/test/org/apache/lucene/demo/facet/TestDistanceFacetsExample.java index 5df2e63b2ef7..cd9fc0100681 100644 --- a/lucene/demo/src/test/org/apache/lucene/demo/facet/TestDistanceFacetsExample.java +++ b/lucene/demo/src/test/org/apache/lucene/demo/facet/TestDistanceFacetsExample.java @@ -36,7 +36,7 @@ public void testDrillDown() throws Exception { DistanceFacetsExample example = new DistanceFacetsExample(); example.index(); TopDocs hits = example.drillDown(example.FIVE_KM); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); example.close(); } } diff --git a/lucene/demo/src/test/org/apache/lucene/demo/facet/TestDynamicRangeFacetsExample.java b/lucene/demo/src/test/org/apache/lucene/demo/facet/TestDynamicRangeFacetsExample.java new file mode 100644 index 000000000000..8724f2dcf273 --- /dev/null +++ b/lucene/demo/src/test/org/apache/lucene/demo/facet/TestDynamicRangeFacetsExample.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.demo.facet; + +import java.util.List; +import org.apache.lucene.facet.range.DynamicRangeUtil; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.junit.Test; + +public class TestDynamicRangeFacetsExample extends LuceneTestCase { + @Test + public void testExample() throws Exception { + List res = new DynamicRangeFacetsExample().runSearch(); + assertEquals( + List.of( + new DynamicRangeUtil.DynamicRangeInfo(2, 137, 63, 75, 69d), + new DynamicRangeUtil.DynamicRangeInfo(3, 83, 79, 96, 86)), + res); + } +} diff --git a/lucene/demo/src/test/org/apache/lucene/demo/facet/TestRangeFacetsExample.java b/lucene/demo/src/test/org/apache/lucene/demo/facet/TestRangeFacetsExample.java index 9efdcf5356b8..f7fde4c974c5 100644 --- a/lucene/demo/src/test/org/apache/lucene/demo/facet/TestRangeFacetsExample.java +++ b/lucene/demo/src/test/org/apache/lucene/demo/facet/TestRangeFacetsExample.java @@ -55,7 +55,7 @@ public void testDrillDown() throws Exception { RangeFacetsExample example = new RangeFacetsExample(); example.index(); TopDocs hits = example.drillDown(example.PAST_SIX_HOURS); - assertEquals(22, hits.totalHits.value); + assertEquals(22, hits.totalHits.value()); example.close(); } } diff --git a/lucene/distribution/binary-release.gradle b/lucene/distribution/binary-release.gradle index 5c03ac3a82a2..c3365e961abf 100644 --- a/lucene/distribution/binary-release.gradle +++ b/lucene/distribution/binary-release.gradle @@ -37,9 +37,9 @@ configure(project(":lucene:distribution")) { // Maven-published submodule JARs are part of the binary distribution. // We don't copy their transitive dependencies. 
- def binaryModules = rootProject.ext.mavenProjects.findAll { p -> !(p in [ + def binaryModules = rootProject.ext.mavenProjects.findAll { p -> !(p.path in [ // Placed in a separate folder (module layer conflicts). - project(":lucene:test-framework"), + ":lucene:test-framework", ]) } for (Project module : binaryModules) { jars(module, { diff --git a/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionRescorer.java b/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionRescorer.java index 9997170e04ea..01a012de3469 100644 --- a/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionRescorer.java +++ b/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionRescorer.java @@ -87,7 +87,7 @@ public void testBasic() throws Exception { // Just first pass query TopDocs hits = searcher.search(query, 10); - assertEquals(3, hits.totalHits.value); + assertEquals(3, hits.totalHits.value()); assertEquals("3", r.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", r.storedFields().document(hits.scoreDocs[1].doc).get("id")); assertEquals("2", r.storedFields().document(hits.scoreDocs[2].doc).get("id")); @@ -101,7 +101,7 @@ public void testBasic() throws Exception { Rescorer rescorer = e.getRescorer(bindings); hits = rescorer.rescore(searcher, hits, 10); - assertEquals(3, hits.totalHits.value); + assertEquals(3, hits.totalHits.value()); assertEquals("2", r.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", r.storedFields().document(hits.scoreDocs[1].doc).get("id")); assertEquals("3", r.storedFields().document(hits.scoreDocs[2].doc).get("id")); diff --git a/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionSorts.java b/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionSorts.java index 66d1d534c918..ac3b6959d51e 100644 --- a/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionSorts.java +++ b/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionSorts.java @@ -140,7 +140,7 @@ void assertQuery(Query query, Sort sort) throws Exception { TopDocs actual = searcher.search(query, size, mutatedSort, random().nextBoolean()); CheckHits.checkEqual(query, expected.scoreDocs, actual.scoreDocs); - if (size < actual.totalHits.value) { + if (size < actual.totalHits.value()) { expected = searcher.searchAfter(expected.scoreDocs[size - 1], query, size, sort); actual = searcher.searchAfter(actual.scoreDocs[size - 1], query, size, mutatedSort); CheckHits.checkEqual(query, expected.scoreDocs, actual.scoreDocs); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java index 2e126634d110..576aa84d51fc 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java @@ -30,6 +30,7 @@ import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState; import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts; import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.search.Collector; import org.apache.lucene.search.CollectorManager; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; @@ -196,9 +197,8 @@ public DrillSidewaysResult search( limit = 1; // the collector does not alow numHits = 0 } final int fTopN = Math.min(topN, limit); - final boolean supportsConcurrency = 
searcher.getSlices().length > 1; final TopFieldCollectorManager collectorManager = - new TopFieldCollectorManager(sort, fTopN, after, Integer.MAX_VALUE, supportsConcurrency); + new TopFieldCollectorManager(sort, fTopN, after, Integer.MAX_VALUE); final ConcurrentDrillSidewaysResult r = search(query, collectorManager); TopFieldDocs topDocs = r.collectorResult; @@ -229,9 +229,8 @@ public DrillSidewaysResult search(ScoreDoc after, DrillDownQuery query, int topN limit = 1; // the collector does not alow numHits = 0 } final int fTopN = Math.min(topN, limit); - final boolean supportsConcurrency = searcher.getSlices().length > 1; final TopScoreDocCollectorManager collectorManager = - new TopScoreDocCollectorManager(fTopN, after, Integer.MAX_VALUE, supportsConcurrency); + new TopScoreDocCollectorManager(fTopN, after, Integer.MAX_VALUE); final ConcurrentDrillSidewaysResult r = search(query, collectorManager); return new DrillSidewaysResult( r.facets, @@ -300,35 +299,13 @@ public DrillSidewaysResult( } } - private static class CallableCollector implements Callable { - - private final int pos; - private final IndexSearcher searcher; - private final Query query; - private final CollectorManager collectorManager; - - private CallableCollector( - int pos, IndexSearcher searcher, Query query, CollectorManager collectorManager) { - this.pos = pos; - this.searcher = searcher; - this.query = query; - this.collectorManager = collectorManager; - } + private record CallableCollector( + IndexSearcher searcher, Query query, CollectorManager collectorManager) + implements Callable { @Override - public CallableResult call() throws Exception { - return new CallableResult(pos, searcher.search(query, collectorManager)); - } - } - - private static class CallableResult { - - private final int pos; - private final Object result; - - private CallableResult(int pos, Object result) { - this.pos = pos; - this.result = result; + public R call() throws Exception { + return searcher.search(query, collectorManager); } } @@ -349,16 +326,122 @@ private DrillDownQuery getDrillDownQuery( public ConcurrentDrillSidewaysResult search( final DrillDownQuery query, final CollectorManager hitCollectorManager) throws IOException { + // Main query + FacetsCollectorManager drillDownFacetsCollectorManager = + createDrillDownFacetsCollectorManager(); + final CollectorManager mainCollectorManager; + if (drillDownFacetsCollectorManager != null) { + // Make sure we populate a facet collector corresponding to the base query if desired: + mainCollectorManager = + new MultiCollectorManager(drillDownFacetsCollectorManager, hitCollectorManager); + } else { + mainCollectorManager = hitCollectorManager; + } + // Drill sideways dimensions + final List drillSidewaysCollectorManagers; + if (query.getDims().isEmpty() == false) { + drillSidewaysCollectorManagers = new ArrayList<>(query.getDims().size()); + for (int i = 0; i < query.getDims().size(); i++) { + drillSidewaysCollectorManagers.add(createDrillSidewaysFacetsCollectorManager()); + } + } else { + drillSidewaysCollectorManagers = null; + } + // Execute query + final Result result; if (executor != null) { - return searchConcurrently(query, hitCollectorManager); + result = searchConcurrently(query, mainCollectorManager, drillSidewaysCollectorManagers); + } else { + result = searchSequentially(query, mainCollectorManager, drillSidewaysCollectorManagers); + } + + // Collect results + final FacetsCollector facetsCollectorResult; + final R hitCollectorResult; + if (drillDownFacetsCollectorManager != null) { 
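Aside (not part of the patch): the DrillSideways hunks above drop the `supportsConcurrency` boolean from the collector-manager constructors, so callers no longer pre-compute it from `searcher.getSlices()`. A small sketch of the simplified construction, assuming `searcher` and `query` exist:

    // The managers now handle concurrency themselves; only numHits, after and
    // totalHitsThreshold remain as constructor arguments.
    TopScoreDocCollectorManager byScore =
        new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE);
    TopFieldCollectorManager bySort =
        new TopFieldCollectorManager(Sort.RELEVANCE, 10, null, Integer.MAX_VALUE);
    TopDocs top = searcher.search(query, byScore);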
+ // drill down collected using MultiCollector + // Extract the results: + Object[] drillDownResult = (Object[]) result.drillDownResult; + facetsCollectorResult = (FacetsCollector) drillDownResult[0]; + hitCollectorResult = (R) drillDownResult[1]; } else { - return searchSequentially(query, hitCollectorManager); + facetsCollectorResult = null; + hitCollectorResult = (R) result.drillDownResult; } + + // Getting results for drill sideways dimensions (if any) + final String[] drillSidewaysDims; + final FacetsCollector[] drillSidewaysCollectors; + if (query.getDims().isEmpty() == false) { + drillSidewaysDims = query.getDims().keySet().toArray(new String[0]); + int numDims = query.getDims().size(); + assert drillSidewaysCollectorManagers != null; + assert drillSidewaysCollectorManagers.size() == numDims; + drillSidewaysCollectors = new FacetsCollector[numDims]; + for (int dim = 0; dim < numDims; dim++) { + drillSidewaysCollectors[dim] = result.drillSidewaysResults.get(dim); + } + } else { + drillSidewaysDims = null; + drillSidewaysCollectors = null; + } + + return new ConcurrentDrillSidewaysResult<>( + buildFacetsResult(facetsCollectorResult, drillSidewaysCollectors, drillSidewaysDims), + null, + hitCollectorResult, + facetsCollectorResult, + drillSidewaysCollectors, + drillSidewaysDims); } - @SuppressWarnings("unchecked") - private ConcurrentDrillSidewaysResult searchSequentially( - final DrillDownQuery query, final CollectorManager hitCollectorManager) + /** + * Search using DrillDownQuery with custom collectors. This method can be used with any {@link + * CollectorManager}s. + * + *
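Aside (not part of the patch): the rewritten `search(DrillDownQuery, CollectorManager)` above wraps the drill-down facets manager and the hit manager in a MultiCollectorManager and unpacks the resulting Object[]. The same pattern in isolation, assuming `searcher` and `query` (facet classes from org.apache.lucene.facet, search classes from org.apache.lucene.search):

    FacetsCollectorManager facetsManager = new FacetsCollectorManager();
    TopScoreDocCollectorManager hitsManager =
        new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE);
    Object[] results =
        searcher.search(query, new MultiCollectorManager(facetsManager, hitsManager));
    // Results come back in the order the managers were passed in:
    FacetsCollector facets = (FacetsCollector) results[0];
    TopDocs hits = (TopDocs) results[1];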

    Note: Use {@link MultiCollectorManager} to collect both hits and facets for the entire query + * and/or for drill-sideways dimensions. You can also use it to wrap different types of {@link + * CollectorManager} for drill-sideways dimensions. + */ + public Result search( + DrillDownQuery query, + CollectorManager drillDownCollectorManager, + List> drillSidewaysCollectorManagers) + throws IOException { + if (drillDownCollectorManager == null) { + throw new IllegalArgumentException( + "This search method requires client to provide drill down collector manager"); + } + if (drillSidewaysCollectorManagers == null) { + if (query.getDims().isEmpty() == false) { + throw new IllegalArgumentException( + "The query requires not null drillSidewaysCollectorManagers"); + } + } else if (drillSidewaysCollectorManagers.size() != query.getDims().size()) { + throw new IllegalArgumentException( + "drillSidewaysCollectorManagers size must be equal to number of dimensions in the query."); + } + if (executor != null) { + return searchConcurrently(query, drillDownCollectorManager, drillSidewaysCollectorManagers); + } else { + return searchSequentially(query, drillDownCollectorManager, drillSidewaysCollectorManagers); + } + } + + /** + * {@link #search(DrillDownQuery, CollectorManager, List)} result. It doesn't depend on {@link + * Facets} to allow users to use any type of {@link CollectorManager} for drill-down or + * drill-sideways dimension. + * + * @param drillDownResult result from drill down (main) {@link CollectorManager} + * @param drillSidewaysResults results from drill sideways {@link CollectorManager}s + */ + public record Result(T drillDownResult, List drillSidewaysResults) {} + + private Result searchSequentially( + final DrillDownQuery query, + final CollectorManager drillDownCollectorManager, + final List> drillSidewaysCollectorManagers) throws IOException { Map drillDownDims = query.getDims(); @@ -366,28 +449,7 @@ private ConcurrentDrillSidewaysResult searchSequentially( if (drillDownDims.isEmpty()) { // There are no drill-down dims, so there is no // drill-sideways to compute: - FacetsCollectorManager drillDownCollectorManager = createDrillDownFacetsCollectorManager(); - FacetsCollector mainFacetsCollector; - R collectorResult; - if (drillDownCollectorManager != null) { - Object[] mainResults = - searcher.search( - query, new MultiCollectorManager(drillDownCollectorManager, hitCollectorManager)); - // Extract the results: - mainFacetsCollector = (FacetsCollector) mainResults[0]; - collectorResult = (R) mainResults[1]; - } else { - mainFacetsCollector = null; - collectorResult = searcher.search(query, hitCollectorManager); - } - - return new ConcurrentDrillSidewaysResult<>( - buildFacetsResult(mainFacetsCollector, null, null), - null, - collectorResult, - mainFacetsCollector, - null, - null); + return new Result<>(searcher.search(query, drillDownCollectorManager), null); } Query baseQuery = query.getBaseQuery(); @@ -398,130 +460,65 @@ private ConcurrentDrillSidewaysResult searchSequentially( } Query[] drillDownQueries = query.getDrillDownQueries(); - int numDims = drillDownDims.size(); + DrillSidewaysQuery dsq = + new DrillSidewaysQuery<>( + baseQuery, drillSidewaysCollectorManagers, drillDownQueries, scoreSubDocsAtOnce()); - FacetsCollectorManager drillDownCollectorManager = createDrillDownFacetsCollectorManager(); - - FacetsCollectorManager[] drillSidewaysFacetsCollectorManagers = - new FacetsCollectorManager[numDims]; - for (int i = 0; i < numDims; i++) { - 
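Aside (not part of the patch): a hypothetical use of the new `search(DrillDownQuery, CollectorManager, List)` entry point and its `Result` record described above. Generic parameters are elided in this flattened diff, so the exact type arguments below are an assumption; `drillSideways` is an assumed DrillSideways instance and `ddq` an assumed DrillDownQuery with two drill-down dimensions (the list size must match the dimension count):

    TopScoreDocCollectorManager hitManager =
        new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE);
    List<CollectorManager<FacetsCollector, FacetsCollector>> sidewaysManagers =
        List.of(new FacetsCollectorManager(), new FacetsCollectorManager());
    DrillSideways.Result<TopDocs, FacetsCollector> result =
        drillSideways.search(ddq, hitManager, sidewaysManagers);
    TopDocs hits = result.drillDownResult();                       // main query result
    List<FacetsCollector> sideways = result.drillSidewaysResults(); // one per dimension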
drillSidewaysFacetsCollectorManagers[i] = createDrillSidewaysFacetsCollectorManager(); - } - - DrillSidewaysQuery dsq = - new DrillSidewaysQuery( - baseQuery, - drillDownCollectorManager, - drillSidewaysFacetsCollectorManagers, - drillDownQueries, - scoreSubDocsAtOnce()); - - R collectorResult = searcher.search(dsq, hitCollectorManager); - - FacetsCollector drillDownCollector; - if (drillDownCollectorManager != null) { - drillDownCollector = drillDownCollectorManager.reduce(dsq.managedDrillDownCollectors); - } else { - drillDownCollector = null; - } - - FacetsCollector[] drillSidewaysCollectors = new FacetsCollector[numDims]; + T collectorResult = searcher.search(dsq, drillDownCollectorManager); + List drillSidewaysResults = new ArrayList<>(drillDownDims.size()); + assert drillSidewaysCollectorManagers != null + : "Case without drill sideways dimensions is handled above"; int numSlices = dsq.managedDrillSidewaysCollectors.size(); - - for (int dim = 0; dim < numDims; dim++) { - List facetsCollectorsForDim = new ArrayList<>(numSlices); - + for (int dim = 0; dim < drillDownDims.size(); dim++) { + List collectorsForDim = new ArrayList<>(numSlices); for (int slice = 0; slice < numSlices; slice++) { - facetsCollectorsForDim.add(dsq.managedDrillSidewaysCollectors.get(slice)[dim]); + collectorsForDim.add(dsq.managedDrillSidewaysCollectors.get(slice).get(dim)); } - - drillSidewaysCollectors[dim] = - drillSidewaysFacetsCollectorManagers[dim].reduce(facetsCollectorsForDim); + drillSidewaysResults.add( + dim, drillSidewaysCollectorManagers.get(dim).reduce(collectorsForDim)); } - - String[] drillSidewaysDims = drillDownDims.keySet().toArray(new String[0]); - - return new ConcurrentDrillSidewaysResult<>( - buildFacetsResult(drillDownCollector, drillSidewaysCollectors, drillSidewaysDims), - null, - collectorResult, - drillDownCollector, - drillSidewaysCollectors, - drillSidewaysDims); + return new Result<>(collectorResult, drillSidewaysResults); } - @SuppressWarnings("unchecked") - private ConcurrentDrillSidewaysResult searchConcurrently( - final DrillDownQuery query, final CollectorManager hitCollectorManager) - throws IOException { + private Result searchConcurrently( + final DrillDownQuery query, + final CollectorManager drillDownCollectorManager, + final List> drillSidewaysCollectorManagers) { final Map drillDownDims = query.getDims(); - final List callableCollectors = new ArrayList<>(drillDownDims.size() + 1); + final CallableCollector drillDownCallableCollector = + new CallableCollector<>(searcher, query, drillDownCollectorManager); + final List> drillSidewaysCallableCollectors = + new ArrayList<>(drillDownDims.size()); - // Add the main DrillDownQuery - FacetsCollectorManager drillDownFacetsCollectorManager = - createDrillDownFacetsCollectorManager(); - CollectorManager mainCollectorManager; - if (drillDownFacetsCollectorManager != null) { - // Make sure we populate a facet collector corresponding to the base query if desired: - mainCollectorManager = - new MultiCollectorManager(drillDownFacetsCollectorManager, hitCollectorManager); - } else { - mainCollectorManager = hitCollectorManager; - } - callableCollectors.add(new CallableCollector(-1, searcher, query, mainCollectorManager)); int i = 0; final Query[] filters = query.getDrillDownQueries(); - for (String dim : drillDownDims.keySet()) - callableCollectors.add( - new CallableCollector( - i++, + for (String dim : drillDownDims.keySet()) { + drillSidewaysCallableCollectors.add( + new CallableCollector<>( searcher, getDrillDownQuery(query, 
filters, dim), - createDrillSidewaysFacetsCollectorManager())); - - final FacetsCollector mainFacetsCollector; - final FacetsCollector[] facetsCollectors = new FacetsCollector[drillDownDims.size()]; - final R collectorResult; + drillSidewaysCollectorManagers.get(i))); + i++; + } try { - // Run the query pool - final List> futures = executor.invokeAll(callableCollectors); - - // Extract the results - if (drillDownFacetsCollectorManager != null) { - // If we populated a facets collector for the main query, make sure to unpack it properly - final Object[] mainResults = (Object[]) futures.get(0).get().result; - mainFacetsCollector = (FacetsCollector) mainResults[0]; - collectorResult = (R) mainResults[1]; - } else { - mainFacetsCollector = null; - collectorResult = (R) futures.get(0).get().result; - } - for (i = 1; i < futures.size(); i++) { - final CallableResult result = futures.get(i).get(); - facetsCollectors[result.pos] = (FacetsCollector) result.result; - } - // Fill the null results with the mainFacetsCollector - for (i = 0; i < facetsCollectors.length; i++) - if (facetsCollectors[i] == null) facetsCollectors[i] = mainFacetsCollector; + final Future drillDownFuture = executor.submit(drillDownCallableCollector); + final List> drillSidewaysFutures = + executor.invokeAll(drillSidewaysCallableCollectors); + + T collectorResult = drillDownFuture.get(); + List drillSidewaysResults = new ArrayList<>(drillDownDims.size()); + for (i = 0; i < drillSidewaysFutures.size(); i++) { + drillSidewaysResults.add(i, drillSidewaysFutures.get(i).get()); + } + return new Result<>(collectorResult, drillSidewaysResults); } catch (InterruptedException e) { throw new ThreadInterruptedException(e); } catch (ExecutionException e) { throw new RuntimeException(e); } - - String[] drillSidewaysDims = drillDownDims.keySet().toArray(new String[0]); - - // build the facets and return the result - return new ConcurrentDrillSidewaysResult<>( - buildFacetsResult(mainFacetsCollector, facetsCollectors, drillSidewaysDims), - null, - collectorResult, - mainFacetsCollector, - facetsCollectors, - drillSidewaysDims); } /** diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysQuery.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysQuery.java index dca425f40f4f..1104df09f6e2 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysQuery.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysQuery.java @@ -25,11 +25,12 @@ import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BulkScorer; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.CollectorManager; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; @@ -41,14 +42,12 @@ // TODO change the way DrillSidewaysScorer is used, this query does not work // with filter caching -class DrillSidewaysQuery extends Query { +class DrillSidewaysQuery extends Query { final Query baseQuery; - final FacetsCollectorManager drillDownCollectorManager; - final FacetsCollectorManager[] drillSidewaysCollectorManagers; - final List managedDrillDownCollectors; - final List managedDrillSidewaysCollectors; + final List> drillSidewaysCollectorManagers; 
+ final List> managedDrillSidewaysCollectors; final Query[] drillDownQueries; @@ -56,25 +55,20 @@ class DrillSidewaysQuery extends Query { /** * Construct a new {@code DrillSidewaysQuery} that will create new {@link FacetsCollector}s for - * each {@link LeafReaderContext} using the provided {@link FacetsCollectorManager}s. The caller - * can access the created {@link FacetsCollector}s through {@link #managedDrillDownCollectors} and - * {@link #managedDrillSidewaysCollectors}. + * each {@link LeafReaderContext} using the provided {@link FacetsCollectorManager}s. */ DrillSidewaysQuery( Query baseQuery, - FacetsCollectorManager drillDownCollectorManager, - FacetsCollectorManager[] drillSidewaysCollectorManagers, + List> drillSidewaysCollectorManagers, Query[] drillDownQueries, boolean scoreSubDocsAtOnce) { - // Note that the "managed" facet collector lists are synchronized here since bulkScorer() + // Note that the "managed" collector lists are synchronized here since bulkScorer() // can be invoked concurrently and needs to remain thread-safe. We're OK with synchronizing // on the whole list as contention is expected to remain very low: this( baseQuery, - drillDownCollectorManager, drillSidewaysCollectorManagers, Collections.synchronizedList(new ArrayList<>()), - Collections.synchronizedList(new ArrayList<>()), drillDownQueries, scoreSubDocsAtOnce); } @@ -86,19 +80,15 @@ class DrillSidewaysQuery extends Query { */ private DrillSidewaysQuery( Query baseQuery, - FacetsCollectorManager drillDownCollectorManager, - FacetsCollectorManager[] drillSidewaysCollectorManagers, - List managedDrillDownCollectors, - List managedDrillSidewaysCollectors, + List> drillSidewaysCollectorManagers, + List> managedDrillSidewaysCollectors, Query[] drillDownQueries, boolean scoreSubDocsAtOnce) { this.baseQuery = Objects.requireNonNull(baseQuery); - this.drillDownCollectorManager = drillDownCollectorManager; this.drillSidewaysCollectorManagers = drillSidewaysCollectorManagers; - this.managedDrillDownCollectors = managedDrillDownCollectors; - this.managedDrillSidewaysCollectors = managedDrillSidewaysCollectors; this.drillDownQueries = drillDownQueries; this.scoreSubDocsAtOnce = scoreSubDocsAtOnce; + this.managedDrillSidewaysCollectors = managedDrillSidewaysCollectors; } @Override @@ -119,11 +109,9 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { if (newQuery == baseQuery) { return super.rewrite(indexSearcher); } else { - return new DrillSidewaysQuery( + return new DrillSidewaysQuery<>( newQuery, - drillDownCollectorManager, drillSidewaysCollectorManagers, - managedDrillDownCollectors, managedDrillSidewaysCollectors, drillDownQueries, scoreSubDocsAtOnce); @@ -158,18 +146,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti int drillDownCount = drillDowns.length; - FacetsCollector drillDownCollector; - LeafCollector drillDownLeafCollector; - if (drillDownCollectorManager != null) { - drillDownCollector = drillDownCollectorManager.newCollector(); - managedDrillDownCollectors.add(drillDownCollector); - drillDownLeafCollector = drillDownCollector.getLeafCollector(context); - } else { - drillDownCollector = null; - drillDownLeafCollector = null; - } - - FacetsCollector[] sidewaysCollectors = new FacetsCollector[drillDownCount]; + List sidewaysCollectors = new ArrayList<>(drillDownCount); managedDrillSidewaysCollectors.add(sidewaysCollectors); DrillSidewaysScorer.DocsAndCost[] dims = @@ -183,8 +160,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) 
throws IOExcepti scorer = new ConstantScoreScorer(0f, scoreMode, DocIdSetIterator.empty()); } - FacetsCollector sidewaysCollector = drillSidewaysCollectorManagers[dim].newCollector(); - sidewaysCollectors[dim] = sidewaysCollector; + K sidewaysCollector = drillSidewaysCollectorManagers.get(dim).newCollector(); + sidewaysCollectors.add(dim, sidewaysCollector); dims[dim] = new DrillSidewaysScorer.DocsAndCost( @@ -195,11 +172,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti // a null scorer in this case, but we need to make sure #finish gets called on all facet // collectors since IndexSearcher won't handle this for us: if (baseScorerSupplier == null || nullCount > 1) { - if (drillDownCollector != null) { - drillDownCollector.finish(); - } - for (FacetsCollector fc : sidewaysCollectors) { - fc.finish(); + for (DrillSidewaysScorer.DocsAndCost dim : dims) { + dim.sidewaysLeafCollector.finish(); } return null; } @@ -217,11 +191,7 @@ public Scorer get(long leadCost) throws IOException { @Override public BulkScorer bulkScorer() throws IOException { return new DrillSidewaysScorer( - context, - baseScorerSupplier.get(Long.MAX_VALUE), - drillDownLeafCollector, - dims, - scoreSubDocsAtOnce); + context, baseScorerSupplier.get(Long.MAX_VALUE), dims, scoreSubDocsAtOnce); } @Override @@ -252,9 +222,8 @@ public int hashCode() { final int prime = 31; int result = classHash(); result = prime * result + Objects.hashCode(baseQuery); - result = prime * result + Objects.hashCode(drillDownCollectorManager); result = prime * result + Arrays.hashCode(drillDownQueries); - result = prime * result + Arrays.hashCode(drillSidewaysCollectorManagers); + result = prime * result + Objects.hashCode(drillSidewaysCollectorManagers); return result; } @@ -263,10 +232,9 @@ public boolean equals(Object other) { return sameClassAs(other) && equalsTo(getClass().cast(other)); } - private boolean equalsTo(DrillSidewaysQuery other) { + private boolean equalsTo(DrillSidewaysQuery other) { return Objects.equals(baseQuery, other.baseQuery) - && Objects.equals(drillDownCollectorManager, other.drillDownCollectorManager) && Arrays.equals(drillDownQueries, other.drillDownQueries) - && Arrays.equals(drillSidewaysCollectorManagers, other.drillSidewaysCollectorManagers); + && Objects.equals(drillSidewaysCollectorManagers, other.drillSidewaysCollectorManagers); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java index 5ff9c6420ad9..3a5bfc3c4649 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java @@ -45,8 +45,6 @@ class DrillSidewaysScorer extends BulkScorer { // private static boolean DEBUG = false; - private final LeafCollector drillDownLeafCollector; - private final DocsAndCost[] dims; // DrillDown DocsEnums: @@ -68,7 +66,6 @@ class DrillSidewaysScorer extends BulkScorer { DrillSidewaysScorer( LeafReaderContext context, Scorer baseScorer, - LeafCollector drillDownLeafCollector, DocsAndCost[] dims, boolean scoreSubDocsAtOnce) { this.dims = dims; @@ -81,7 +78,6 @@ class DrillSidewaysScorer extends BulkScorer { } else { this.baseApproximation = baseIterator; } - this.drillDownLeafCollector = drillDownLeafCollector; this.scoreSubDocsAtOnce = scoreSubDocsAtOnce; } @@ -709,9 +705,6 @@ private void collectHit(LeafCollector collector, DocsAndCost[] dims) throws IOEx // } collector.collect(collectDocID); 
- if (drillDownLeafCollector != null) { - drillDownLeafCollector.collect(collectDocID); - } // TODO: we could "fix" faceting of the sideways counts // to do this "union" (of the drill down hits) in the @@ -725,9 +718,6 @@ private void collectHit(LeafCollector collector, DocsAndCost[] dims) throws IOEx private void collectHit(LeafCollector collector, DocsAndCost dim) throws IOException { collector.collect(collectDocID); - if (drillDownLeafCollector != null) { - drillDownLeafCollector.collect(collectDocID); - } // Tally sideways count: dim.sidewaysLeafCollector.collect(collectDocID); @@ -735,9 +725,6 @@ private void collectHit(LeafCollector collector, DocsAndCost dim) throws IOExcep private void collectHit(LeafCollector collector, List dims) throws IOException { collector.collect(collectDocID); - if (drillDownLeafCollector != null) { - drillDownLeafCollector.collect(collectDocID); - } // Tally sideways counts: for (DocsAndCost dim : dims) { @@ -756,9 +743,6 @@ private void finish(DocsAndCost[] dims) throws IOException { // Note: We _only_ call #finish on the facets collectors we're managing here, but not the // "main" collector. This is because IndexSearcher handles calling #finish on the main // collector. - if (drillDownLeafCollector != null) { - drillDownLeafCollector.finish(); - } for (DocsAndCost dim : dims) { dim.sidewaysLeafCollector.finish(); } @@ -766,9 +750,6 @@ private void finish(DocsAndCost[] dims) throws IOException { private void setScorer(LeafCollector mainCollector, Scorable scorer) throws IOException { mainCollector.setScorer(scorer); - if (drillDownLeafCollector != null) { - drillDownLeafCollector.setScorer(scorer); - } for (DocsAndCost dim : dims) { dim.sidewaysLeafCollector.setScorer(scorer); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/FacetCountsWithFilterQuery.java b/lucene/facet/src/java/org/apache/lucene/facet/FacetCountsWithFilterQuery.java index fafa0f33c4b0..500e9a276166 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/FacetCountsWithFilterQuery.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/FacetCountsWithFilterQuery.java @@ -59,7 +59,7 @@ protected FacetCountsWithFilterQuery(Query fastMatchQuery) { protected DocIdSetIterator createIterator( FacetsCollector.MatchingDocs hits, DocIdSetIterator... 
iterators) throws IOException { List allIterators = new ArrayList<>(); - allIterators.add(hits.bits.iterator()); + allIterators.add(hits.bits().iterator()); allIterators.addAll(Arrays.asList(iterators)); if (allIterators.stream().anyMatch(Objects::isNull)) { // if any of the iterators are null, there are no matching docs @@ -67,12 +67,12 @@ protected DocIdSetIterator createIterator( } if (fastMatchQuery != null) { - final IndexReaderContext topLevelContext = ReaderUtil.getTopLevelContext(hits.context); + final IndexReaderContext topLevelContext = ReaderUtil.getTopLevelContext(hits.context()); final IndexSearcher searcher = new IndexSearcher(topLevelContext); searcher.setQueryCache(null); final Weight fastMatchWeight = searcher.createWeight(searcher.rewrite(fastMatchQuery), ScoreMode.COMPLETE_NO_SCORES, 1); - final Scorer s = fastMatchWeight.scorer(hits.context); + final Scorer s = fastMatchWeight.scorer(hits.context()); if (s == null) { // no matching docs by the fast match query return null; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollector.java b/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollector.java index 87ad2947cf5c..11ccec5ca1b5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollector.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollector.java @@ -22,23 +22,9 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.Collector; import org.apache.lucene.search.DocIdSet; -import org.apache.lucene.search.FieldDoc; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MultiCollector; -import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorable; -import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.SimpleCollector; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.TopDocsCollector; -import org.apache.lucene.search.TopFieldCollector; -import org.apache.lucene.search.TopFieldCollectorManager; -import org.apache.lucene.search.TopFieldDocs; -import org.apache.lucene.search.TopScoreDocCollectorManager; -import org.apache.lucene.search.TotalHitCountCollector; -import org.apache.lucene.search.TotalHits; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.DocIdSetBuilder; @@ -58,31 +44,16 @@ public class FacetsCollector extends SimpleCollector { private DocIdSetBuilder docsBuilder; /** - * Holds the documents that were matched in the {@link org.apache.lucene.index.LeafReaderContext}. - * If scores were required, then {@code scores} is not null. + * Holds the documents that were matched in the {@link LeafReaderContext}. If scores were + * required, then {@code scores} is not null. + * + * @param context Context for this segment. + * @param bits Which documents were seen. + * @param scores Non-sparse scores array. + * @param totalHits Total number of hits */ - public static final class MatchingDocs { - - /** Context for this segment. */ - public final LeafReaderContext context; - - /** Which documents were seen. */ - public final DocIdSet bits; - - /** Non-sparse scores array. */ - public final float[] scores; - - /** Total number of hits */ - public final int totalHits; - - /** Sole constructor. 
*/ - public MatchingDocs(LeafReaderContext context, DocIdSet bits, int totalHits, float[] scores) { - this.context = context; - this.bits = bits; - this.scores = scores; - this.totalHits = totalHits; - } - } + public record MatchingDocs( + LeafReaderContext context, DocIdSet bits, int totalHits, float[] scores) {} /** Default constructor */ public FacetsCollector() { @@ -111,12 +82,12 @@ public List getMatchingDocs() { public void collect(int doc) throws IOException { docsBuilder.grow(1).add(doc); if (keepScores) { - if (totalHits >= scores.length) { - float[] newScores = new float[ArrayUtil.oversize(totalHits + 1, 4)]; - System.arraycopy(scores, 0, newScores, 0, totalHits); + if (doc >= scores.length) { + float[] newScores = new float[ArrayUtil.oversize(doc + 1, 4)]; + System.arraycopy(scores, 0, newScores, 0, scores.length); scores = newScores; } - scores[totalHits] = scorer.score(); + scores[doc] = scorer.score(); } totalHits++; } @@ -155,118 +126,4 @@ public void finish() throws IOException { scores = null; context = null; } - - /** Utility method, to search and also collect all hits into the provided {@link Collector}. */ - public static TopDocs search(IndexSearcher searcher, Query q, int n, Collector fc) - throws IOException { - return doSearch(searcher, null, q, n, null, false, fc); - } - - /** Utility method, to search and also collect all hits into the provided {@link Collector}. */ - public static TopFieldDocs search(IndexSearcher searcher, Query q, int n, Sort sort, Collector fc) - throws IOException { - if (sort == null) { - throw new IllegalArgumentException("sort must not be null"); - } - return (TopFieldDocs) doSearch(searcher, null, q, n, sort, false, fc); - } - - /** Utility method, to search and also collect all hits into the provided {@link Collector}. */ - public static TopFieldDocs search( - IndexSearcher searcher, Query q, int n, Sort sort, boolean doDocScores, Collector fc) - throws IOException { - if (sort == null) { - throw new IllegalArgumentException("sort must not be null"); - } - return (TopFieldDocs) doSearch(searcher, null, q, n, sort, doDocScores, fc); - } - - /** Utility method, to search and also collect all hits into the provided {@link Collector}. */ - public static TopDocs searchAfter( - IndexSearcher searcher, ScoreDoc after, Query q, int n, Collector fc) throws IOException { - return doSearch(searcher, after, q, n, null, false, fc); - } - - /** Utility method, to search and also collect all hits into the provided {@link Collector}. */ - public static TopDocs searchAfter( - IndexSearcher searcher, ScoreDoc after, Query q, int n, Sort sort, Collector fc) - throws IOException { - if (sort == null) { - throw new IllegalArgumentException("sort must not be null"); - } - return doSearch(searcher, after, q, n, sort, false, fc); - } - - /** Utility method, to search and also collect all hits into the provided {@link Collector}. 
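Aside (not part of the patch): with `MatchingDocs` converted to a record above, consumers read `context()`, `bits()`, `scores()` and `totalHits()` instead of public fields, as the facet-counting hunks in this patch do. A minimal sketch, assuming an already-populated FacetsCollector `facetsCollector`:

    for (FacetsCollector.MatchingDocs md : facetsCollector.getMatchingDocs()) {
      if (md.totalHits() == 0) {        // accessor, was the public field totalHits
        continue;
      }
      DocIdSetIterator it = md.bits().iterator();
      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        // per-hit work against md.context().reader() would go here
      }
    }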
*/ - public static TopDocs searchAfter( - IndexSearcher searcher, - ScoreDoc after, - Query q, - int n, - Sort sort, - boolean doDocScores, - Collector fc) - throws IOException { - if (sort == null) { - throw new IllegalArgumentException("sort must not be null"); - } - return doSearch(searcher, after, q, n, sort, doDocScores, fc); - } - - private static TopDocs doSearch( - IndexSearcher searcher, - ScoreDoc after, - Query q, - int n, - Sort sort, - boolean doDocScores, - Collector fc) - throws IOException { - - int limit = searcher.getIndexReader().maxDoc(); - if (limit == 0) { - limit = 1; - } - n = Math.min(n, limit); - - if (after != null && after.doc >= limit) { - throw new IllegalArgumentException( - "after.doc exceeds the number of documents in the reader: after.doc=" - + after.doc - + " limit=" - + limit); - } - - TopDocs topDocs = null; - if (n == 0) { - TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector(); - searcher.search(q, MultiCollector.wrap(totalHitCountCollector, fc)); - topDocs = - new TopDocs( - new TotalHits(totalHitCountCollector.getTotalHits(), TotalHits.Relation.EQUAL_TO), - new ScoreDoc[0]); - } else { - TopDocsCollector hitsCollector; - if (sort != null) { - if (after != null && !(after instanceof FieldDoc)) { - // TODO: if we fix type safety of TopFieldDocs we can - // remove this - throw new IllegalArgumentException("after must be a FieldDoc; got " + after); - } - hitsCollector = - new TopFieldCollectorManager(sort, n, (FieldDoc) after, Integer.MAX_VALUE, false) - .newCollector(); // TODO: can we disable exact hit counts - } else { - hitsCollector = - new TopScoreDocCollectorManager(n, after, Integer.MAX_VALUE, false).newCollector(); - } - searcher.search(q, MultiCollector.wrap(hitsCollector, fc)); - - topDocs = hitsCollector.topDocs(); - if (doDocScores) { - TopFieldCollector.populateScores(topDocs.scoreDocs, searcher, q); - } - } - return topDocs; - } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollectorManager.java b/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollectorManager.java index 71319bfc217f..bc547407ee92 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollectorManager.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollectorManager.java @@ -17,9 +17,25 @@ package org.apache.lucene.facet; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.Collection; -import java.util.List; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.CollectorManager; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiCollectorManager; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopFieldCollector; +import org.apache.lucene.search.TopFieldCollectorManager; +import org.apache.lucene.search.TopScoreDocCollectorManager; +import org.apache.lucene.search.TotalHitCountCollectorManager; +import org.apache.lucene.search.TotalHits; +import org.apache.lucene.util.DocIdSetBuilder; /** * A {@link CollectorManager} implementation which produces FacetsCollector and produces a merged @@ -27,12 +43,24 @@ */ public class FacetsCollectorManager implements CollectorManager { + private final boolean keepScores; + /** Sole constructor. 
*/ - public FacetsCollectorManager() {} + public FacetsCollectorManager() { + this(false); + } + + /** + * Creates a new collector manager that in turn creates {@link FacetsCollector} using the provided + * {@code keepScores} flag. hits. + */ + public FacetsCollectorManager(boolean keepScores) { + this.keepScores = keepScores; + } @Override public FacetsCollector newCollector() throws IOException { - return new FacetsCollector(); + return new FacetsCollector(keepScores); } @Override @@ -43,15 +71,243 @@ public FacetsCollector reduce(Collection collectors) throws IOE if (collectors.size() == 1) { return collectors.iterator().next(); } - return new ReducedFacetsCollector(collectors); + assert collectors.stream().allMatch(fc -> fc.getKeepScores() == keepScores); + return new ReducedFacetsCollector(collectors, keepScores); } private static class ReducedFacetsCollector extends FacetsCollector { - public ReducedFacetsCollector(final Collection facetsCollectors) { - final List matchingDocs = this.getMatchingDocs(); - facetsCollectors.forEach( - facetsCollector -> matchingDocs.addAll(facetsCollector.getMatchingDocs())); + ReducedFacetsCollector(final Collection facetsCollectors, boolean keepScores) { + super(keepScores); + this.getMatchingDocs().addAll(reduceMatchingDocs(facetsCollectors)); + } + } + + /** + * Reduces matching docs held by the provided facets collectors, merging matching docs for the + * same leaf into a single matching docs instance + * + * @param facetsCollectors the facets collectors + * @return the reduced matching docs, with one instance per leaf reader context + */ + static Collection reduceMatchingDocs( + final Collection facetsCollectors) { + // When a segment is split into partitions, each partition gets its own FacetsCollector that + // pulls doc_values independently, and builds a bitset of the size of the entire segment. When + // segments are partitioned, each partition will collect only the docs in its docid range, hence + // there will be multiple MatchingDocs pointing to the same LeafReaderContext. As part of the + // reduction we merge back partitions into a single MatchingDocs per segment. + Map matchingDocsMap = new HashMap<>(); + for (FacetsCollector facetsCollector : facetsCollectors) { + for (FacetsCollector.MatchingDocs matchingDocs : facetsCollector.getMatchingDocs()) { + matchingDocsMap.compute( + matchingDocs.context(), + (leafReaderContext, existing) -> { + if (existing == null) { + return matchingDocs; + } + return merge(existing, matchingDocs); + }); + } } + return matchingDocsMap.values(); } + + private static FacetsCollector.MatchingDocs merge( + FacetsCollector.MatchingDocs matchingDocs1, FacetsCollector.MatchingDocs matchingDocs2) { + assert matchingDocs1.context() == matchingDocs2.context(); + final float[] scores; + + // scores array is null when keepScores is true, and may be null when there are no matches for a + // segment partition, despite keepScores is true. + if (matchingDocs1.scores() == null && matchingDocs2.scores() == null) { + scores = new float[0]; + } else { + if (matchingDocs2.scores() == null) { + scores = matchingDocs1.scores(); + } else if (matchingDocs1.scores() == null) { + scores = matchingDocs2.scores(); + } else { + int length = Math.max(matchingDocs1.scores().length, matchingDocs2.scores().length); + // merge the arrays if both have values, their size is bound to the highest collected docid + scores = new float[length]; + for (int i = 0; i < length; i++) { + float firstScore = i < matchingDocs1.scores().length ? 
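Aside (not part of the patch): the new `FacetsCollectorManager(boolean keepScores)` constructor above lets the manager hand out score-keeping collectors. A sketch of opting in, assuming `searcher` and `query`; note that after this patch the per-segment scores array is indexed by segment-local doc id and may be null when a partition had no matches:

    FacetsCollectorManager fcm = new FacetsCollectorManager(true); // keepScores = true
    FacetsCollector fc = searcher.search(query, fcm);              // reduced collector
    for (FacetsCollector.MatchingDocs md : fc.getMatchingDocs()) {
      float[] scores = md.scores(); // may be null for an empty partition
    }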
matchingDocs1.scores()[i] : 0; + float secondScore = i < matchingDocs2.scores().length ? matchingDocs2.scores()[i] : 0; + assert (firstScore > 0 && secondScore > 0) == false; + scores[i] = Math.max(firstScore, secondScore); + } + } + } + DocIdSetBuilder docIdSetBuilder = + new DocIdSetBuilder(matchingDocs1.context().reader().maxDoc()); + try { + docIdSetBuilder.add(matchingDocs1.bits().iterator()); + docIdSetBuilder.add(matchingDocs2.bits().iterator()); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + int totalHits = matchingDocs1.totalHits() + matchingDocs2.totalHits(); + return new FacetsCollector.MatchingDocs( + matchingDocs1.context(), docIdSetBuilder.build(), totalHits, scores); + } + + /** + * Utility method, to search and also populate a {@code FacetsCollector} with hits. The provided + * {@code FacetsCollectorManager} will be used for creating/reducing {@code FacetsCollector} + * instances. + */ + public static FacetsResult search( + IndexSearcher searcher, Query q, int n, FacetsCollectorManager fcm) throws IOException { + return doSearch(searcher, null, q, n, null, false, fcm); + } + + /** + * Utility method, to search and also populate a {@code FacetsCollector} with hits. The provided + * {@code FacetsCollectorManager} will be used for creating/reducing {@code FacetsCollector} + * instances. + */ + public static FacetsResult search( + IndexSearcher searcher, Query q, int n, Sort sort, FacetsCollectorManager fcm) + throws IOException { + if (sort == null) { + throw new IllegalArgumentException("sort must not be null"); + } + return doSearch(searcher, null, q, n, sort, false, fcm); + } + + /** + * Utility method, to search and also populate a {@code FacetsCollector} with hits. The provided + * {@code FacetsCollectorManager} will be used for creating/reducing {@code FacetsCollector} + * instances. + */ + public static FacetsResult search( + IndexSearcher searcher, + Query q, + int n, + Sort sort, + boolean doDocScores, + FacetsCollectorManager fcm) + throws IOException { + if (sort == null) { + throw new IllegalArgumentException("sort must not be null"); + } + return doSearch(searcher, null, q, n, sort, doDocScores, fcm); + } + + /** + * Utility method, to search and also populate a {@code FacetsCollector} with hits. The provided + * {@code FacetsCollectorManager} will be used for creating/reducing {@code FacetsCollector} + * instances. + */ + public static FacetsResult searchAfter( + IndexSearcher searcher, ScoreDoc after, Query q, int n, FacetsCollectorManager fcm) + throws IOException { + return doSearch(searcher, after, q, n, null, false, fcm); + } + + /** + * Utility method, to search and also populate a {@code FacetsCollector} with hits. The provided + * {@code FacetsCollectorManager} will be used for creating/reducing {@code FacetsCollector} + * instances. + */ + public static FacetsResult searchAfter( + IndexSearcher searcher, ScoreDoc after, Query q, int n, Sort sort, FacetsCollectorManager fcm) + throws IOException { + if (sort == null) { + throw new IllegalArgumentException("sort must not be null"); + } + return doSearch(searcher, after, q, n, sort, false, fcm); + } + + /** + * Utility method, to search and also populate a {@code FacetsCollector} with hits. The provided + * {@code FacetsCollectorManager} will be used for creating/reducing {@code FacetsCollector} + * instances. 
+ */ + public static FacetsResult searchAfter( + IndexSearcher searcher, + ScoreDoc after, + Query q, + int n, + Sort sort, + boolean doDocScores, + FacetsCollectorManager fcm) + throws IOException { + if (sort == null) { + throw new IllegalArgumentException("sort must not be null"); + } + return doSearch(searcher, after, q, n, sort, doDocScores, fcm); + } + + private static FacetsResult doSearch( + IndexSearcher searcher, + ScoreDoc after, + Query q, + int n, + Sort sort, + boolean doDocScores, + FacetsCollectorManager fcm) + throws IOException { + + int limit = searcher.getIndexReader().maxDoc(); + if (limit == 0) { + limit = 1; + } + n = Math.min(n, limit); + + if (after != null && after.doc >= limit) { + throw new IllegalArgumentException( + "after.doc exceeds the number of documents in the reader: after.doc=" + + after.doc + + " limit=" + + limit); + } + + final TopDocs topDocs; + final FacetsCollector facetsCollector; + if (n == 0) { + TotalHitCountCollectorManager hitCountCollectorManager = + new TotalHitCountCollectorManager(searcher.getSlices()); + MultiCollectorManager multiCollectorManager = + new MultiCollectorManager(hitCountCollectorManager, fcm); + Object[] result = searcher.search(q, multiCollectorManager); + topDocs = + new TopDocs( + new TotalHits((Integer) result[0], TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]); + facetsCollector = (FacetsCollector) result[1]; + } else { + final MultiCollectorManager multiCollectorManager; + if (sort != null) { + if (after != null && !(after instanceof FieldDoc)) { + // TODO: if we fix type safety of TopFieldDocs we can + // remove this + throw new IllegalArgumentException("after must be a FieldDoc; got " + after); + } + TopFieldCollectorManager topFieldCollectorManager = + new TopFieldCollectorManager(sort, n, (FieldDoc) after, Integer.MAX_VALUE); + multiCollectorManager = new MultiCollectorManager(topFieldCollectorManager, fcm); + } else { + TopScoreDocCollectorManager topScoreDocCollectorManager = + new TopScoreDocCollectorManager(n, after, Integer.MAX_VALUE); + multiCollectorManager = new MultiCollectorManager(topScoreDocCollectorManager, fcm); + } + Object[] result = searcher.search(q, multiCollectorManager); + topDocs = (TopDocs) result[0]; + if (doDocScores) { + TopFieldCollector.populateScores(topDocs.scoreDocs, searcher, q); + } + facetsCollector = (FacetsCollector) result[1]; + } + return new FacetsResult(topDocs, facetsCollector); + } + + /** + * Holds results of a search run via static utility methods exposed by this class. 
Those include + * {@link TopDocs} as well as facets result included in the returned {@link FacetsCollector} + * + * @param topDocs the top docs + * @param facetsCollector the facets result included in a {@link FacetsCollector} instance + */ + public record FacetsResult(TopDocs topDocs, FacetsCollector facetsCollector) {} } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java index f83b64912a38..346c2dc369e0 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java @@ -172,19 +172,19 @@ private void count(LongValuesSource valueSource, List matchingDocs throws IOException { for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } initializeCounters(); - LongValues fv = valueSource.getValues(hits.context, null); + LongValues fv = valueSource.getValues(hits.context(), null); // NOTE: this is not as efficient as working directly with the doc values APIs in the sparse // case // because we are doing a linear scan across all hits, but this API is more flexible since a // LongValuesSource can compute interesting values at query time - DocIdSetIterator docs = hits.bits.iterator(); + DocIdSetIterator docs = hits.bits().iterator(); for (int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) { // Skip missing docs: if (fv.advanceExact(doc)) { @@ -201,14 +201,14 @@ private void count(LongValuesSource valueSource, List matchingDocs private void count(MultiLongValuesSource valuesSource, List matchingDocs) throws IOException { for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } initializeCounters(); - MultiLongValues multiValues = valuesSource.getValues(hits.context); + MultiLongValues multiValues = valuesSource.getValues(hits.context()); - DocIdSetIterator docs = hits.bits.iterator(); + DocIdSetIterator docs = hits.bits().iterator(); for (int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) { // Skip missing docs: if (multiValues.advanceExact(doc)) { @@ -235,18 +235,20 @@ private void count(MultiLongValuesSource valuesSource, List matchi /** Counts from the field's indexed doc values. 
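Aside (not part of the patch): a sketch of the new static helper that replaces the removed FacetsCollector.search utilities, returning both TopDocs and the reduced FacetsCollector in one FacetsResult. The taxonomy setup (`taxoReader`, `config`) and the "Author" dimension are illustrative assumptions; `searcher` and `query` are assumed to exist:

    FacetsCollectorManager fcm = new FacetsCollectorManager();
    FacetsCollectorManager.FacetsResult fr =
        FacetsCollectorManager.search(searcher, query, 10, fcm);
    TopDocs hits = fr.topDocs();
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fr.facetsCollector());
    FacetResult topAuthors = facets.getTopChildren(10, "Author"); // hypothetical dimension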
*/ private void count(String field, List matchingDocs) throws IOException { for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } initializeCounters(); - SortedNumericDocValues multiValues = DocValues.getSortedNumeric(hits.context.reader(), field); + SortedNumericDocValues multiValues = + DocValues.getSortedNumeric(hits.context().reader(), field); NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues); if (singleValues != null) { DocIdSetIterator it = - ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), singleValues)); + ConjunctionUtils.intersectIterators( + Arrays.asList(hits.bits().iterator(), singleValues)); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { increment(singleValues.longValue()); @@ -255,7 +257,7 @@ private void count(String field, List matchingDocs) throws IOExcep } else { DocIdSetIterator it = - ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), multiValues)); + ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits().iterator(), multiValues)); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { int limit = multiValues.docValueCount(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/MultiDoubleValuesSource.java b/lucene/facet/src/java/org/apache/lucene/facet/MultiDoubleValuesSource.java index 44427965b8c0..9a53df274c35 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/MultiDoubleValuesSource.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/MultiDoubleValuesSource.java @@ -26,6 +26,7 @@ import org.apache.lucene.search.DoubleValuesSource; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.SegmentCacheable; +import org.apache.lucene.util.NumericUtils; /** * Base class for producing {@link MultiDoubleValues}. 
See also {@link DoubleValuesSource} for a @@ -118,6 +119,65 @@ public final MultiLongValuesSource toMultiLongValuesSource() { return new LongDoubleValuesSource(this); } + /** Convert to a {@link MultiLongValuesSource} using {@link NumericUtils#doubleToSortableLong} */ + public final MultiLongValuesSource toSortableMultiLongValuesSource() { + return new SortableMultiLongValuesSource(this); + } + + private static class SortableMultiLongValuesSource extends MultiLongValuesSource { + + MultiDoubleValuesSource inner; + + SortableMultiLongValuesSource(MultiDoubleValuesSource inner) { + this.inner = Objects.requireNonNull(inner); + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return inner.isCacheable(ctx); + } + + @Override + public MultiLongValues getValues(LeafReaderContext ctx) throws IOException { + MultiDoubleValues doubleValues = inner.getValues(ctx); + + return new MultiLongValues() { + @Override + public long getValueCount() { + return doubleValues.getValueCount(); + } + + @Override + public long nextValue() throws IOException { + return NumericUtils.doubleToSortableLong(doubleValues.nextValue()); + } + + @Override + public boolean advanceExact(int doc) throws IOException { + return doubleValues.advanceExact(doc); + } + }; + } + + @Override + public int hashCode() { + return inner.hashCode(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + SortableMultiLongValuesSource that = (SortableMultiLongValuesSource) o; + return Objects.equals(inner, that.inner); + } + + @Override + public String toString() { + return "sortableMultiLong(" + inner.toString() + ")"; + } + } + private static class FieldMultiValuedSource extends MultiDoubleValuesSource { private final String field; private final LongToDoubleFunction decoder; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java b/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java index de7c0e27323e..32cda39b013a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java @@ -33,7 +33,7 @@ * Collects hits for subsequent faceting, using sampling if needed. Once you've run a search and * collect hits into this, instantiate one of the {@link Facets} subclasses to do the facet * counting. Note that this collector does not collect the scores of matching docs (i.e. {@link - * FacetsCollector.MatchingDocs#scores}) is {@code null}. + * FacetsCollector.MatchingDocs#scores()}) is {@code null}. * *

    If you require the original set of hits, you can call {@link #getOriginalMatchingDocs()}. * Also, since the counts of the top-facets is based on the sampled set, you can amortize the counts @@ -125,7 +125,7 @@ public List getMatchingDocs() { if (totalHits == NOT_CALCULATED) { totalHits = 0; for (MatchingDocs md : matchingDocs) { - totalHits += md.totalHits; + totalHits += md.totalHits(); } } @@ -156,7 +156,7 @@ private List createSampledDocs(List matchingDocsList /** Create a sampled of the given hits. */ private MatchingDocs createSample(MatchingDocs docs) { - int maxdoc = docs.context.reader().maxDoc(); + int maxdoc = docs.context().reader().maxDoc(); // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse FixedBitSet sampleDocs = new FixedBitSet(maxdoc); @@ -175,7 +175,7 @@ private MatchingDocs createSample(MatchingDocs docs) { limit = binSize; randomIndex = random.nextInt(binSize); } - final DocIdSetIterator it = docs.bits.iterator(); + final DocIdSetIterator it = docs.bits().iterator(); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { if (counter == randomIndex) { sampleDocs.set(doc); @@ -206,7 +206,7 @@ private MatchingDocs createSample(MatchingDocs docs) { } } - return new MatchingDocs(docs.context, new BitDocIdSet(sampleDocs), docs.totalHits, null); + return new MatchingDocs(docs.context(), new BitDocIdSet(sampleDocs), docs.totalHits(), null); } catch (IOException e) { throw new RuntimeException(e); } @@ -288,9 +288,8 @@ private static class ReducedRandomSamplingFacetsCollector extends RandomSampling ReducedRandomSamplingFacetsCollector( int sampleSize, long seed, Collection facetsCollectors) { super(sampleSize, seed); - facetsCollectors.forEach( - facetsCollector -> - getOriginalMatchingDocs().addAll(facetsCollector.getOriginalMatchingDocs())); + this.getOriginalMatchingDocs() + .addAll(FacetsCollectorManager.reduceMatchingDocs(facetsCollectors)); } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java index 655e80546f86..a3b6e85ad07b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java @@ -108,8 +108,8 @@ public StringValueFacetCounts(StringDocValuesReaderState state, FacetsCollector int totalHits = 0; int totalDocs = 0; for (FacetsCollector.MatchingDocs matchingDocs : facetsCollector.getMatchingDocs()) { - totalHits += matchingDocs.totalHits; - totalDocs += matchingDocs.context.reader().maxDoc(); + totalHits += matchingDocs.totalHits(); + totalDocs += matchingDocs.context().reader().maxDoc(); } // No counting needed if there are no hits: @@ -297,22 +297,22 @@ private void count(FacetsCollector facetsCollector) throws IOException { if (matchingDocs.size() == 1) { FacetsCollector.MatchingDocs hits = matchingDocs.get(0); - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { return; } // Validate state before doing anything else: - validateState(hits.context); + validateState(hits.context()); // Assuming the state is valid, ordinalMap should be null since we have one segment: assert ordinalMap == null; - countOneSegment(docValues, hits.context.ord, hits, null); + countOneSegment(docValues, hits.context().ord, hits, null); } else { // Validate state before doing anything else. 
We only check the first segment since they // should all ladder up to the same top-level reader: - validateState(matchingDocs.get(0).context); + validateState(matchingDocs.get(0).context()); for (FacetsCollector.MatchingDocs hits : matchingDocs) { // Assuming the state is valid, ordinalMap should be non-null and docValues should be @@ -320,14 +320,14 @@ private void count(FacetsCollector facetsCollector) throws IOException { assert ordinalMap != null; assert docValues instanceof MultiDocValues.MultiSortedSetDocValues; - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } MultiDocValues.MultiSortedSetDocValues multiValues = (MultiDocValues.MultiSortedSetDocValues) docValues; - countOneSegment(multiValues.values[hits.context.ord], hits.context.ord, hits, null); + countOneSegment(multiValues.values[hits.context().ord], hits.context().ord, hits, null); } } } @@ -398,7 +398,7 @@ private void countOneSegment( assert liveDocs != null; it = FacetUtils.liveDocsDISI(valuesIt, liveDocs); } else { - it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt)); + it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits().iterator(), valuesIt)); } // TODO: yet another option is to count all segs @@ -438,7 +438,7 @@ private void countOneSegment( final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd); int segmentCardinality = (int) multiValues.getValueCount(); - if (hits != null && hits.totalHits < segmentCardinality / 10) { + if (hits != null && hits.totalHits() < segmentCardinality / 10) { // Remap every ord to global ord as we iterate: if (singleValues != null) { for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/facetset/DimRange.java b/lucene/facet/src/java/org/apache/lucene/facet/facetset/DimRange.java index 777e91ad6c25..c4f0599b24c8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/facetset/DimRange.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/facetset/DimRange.java @@ -21,25 +21,11 @@ /** * Defines a single range in a {@link FacetSet} dimension. * + * @param min Inclusive min + * @param max Inclusive max * @lucene.experimental */ -public class DimRange { - /** Inclusive min */ - public final long min; - - /** Inclusive max */ - public final long max; - - /** - * Creates a LongRange. - * - * @param min inclusive min value in range - * @param max inclusive max value in range - */ - public DimRange(long min, long max) { - this.min = min; - this.max = max; - } +public record DimRange(long min, long max) { /** * Creates a {@link DimRange} for the given min and max long values. 
This method is also suitable diff --git a/lucene/facet/src/java/org/apache/lucene/facet/facetset/MatchingFacetSetsCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/facetset/MatchingFacetSetsCounts.java index c8151ab471e9..b17730e7e42d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/facetset/MatchingFacetSetsCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/facetset/MatchingFacetSetsCounts.java @@ -91,7 +91,7 @@ private int count(String field, List matchingDocs) int totCount = 0; for (FacetsCollector.MatchingDocs hits : matchingDocs) { - BinaryDocValues binaryDocValues = DocValues.getBinary(hits.context.reader(), field); + BinaryDocValues binaryDocValues = DocValues.getBinary(hits.context().reader(), field); final DocIdSetIterator it = createIterator(hits, binaryDocValues); if (it == null) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/facetset/RangeFacetSetMatcher.java b/lucene/facet/src/java/org/apache/lucene/facet/facetset/RangeFacetSetMatcher.java index 9ed12e0b6b2d..926237a53276 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/facetset/RangeFacetSetMatcher.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/facetset/RangeFacetSetMatcher.java @@ -34,8 +34,8 @@ public class RangeFacetSetMatcher extends FacetSetMatcher { */ public RangeFacetSetMatcher(String label, DimRange... dimRanges) { super(label, getDims(dimRanges)); - this.lowerRanges = Arrays.stream(dimRanges).mapToLong(range -> range.min).toArray(); - this.upperRanges = Arrays.stream(dimRanges).mapToLong(range -> range.max).toArray(); + this.lowerRanges = Arrays.stream(dimRanges).mapToLong(DimRange::min).toArray(); + this.upperRanges = Arrays.stream(dimRanges).mapToLong(DimRange::max).toArray(); } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/facet/package-info.java b/lucene/facet/src/java/org/apache/lucene/facet/package-info.java index afa48744bbb7..33e9d6b9264e 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/package-info.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/package-info.java @@ -46,8 +46,8 @@ * all methods implement a common {@link org.apache.lucene.facet.Facets} base API that you use to * obtain specific facet counts. * - *
<p>
    The various {@link org.apache.lucene.facet.FacetsCollector#search} utility methods are useful - * for doing an "ordinary" search (sorting by score, or by a specified Sort) but also collecting - * into a {@link org.apache.lucene.facet.FacetsCollector} for subsequent faceting. + *
<p>
    The various {@link org.apache.lucene.facet.FacetsCollectorManager#search} utility methods are + * useful for doing an "ordinary" search (sorting by score, or by a specified Sort) but also + * collecting into a {@link org.apache.lucene.facet.FacetsCollectorManager} for subsequent faceting. */ package org.apache.lucene.facet; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/range/DoubleRangeFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/range/DoubleRangeFacetCounts.java index 701d2471bb43..d2d9d95480a0 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/range/DoubleRangeFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/range/DoubleRangeFacetCounts.java @@ -160,7 +160,7 @@ private void count(DoubleValuesSource valueSource, List matchingDo LongRangeCounter counter = null; int missingCount = 0; for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } @@ -173,8 +173,8 @@ private void count(DoubleValuesSource valueSource, List matchingDo counter = setupCounter(); } - DoubleValues fv = valueSource.getValues(hits.context, null); - totCount += hits.totalHits; + DoubleValues fv = valueSource.getValues(hits.context(), null); + totCount += hits.totalHits(); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) { // Skip missing docs: @@ -201,7 +201,7 @@ private void count(MultiDoubleValuesSource valueSource, List match LongRangeCounter counter = null; // LongRangeCounter.create(longRanges, counts); int missingCount = 0; for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } @@ -214,7 +214,7 @@ private void count(MultiDoubleValuesSource valueSource, List match counter = setupCounter(); } - MultiDoubleValues multiValues = valueSource.getValues(hits.context); + MultiDoubleValues multiValues = valueSource.getValues(hits.context()); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) { // Skip missing docs: diff --git a/lucene/facet/src/java/org/apache/lucene/facet/range/DynamicRangeUtil.java b/lucene/facet/src/java/org/apache/lucene/facet/range/DynamicRangeUtil.java new file mode 100644 index 000000000000..b6ae71217f27 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/range/DynamicRangeUtil.java @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.facet.range; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.LongValues; +import org.apache.lucene.search.LongValuesSource; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.InPlaceMergeSorter; + +/** + * Methods to create dynamic ranges for numeric fields. + * + * @lucene.experimental + */ +public final class DynamicRangeUtil { + + private DynamicRangeUtil() {} + + /** + * Construct dynamic ranges using the specified weight field to generate equi-weight range for the + * specified numeric bin field + * + * @param weightFieldName Name of the specified weight field + * @param weightValueSource Value source of the weight field + * @param fieldValueSource Value source of the value field + * @param facetsCollector FacetsCollector + * @param topN Number of requested ranges + * @param exec An executor service that is used to do the computation + * @return A list of DynamicRangeInfo that contains count, relevance, min, max, and centroid for + * each range + */ + public static List computeDynamicRanges( + String weightFieldName, + LongValuesSource weightValueSource, + LongValuesSource fieldValueSource, + FacetsCollector facetsCollector, + int topN, + ExecutorService exec) + throws IOException { + + List matchingDocsList = facetsCollector.getMatchingDocs(); + int totalDoc = + matchingDocsList.stream().mapToInt(FacetsCollector.MatchingDocs::totalHits).sum(); + long[] values = new long[totalDoc]; + long[] weights = new long[totalDoc]; + long totalWeight = 0; + int overallLength = 0; + + List> futures = new ArrayList<>(); + List tasks = new ArrayList<>(); + for (FacetsCollector.MatchingDocs matchingDocs : matchingDocsList) { + if (matchingDocs.totalHits() > 0) { + SegmentOutput segmentOutput = new SegmentOutput(matchingDocs.totalHits()); + + // [1] retrieve values and associated weights concurrently + SegmentTask task = + new SegmentTask(matchingDocs, fieldValueSource, weightValueSource, segmentOutput); + tasks.add(task); + futures.add(exec.submit(task)); + } + } + + // [2] wait for all segment runs to finish + for (Future future : futures) { + try { + future.get(); + } catch (InterruptedException ie) { + throw new RuntimeException(ie); + } catch (ExecutionException ee) { + IOUtils.rethrowAlways(ee.getCause()); + } + } + + // [3] merge the segment value and weight arrays into one array respectively and update the + // total weights + // and valid value length + for (SegmentTask task : tasks) { + SegmentOutput curSegmentOutput = task.segmentOutput; + // if segment total weight overflows, return null + if (curSegmentOutput == null) { + return null; + } + + assert curSegmentOutput.values.length == curSegmentOutput.weights.length; + + try { + totalWeight = Math.addExact(curSegmentOutput.segmentTotalWeight, totalWeight); + } catch (ArithmeticException ae) { + throw new IllegalArgumentException( + "weight field \"" + weightFieldName + "\": long totalWeight value out of bounds", ae); + } + + int currSegmentLen = curSegmentOutput.segmentIdx; + System.arraycopy(curSegmentOutput.values, 0, values, overallLength, currSegmentLen); + System.arraycopy(curSegmentOutput.weights, 0, weights, overallLength, currSegmentLen); + 
overallLength += currSegmentLen; + } + return computeDynamicNumericRanges(values, weights, overallLength, totalWeight, topN); + } + + private static class SegmentTask implements Callable { + private final FacetsCollector.MatchingDocs matchingDocs; + private final DocIdSetIterator matchingParentDocsItr; + private final LongValuesSource fieldValueSource; + private final LongValuesSource weightValueSource; + private SegmentOutput segmentOutput; + + SegmentTask( + FacetsCollector.MatchingDocs matchingDocs, + LongValuesSource fieldValueSource, + LongValuesSource weightValueSource, + SegmentOutput segmentOutput) + throws IOException { + this.matchingDocs = matchingDocs; + this.matchingParentDocsItr = matchingDocs.bits().iterator(); + this.fieldValueSource = fieldValueSource; + this.weightValueSource = weightValueSource; + this.segmentOutput = segmentOutput; + } + + @Override + public Void call() throws Exception { + LongValues fieldValue = fieldValueSource.getValues(matchingDocs.context(), null); + LongValues weightValue = weightValueSource.getValues(matchingDocs.context(), null); + for (int doc = matchingParentDocsItr.nextDoc(); + doc != DocIdSetIterator.NO_MORE_DOCS; + doc = matchingParentDocsItr.nextDoc()) { + // If this doc doesn't have a weight, we skip it. + if (fieldValue.advanceExact(doc) == false || weightValue.advanceExact(doc) == false) { + continue; + } + + long curValue = fieldValue.longValue(); + + long curWeight = weightValue.longValue(); + // We skip weights equal to zero, otherwise they can skew the ranges. + // Imagine all the weights were zero - any ranges would be valid. + if (curWeight == 0) { + continue; + } + + segmentOutput.values[segmentOutput.segmentIdx] = curValue; + segmentOutput.weights[segmentOutput.segmentIdx] = curWeight; + try { + segmentOutput.segmentTotalWeight = + Math.addExact(segmentOutput.segmentTotalWeight, curWeight); + } catch (ArithmeticException ae) { + throw new IllegalArgumentException("segment long totalWeight value out of bounds", ae); + } + segmentOutput.segmentIdx++; + } + return null; + } + } + + /** Holds field value array, weight array, totalWeight, valid value index for each segment */ + private static final class SegmentOutput { + private final long[] values; + private final long[] weights; + private long segmentTotalWeight; + private int segmentIdx; + + public SegmentOutput(int hitsLength) { + this.values = new long[hitsLength]; + this.weights = new long[hitsLength]; + } + } + + /** + * Compute dynamic numeric ranges using weights. + * + * @param values an array that contains the values of matching documents + * @param weights an array that contains the weights of matching documents + * @param len actual length of values and weights + * @param totalWeight the sum of weight values + * @param topN the requested top-n parameter + * @return A list of DynamicRangeInfo that contains count, relevance, min, max, and centroid + * values for each range. The size of dynamic ranges may not be exactly equal to top-N. top-N + * is used to compute the equi-weight per bin. 
+ */ + public static List computeDynamicNumericRanges( + long[] values, long[] weights, int len, long totalWeight, int topN) { + assert values.length == weights.length && len <= values.length && len >= 0; + assert topN >= 0; + List dynamicRangeResult = new ArrayList<>(); + if (len == 0 || topN == 0) { + return dynamicRangeResult; + } + + new InPlaceMergeSorter() { + @Override + protected int compare(int index1, int index2) { + int cmp = Long.compare(values[index1], values[index2]); + if (cmp == 0) { + // If the values are equal, sort based on the weights. + // Any weight order is correct as long as it's deterministic. + return Long.compare(weights[index1], weights[index2]); + } + return cmp; + } + + @Override + protected void swap(int index1, int index2) { + long tmp = values[index1]; + values[index1] = values[index2]; + values[index2] = tmp; + tmp = weights[index1]; + weights[index1] = weights[index2]; + weights[index2] = tmp; + } + }.sort(0, len); + + long accuWeight = 0; + long valueSum = 0; + int count = 0; + int minIdx = 0; + + double rangeWeightTarget = (double) totalWeight / Math.min(topN, len); + + for (int i = 0; i < len; i++) { + accuWeight += weights[i]; + valueSum += values[i]; + count++; + + if (accuWeight >= rangeWeightTarget) { + dynamicRangeResult.add( + new DynamicRangeInfo( + count, accuWeight, values[minIdx], values[i], (double) valueSum / count)); + count = 0; + accuWeight = 0; + valueSum = 0; + minIdx = i + 1; + } + } + + // capture the remaining values to create the last range + if (minIdx < len) { + dynamicRangeResult.add( + new DynamicRangeInfo( + count, accuWeight, values[minIdx], values[len - 1], (double) valueSum / count)); + } + return dynamicRangeResult; + } + + /** + * Holds parameters of a dynamic numeric range. + * + * @param count the number of items in the range + * @param weight the summed weight of the items in the range + * @param min the lower bound of the range (inclusive) + * @param max the upper bound of the range (inclusive) + * @param centroid the average value in the range + */ + public record DynamicRangeInfo(int count, long weight, long min, long max, double centroid) {} +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/range/ExclusiveLongRangeCounter.java b/lucene/facet/src/java/org/apache/lucene/facet/range/ExclusiveLongRangeCounter.java index f8c4d7e763f7..e6f852b6b106 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/range/ExclusiveLongRangeCounter.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/range/ExclusiveLongRangeCounter.java @@ -161,13 +161,5 @@ private static List buildElementaryIntervals(LongRangeAndPos[] s } /** Simple container for a requested range and its original position */ - private static final class LongRangeAndPos { - final LongRange range; - final int pos; - - LongRangeAndPos(LongRange range, int pos) { - this.range = range; - this.pos = pos; - } - } + private record LongRangeAndPos(LongRange range, int pos) {} } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/range/LongRangeFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/range/LongRangeFacetCounts.java index 34aa3fcd0627..adb06006931d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/range/LongRangeFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/range/LongRangeFacetCounts.java @@ -133,7 +133,7 @@ private void count(LongValuesSource valueSource, List matchingDocs int missingCount = 0; for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { 
continue; } @@ -146,8 +146,8 @@ private void count(LongValuesSource valueSource, List matchingDocs counter = setupCounter(); } - LongValues fv = valueSource.getValues(hits.context, null); - totCount += hits.totalHits; + LongValues fv = valueSource.getValues(hits.context(), null); + totCount += hits.totalHits(); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) { // Skip missing docs: @@ -174,7 +174,7 @@ private void count(MultiLongValuesSource valueSource, List matchin LongRangeCounter counter = null; for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } @@ -187,7 +187,7 @@ private void count(MultiLongValuesSource valueSource, List matchin counter = setupCounter(); } - MultiLongValues multiValues = valueSource.getValues(hits.context); + MultiLongValues multiValues = valueSource.getValues(hits.context()); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) { // Skip missing docs: diff --git a/lucene/facet/src/java/org/apache/lucene/facet/range/RangeFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/range/RangeFacetCounts.java index 8bf9c352e34f..8ab9a5f36485 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/range/RangeFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/range/RangeFacetCounts.java @@ -80,11 +80,12 @@ protected void count(String field, List matchingDo for (int i = 0; i < matchingDocs.size(); i++) { FacetsCollector.MatchingDocs hits = matchingDocs.get(i); - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } - SortedNumericDocValues multiValues = DocValues.getSortedNumeric(hits.context.reader(), field); + SortedNumericDocValues multiValues = + DocValues.getSortedNumeric(hits.context().reader(), field); if (multiValuedDocVals == null) { multiValuedDocVals = new SortedNumericDocValues[matchingDocs.size()]; } @@ -135,7 +136,7 @@ protected void count(String field, List matchingDo assert singleValuedDocVals != null; NumericDocValues singleValues = singleValuedDocVals[i]; - totCount += hits.totalHits; + totCount += hits.totalHits(); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) { if (singleValues.advanceExact(doc)) { counter.addSingleValued(mapDocValue(singleValues.longValue())); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/rangeonrange/RangeOnRangeFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/rangeonrange/RangeOnRangeFacetCounts.java index 86c665d6e84e..7aee5bfdda2d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/rangeonrange/RangeOnRangeFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/rangeonrange/RangeOnRangeFacetCounts.java @@ -89,14 +89,14 @@ protected void count( BinaryRangeDocValues binaryRangeDocValues = new BinaryRangeDocValues( - DocValues.getBinary(hits.context.reader(), field), dims, numEncodedValueBytes); + DocValues.getBinary(hits.context().reader(), field), dims, numEncodedValueBytes); final DocIdSetIterator it = createIterator(hits); if (it == null) { continue; } - totCount += hits.totalHits; + totCount += hits.totalHits(); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) { if (binaryRangeDocValues.advanceExact(doc)) { boolean hasValidRange = false; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java index 03a0ce72190e..45464180f57f 100644 --- 
a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java @@ -20,7 +20,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -40,20 +39,6 @@ /** Base class for SSDV faceting implementations. */ abstract class AbstractSortedSetDocValueFacetCounts extends Facets { - private static final Comparator FACET_RESULT_COMPARATOR = - new Comparator<>() { - @Override - public int compare(FacetResult a, FacetResult b) { - if (a.value.intValue() > b.value.intValue()) { - return -1; - } else if (b.value.intValue() > a.value.intValue()) { - return 1; - } else { - return a.dim.compareTo(b.dim); - } - } - }; - final SortedSetDocValuesReaderState state; final FacetsConfig stateConfig; final SortedSetDocValues dv; @@ -140,7 +125,16 @@ public List getAllDims(int topN) throws IOException { } // Sort by highest count: - results.sort(FACET_RESULT_COMPARATOR); + results.sort( + (a, b) -> { + if (a.value.intValue() > b.value.intValue()) { + return -1; + } else if (b.value.intValue() > a.value.intValue()) { + return 1; + } else { + return a.dim.compareTo(b.dim); + } + }); return results; } @@ -183,7 +177,7 @@ protected boolean lessThan(DimValue a, DimValue b) { dimCount = getCount(dimOrd); } else { OrdRange ordRange = state.getOrdRange(dim); - int dimOrd = ordRange.start; + int dimOrd = ordRange.start(); if (dimConfig.multiValued) { if (dimConfig.requireDimCount) { // If a dim is configured as multi-valued and requires dim count, we index dim counts @@ -284,7 +278,7 @@ private ChildIterationCursor prepareChildIteration( // means dimension was never indexed return null; } - pathOrd = ordRange.start; + pathOrd = ordRange.start(); childIterator = ordRange.iterator(); if (dimConfig.multiValued && dimConfig.requireDimCount) { // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed @@ -412,13 +406,5 @@ static final class DimValue { } } - static final class ChildIterationCursor { - final int pathOrd; - final PrimitiveIterator.OfInt childIterator; - - ChildIterationCursor(int pathOrd, PrimitiveIterator.OfInt childIterator) { - this.pathOrd = pathOrd; - this.childIterator = childIterator; - } - } + record ChildIterationCursor(int pathOrd, PrimitiveIterator.OfInt childIterator) {} } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java index 0b03da9135b5..3a74805a8d8f 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java @@ -107,7 +107,7 @@ public CountOneSegment( @Override public Void call() throws IOException { // If we're counting collected hits but there were none, short-circuit: - if (hits != null && hits.totalHits == 0) { + if (hits != null && hits.totalHits() == 0) { return null; } @@ -140,7 +140,7 @@ public Void call() throws IOException { final Bits liveDocs = leafReader.getLiveDocs(); it = (liveDocs != null) ? 
FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt; } else { - it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt)); + it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits().iterator(), valuesIt)); } if (ordinalMap != null) { @@ -148,7 +148,7 @@ public Void call() throws IOException { int numSegOrds = (int) multiValues.getValueCount(); - if (hits != null && hits.totalHits < numSegOrds / 10) { + if (hits != null && hits.totalHits() < numSegOrds / 10) { // Remap every ord to global ord as we iterate: if (singleValues != null) { if (singleValues == it) { @@ -295,14 +295,14 @@ private void count(List matchingDocs) throws IOException, Interrup // the top-level reader passed to the // SortedSetDocValuesReaderState, else cryptic // AIOOBE can happen: - if (ReaderUtil.getTopLevelContext(hits.context).reader() != reader) { + if (ReaderUtil.getTopLevelContext(hits.context()).reader() != reader) { throw new IllegalStateException( "the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader"); } results.add( exec.submit( - new CountOneSegment(hits.context.reader(), hits, ordinalMap, hits.context.ord))); + new CountOneSegment(hits.context().reader(), hits, ordinalMap, hits.context().ord))); } for (Future result : results) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java index 0d0cf460bfd7..2bb6704bc443 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java @@ -174,7 +174,7 @@ private void countOneSegmentNHLD(OrdinalMap ordinalMap, LeafReader reader, int s private void countOneSegment( OrdinalMap ordinalMap, LeafReader reader, int segOrd, MatchingDocs hits, Bits liveDocs) throws IOException { - if (hits != null && hits.totalHits == 0) { + if (hits != null && hits.totalHits() == 0) { return; } @@ -197,7 +197,7 @@ private void countOneSegment( assert liveDocs != null; it = FacetUtils.liveDocsDISI(valuesIt, liveDocs); } else { - it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt)); + it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits().iterator(), valuesIt)); } // TODO: yet another option is to count all segs @@ -214,7 +214,7 @@ private void countOneSegment( int numSegOrds = (int) multiValues.getValueCount(); - if (hits != null && hits.totalHits < numSegOrds / 10) { + if (hits != null && hits.totalHits() < numSegOrds / 10) { // Remap every ord to global ord as we iterate: if (singleValues != null) { for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { @@ -293,12 +293,12 @@ private void count(List matchingDocs) throws IOException { // the top-level reader passed to the // SortedSetDocValuesReaderState, else cryptic // AIOOBE can happen: - if (ReaderUtil.getTopLevelContext(hits.context).reader() != reader) { + if (ReaderUtil.getTopLevelContext(hits.context()).reader() != reader) { throw new IllegalStateException( "the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader"); } - countOneSegment(ordinalMap, hits.context.reader(), 
hits.context.ord, hits, null); + countOneSegment(ordinalMap, hits.context().reader(), hits.context().ord, hits, null); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java index 31de3678db27..3f387faec7bc 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java @@ -38,20 +38,13 @@ */ public abstract class SortedSetDocValuesReaderState implements Accountable { - /** Holds start/end range of ords, which maps to one dimension. Only used for flat hierarchies. */ - public static final class OrdRange { - /** Start of range, inclusive: */ - public final int start; - - /** End of range, inclusive: */ - public final int end; - - /** Start and end are inclusive. */ - public OrdRange(int start, int end) { - this.start = start; - this.end = end; - } - + /** + * Holds start/end range of ords, which maps to one dimension. Only used for flat hierarchies. + * + * @param start Start of range, inclusive: + * @param end End of range, inclusive: + */ + public record OrdRange(int start, int end) { /** Iterates from start to end ord (inclusive) */ public PrimitiveIterator.OfInt iterator() { return new PrimitiveIterator.OfInt() { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FacetLabel.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FacetLabel.java index 4696e16e4ec5..8c8181f0706b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FacetLabel.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FacetLabel.java @@ -178,6 +178,14 @@ public FacetLabel subpath(final int length) { } } + /** Get the last component. */ + public String lastComponent() { + if (components.length == 0) { + throw new UnsupportedOperationException("components is empty"); + } + return components[components.length - 1]; + } + /** Returns a string representation of the path. */ @Override public String toString() { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java index 86ba905ab719..b47000368a90 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java @@ -71,11 +71,11 @@ public FastTaxonomyFacetCounts( private void count(List matchingDocs) throws IOException { for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } SortedNumericDocValues multiValued = - hits.context.reader().getSortedNumericDocValues(indexFieldName); + hits.context().reader().getSortedNumericDocValues(indexFieldName); if (multiValued == null) { continue; } @@ -85,7 +85,7 @@ private void count(List matchingDocs) throws IOException { DocIdSetIterator valuesIt = singleValued != null ? 
singleValued : multiValued; DocIdSetIterator it = - ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt)); + ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits().iterator(), valuesIt)); if (singleValued != null) { if (counts != null) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/SearcherTaxonomyManager.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/SearcherTaxonomyManager.java index 68abc6930ae8..5b768f48a053 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/SearcherTaxonomyManager.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/SearcherTaxonomyManager.java @@ -38,20 +38,14 @@ public class SearcherTaxonomyManager extends ReferenceManager { - /** Holds a matched pair of {@link IndexSearcher} and {@link TaxonomyReader} */ - public static class SearcherAndTaxonomy { - /** Point-in-time {@link IndexSearcher}. */ - public final IndexSearcher searcher; - - /** Matching point-in-time {@link DirectoryTaxonomyReader}. */ - public final DirectoryTaxonomyReader taxonomyReader; - - /** Create a SearcherAndTaxonomy */ - public SearcherAndTaxonomy(IndexSearcher searcher, DirectoryTaxonomyReader taxonomyReader) { - this.searcher = searcher; - this.taxonomyReader = taxonomyReader; - } - } + /** + * Holds a matched pair of {@link IndexSearcher} and {@link TaxonomyReader} + * + * @param searcher Point-in-time {@link IndexSearcher}. + * @param taxonomyReader Matching point-in-time {@link DirectoryTaxonomyReader}. + */ + public record SearcherAndTaxonomy( + IndexSearcher searcher, DirectoryTaxonomyReader taxonomyReader) {} private final SearcherFactory searcherFactory; private final long taxoEpoch; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java index 879d7fdf0c05..067db343b5d3 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java @@ -115,7 +115,7 @@ private static DoubleValues scores(MatchingDocs hits) { @Override public double doubleValue() throws IOException { - return hits.scores[index]; + return hits.scores()[index]; } @Override @@ -134,17 +134,17 @@ private void aggregateValues( DoubleValuesSource valueSource) throws IOException { for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } initializeValueCounters(); SortedNumericDocValues ordinalValues = - DocValues.getSortedNumeric(hits.context.reader(), indexFieldName); + DocValues.getSortedNumeric(hits.context().reader(), indexFieldName); DoubleValues scores = keepScores ? 
scores(hits) : null; - DoubleValues functionValues = valueSource.getValues(hits.context, scores); + DoubleValues functionValues = valueSource.getValues(hits.context(), scores); DocIdSetIterator it = - ConjunctionUtils.intersectIterators(List.of(hits.bits.iterator(), ordinalValues)); + ConjunctionUtils.intersectIterators(List.of(hits.bits().iterator(), ordinalValues)); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { if (functionValues.advanceExact(doc)) { @@ -171,14 +171,14 @@ private void aggregateValues( throws IOException { for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } initializeValueCounters(); - BinaryDocValues dv = DocValues.getBinary(hits.context.reader(), indexFieldName); + BinaryDocValues dv = DocValues.getBinary(hits.context().reader(), indexFieldName); DocIdSetIterator it = - ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), dv)); + ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits().iterator(), dv)); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { final BytesRef bytesRef = dv.binaryValue(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java index 86cc3d1f7147..363d6f6c24c5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java @@ -63,13 +63,14 @@ private void aggregateValues( AssociationAggregationFunction aggregationFunction, List matchingDocs) throws IOException { for (MatchingDocs hits : matchingDocs) { - if (hits.totalHits == 0) { + if (hits.totalHits() == 0) { continue; } initializeValueCounters(); - BinaryDocValues dv = DocValues.getBinary(hits.context.reader(), indexFieldName); - DocIdSetIterator it = ConjunctionUtils.intersectIterators(List.of(hits.bits.iterator(), dv)); + BinaryDocValues dv = DocValues.getBinary(hits.context().reader(), indexFieldName); + DocIdSetIterator it = + ConjunctionUtils.intersectIterators(List.of(hits.bits().iterator(), dv)); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { final BytesRef bytesRef = dv.binaryValue(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java index 8c0b1ddb6aa5..611e577adce5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java @@ -55,20 +55,6 @@ private static class DimValue { } } - private static final Comparator BY_VALUE_THEN_DIM = - new Comparator() { - @Override - public int compare(FacetResult a, FacetResult b) { - if (a.value.doubleValue() > b.value.doubleValue()) { - return -1; - } else if (b.value.doubleValue() > a.value.doubleValue()) { - return 1; - } else { - return a.dim.compareTo(b.dim); - } - } - }; - /** Index field name provided to the constructor. 
*/ final String indexFieldName; @@ -128,8 +114,8 @@ private boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { int maxDoc = 0; int sumTotalHits = 0; for (FacetsCollector.MatchingDocs docs : fc.getMatchingDocs()) { - sumTotalHits += docs.totalHits; - maxDoc += docs.context.reader().maxDoc(); + sumTotalHits += docs.totalHits(); + maxDoc += docs.context().reader().maxDoc(); } // if our result set is < 10% of the index, we collect sparsely (use hash map): @@ -591,7 +577,16 @@ public List getAllDims(int topN) throws IOException { } // Sort by highest value, tie break by dim: - results.sort(BY_VALUE_THEN_DIM); + results.sort( + (a, b) -> { + if (a.value.doubleValue() > b.value.doubleValue()) { + return -1; + } else if (b.value.doubleValue() > a.value.doubleValue()) { + return 1; + } else { + return a.dim.compareTo(b.dim); + } + }); return results; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java index f358ed418367..9a514486bad8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java @@ -16,15 +16,8 @@ */ package org.apache.lucene.facet.taxonomy.writercache; -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.OutputStream; -import java.io.Serializable; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.util.SuppressForbidden; /** * Similar to {@link StringBuilder}, but with a more efficient growing strategy. This class uses @@ -32,15 +25,11 @@ * * @lucene.experimental */ -class CharBlockArray implements Appendable, Serializable, CharSequence { - - private static final long serialVersionUID = 1L; +class CharBlockArray implements Appendable, CharSequence { private static final int DefaultBlockSize = 32 * 1024; // 32 KB default size - static final class Block implements Serializable, Cloneable { - private static final long serialVersionUID = 1L; - + static final class Block implements Cloneable { final char[] chars; int length; @@ -185,34 +174,4 @@ public String toString() { } return sb.toString(); } - - @SuppressForbidden( - reason = "TODO: don't use java serialization here, inefficient and unnecessary") - void flush(OutputStream out) throws IOException { - ObjectOutputStream oos = null; - try { - oos = new ObjectOutputStream(out); - oos.writeObject(this); - oos.flush(); - } finally { - if (oos != null) { - oos.close(); - } - } - } - - @SuppressForbidden( - reason = "TODO: don't use java serialization here, inefficient and unnecessary") - public static CharBlockArray open(InputStream in) throws IOException, ClassNotFoundException { - ObjectInputStream ois = null; - try { - ois = new ObjectInputStream(in); - CharBlockArray a = (CharBlockArray) ois.readObject(); - return a; - } finally { - if (ois != null) { - ois.close(); - } - } - } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java index 1023bb4cce30..ea6c06b87a59 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java @@ -68,8 +68,8 @@ public List> getAllTaxonomyFacetLabels( TaxonomyFacetLabels taxoLabels = new TaxonomyFacetLabels(taxoReader, 
FacetsConfig.DEFAULT_INDEX_FIELD_NAME); for (MatchingDocs m : fc.getMatchingDocs()) { - FacetLabelReader facetLabelReader = taxoLabels.getFacetLabelReader(m.context); - DocIdSetIterator disi = m.bits.iterator(); + FacetLabelReader facetLabelReader = taxoLabels.getFacetLabelReader(m.context()); + DocIdSetIterator disi = m.bits().iterator(); while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { actualLabels.add(allFacetLabels(disi.docID(), dimension, facetLabelReader)); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillDownQuery.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillDownQuery.java index d606009d79f0..5501744fbe79 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillDownQuery.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillDownQuery.java @@ -125,7 +125,7 @@ public void testAndOrs() throws Exception { q.add("a", "2"); q.add("b", "1"); TopDocs docs = searcher.search(q, 100); - assertEquals(5, docs.totalHits.value); + assertEquals(5, docs.totalHits.value()); } public void testQuery() throws IOException { @@ -136,14 +136,14 @@ public void testQuery() throws IOException { q.add("a"); QueryUtils.check(q); TopDocs docs = searcher.search(q, 100); - assertEquals(25, docs.totalHits.value); + assertEquals(25, docs.totalHits.value()); // Making sure the query yields 5 documents with the facet "b" and the // previous (facet "a") query as a base query DrillDownQuery q2 = new DrillDownQuery(config, q); q2.add("b"); docs = searcher.search(q2, 100); - assertEquals(5, docs.totalHits.value); + assertEquals(5, docs.totalHits.value()); // Making sure that a query of both facet "a" and facet "b" yields 5 results DrillDownQuery q3 = new DrillDownQuery(config); @@ -151,14 +151,14 @@ public void testQuery() throws IOException { q3.add("b"); docs = searcher.search(q3, 100); - assertEquals(5, docs.totalHits.value); + assertEquals(5, docs.totalHits.value()); // Check that content:foo (which yields 50% results) and facet/b (which yields 20%) // would gather together 10 results (10%..) Query fooQuery = new TermQuery(new Term("content", "foo")); DrillDownQuery q4 = new DrillDownQuery(config, fooQuery); q4.add("b"); docs = searcher.search(q4, 100); - assertEquals(10, docs.totalHits.value); + assertEquals(10, docs.totalHits.value()); } public void testQueryImplicitDefaultParams() throws IOException { @@ -173,7 +173,7 @@ public void testQueryImplicitDefaultParams() throws IOException { DrillDownQuery q2 = new DrillDownQuery(config, q); q2.add("b"); TopDocs docs = searcher.search(q2, 100); - assertEquals(5, docs.totalHits.value); + assertEquals(5, docs.totalHits.value()); // Check that content:foo (which yields 50% results) and facet/b (which yields 20%) // would gather together 10 results (10%..) 
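For context on the pattern these test hunks migrate to, here is a minimal hedged sketch (not part of the patch): it assumes an IndexSearcher searcher, a Query query, a TaxonomyReader taxoReader, and a FacetsConfig config already exist, and the "Author" dimension is purely illustrative; only the FacetsCollectorManager.search(...).facetsCollector() call and the TotalHits.value() accessor are taken from the changes themselves.

import java.io.IOException;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollectorManager;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

class CollectorManagerSketch {
  // Hedged sketch: collect facets through FacetsCollectorManager instead of the
  // FacetsCollector.search(...) helper used previously. Inputs are assumed to be
  // set up elsewhere; "Author" is an illustrative dimension name.
  static void countAuthors(
      IndexSearcher searcher, Query query, TaxonomyReader taxoReader, FacetsConfig config)
      throws IOException {
    FacetsCollector fc =
        FacetsCollectorManager.search(searcher, query, 10, new FacetsCollectorManager())
            .facetsCollector();
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
    System.out.println(facets.getTopChildren(10, "Author"));

    // TotalHits is read through its accessor, matching the assertions updated above:
    TopDocs topDocs = searcher.search(query, 10);
    System.out.println("hits: " + topDocs.totalHits.value());
  }
}

The same accessor-style reads (totalHits(), context(), bits()) appear throughout the MatchingDocs call sites updated above, since MatchingDocs is now exposed as a record.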
@@ -181,7 +181,7 @@ public void testQueryImplicitDefaultParams() throws IOException { DrillDownQuery q4 = new DrillDownQuery(config, fooQuery); q4.add("b"); docs = searcher.search(q4, 100); - assertEquals(10, docs.totalHits.value); + assertEquals(10, docs.totalHits.value()); } public void testZeroLimit() throws IOException { @@ -189,8 +189,10 @@ public void testZeroLimit() throws IOException { DrillDownQuery q = new DrillDownQuery(config); q.add("b", "1"); int limit = 0; - FacetsCollector facetCollector = new FacetsCollector(); - FacetsCollector.search(searcher, q, limit, facetCollector); + + FacetsCollector facetCollector = + FacetsCollectorManager.search(searcher, q, limit, new FacetsCollectorManager()) + .facetsCollector(); Facets facets = getTaxonomyFacetCounts( taxo, config, facetCollector, config.getDimConfig("b").indexFieldName); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java index 5645f1915491..74205522451f 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java @@ -142,7 +142,7 @@ protected Facets buildFacetsResult( private IndexSearcher getNewSearcher(IndexReader reader) { // Do not wrap with an asserting searcher, since DrillSidewaysQuery doesn't // implement all the required components like Weight#scorer. - IndexSearcher searcher = newSearcher(reader, true, false, random().nextBoolean()); + IndexSearcher searcher = newSearcher(reader, true, false, Concurrency.INTER_SEGMENT); // DrillSideways requires the entire range of docs to be scored at once, so it doesn't support // timeouts whose implementation scores one window of doc IDs at a time. searcher.setTimeout(null); @@ -284,7 +284,6 @@ public void testCollectionTerminated() throws Exception { Weight dimWeight = searcher.createWeight(dimQ, ScoreMode.COMPLETE_NO_SCORES, 1f); Scorer dimScorer = dimWeight.scorer(ctx); - FacetsCollector baseFC = new FacetsCollector(); FacetsCollector dimFC = new FacetsCollector(); DrillSidewaysScorer.DocsAndCost docsAndCost = new DrillSidewaysScorer.DocsAndCost(dimScorer, dimFC.getLeafCollector(ctx)); @@ -311,17 +310,17 @@ public void collect(int doc) throws IOException { new DrillSidewaysScorer( ctx, baseScorer, - baseFC.getLeafCollector(ctx), new DrillSidewaysScorer.DocsAndCost[] {docsAndCost}, scoreSubDocsAtOnce); - expectThrows(CollectionTerminatedException.class, () -> scorer.score(baseCollector, null)); + expectThrows( + CollectionTerminatedException.class, + () -> scorer.score(baseCollector, null, 0, DocIdSetIterator.NO_MORE_DOCS)); // We've set things up so that our base collector with throw CollectionTerminatedException // after collecting the first doc. This means we'll only collect the first indexed doc for // both our base and sideways dim facets collectors. 
What we really want to test here is // that the matching docs are still correctly present and populated after an early // termination occurs (i.e., #finish is properly called in that scenario): - assertEquals(1, baseFC.getMatchingDocs().size()); assertEquals(1, dimFC.getMatchingDocs().size()); } } @@ -430,7 +429,7 @@ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) th DrillDownQuery ddq = new DrillDownQuery(config); ddq.add("Author", "Lisa"); DrillSidewaysResult r = ds.search(null, ddq, 10); - assertEquals(2, r.hits.totalHits.value); + assertEquals(2, r.hits.totalHits.value()); // Publish Date is only drill-down, and Lisa published // one in 2012 and one in 2010: assertEquals( @@ -450,7 +449,7 @@ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) th ddq.add("Author", "Lisa"); ddq.add("Author", "Bob"); r = ds.search(null, ddq, 10); - assertEquals(3, r.hits.totalHits.value); + assertEquals(3, r.hits.totalHits.value()); // Publish Date is only drill-down: Lisa and Bob // (drill-down) published twice in 2010 and once in 2012: assertEquals( @@ -495,7 +494,7 @@ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) th ddq.add("Author", "Lisa"); ddq.add("Publish Date", "2010"); r = ds.search(null, ddq, 10); - assertEquals(1, r.hits.totalHits.value); + assertEquals(1, r.hits.totalHits.value()); // Publish Date is drill-sideways + drill-down: Lisa // (drill-down) published once in 2010 and once in 2012: assertEquals( @@ -516,7 +515,7 @@ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) th ddq.add("Publish Date", "2010"); ddq.add("Author", "Bob"); r = ds.search(null, ddq, 10); - assertEquals(2, r.hits.totalHits.value); + assertEquals(2, r.hits.totalHits.value()); // Publish Date is both drill-sideways + drill-down: // Lisa or Bob published twice in 2010 and once in 2012: assertEquals( @@ -532,7 +531,7 @@ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) th ddq = new DrillDownQuery(config); ddq.add("Foobar", "Baz"); r = ds.search(null, ddq, 10); - assertEquals(0, r.hits.totalHits.value); + assertEquals(0, r.hits.totalHits.value()); assertNull(r.facets.getTopChildren(10, "Publish Date")); assertNull(r.facets.getTopChildren(10, "Foobar")); @@ -541,7 +540,7 @@ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) th ddq.add("Author", "Lisa"); ddq.add("Author", "Tom"); r = ds.search(null, ddq, 10); - assertEquals(2, r.hits.totalHits.value); + assertEquals(2, r.hits.totalHits.value()); // Publish Date is only drill-down, and Lisa published // one in 2012 and one in 2010: assertEquals( @@ -560,7 +559,7 @@ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) th ddq.add("Author", "Lisa"); ddq.add("Author", "Tom"); r = ds.search(null, ddq, 10); - assertEquals(2, r.hits.totalHits.value); + assertEquals(2, r.hits.totalHits.value()); // Publish Date is only drill-down, and Lisa published // one in 2012 and one in 2010: assertEquals( @@ -572,7 +571,7 @@ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) th ddq.add("Author", "Lisa"); r = ds.search(null, ddq, 10); - assertEquals(0, r.hits.totalHits.value); + assertEquals(0, r.hits.totalHits.value()); assertNull(r.facets.getTopChildren(10, "Publish Date")); assertNull(r.facets.getTopChildren(10, "Author")); @@ -903,7 +902,7 @@ public void testSometimesInvalidDrillDown() throws Exception { ddq.add("Author", "Lisa"); DrillSidewaysResult r = 
getNewDrillSideways(searcher, config, taxoReader).search(null, ddq, 10); - assertEquals(1, r.hits.totalHits.value); + assertEquals(1, r.hits.totalHits.value()); // Publish Date is only drill-down, and Lisa published // one in 2012 and one in 2010: assertEquals( @@ -969,7 +968,7 @@ public void testMultipleRequestsPerDim() throws Exception { ddq.add("dim", "a"); DrillSidewaysResult r = getNewDrillSideways(searcher, config, taxoReader).search(null, ddq, 10); - assertEquals(3, r.hits.totalHits.value); + assertEquals(3, r.hits.totalHits.value()); assertEquals( "dim=dim path=[] value=6 childCount=4\n a (3)\n b (1)\n c (1)\n d (1)\n", r.facets.getTopChildren(10, "dim").toString()); @@ -1433,7 +1432,7 @@ public int hashCode() { q = new BooleanQuery.Builder().add(q, Occur.MUST).add(filter, Occur.FILTER).build(); } TopDocs ddqHits = s.search(q, numDocs); - assertEquals(expected.hits.size(), ddqHits.totalHits.value); + assertEquals(expected.hits.size(), ddqHits.totalHits.value()); for (int i = 0; i < expected.hits.size(); i++) { // Score should be IDENTICAL: assertEquals(scores.get(expected.hits.get(i).id), ddqHits.scoreDocs[i].score, 0.0f); @@ -1488,7 +1487,8 @@ protected FacetsCollectorManager createDrillSidewaysFacetsCollectorManager() { // context, which happens as part of #finish getting called: assertEquals(1, result.drillDownFacetsCollector.getMatchingDocs().size()); assertEquals( - 1, result.drillDownFacetsCollector.getMatchingDocs().get(0).context.reader().maxDoc()); + 1, + result.drillDownFacetsCollector.getMatchingDocs().get(0).context().reader().maxDoc()); assertEquals(1, result.drillSidewaysFacetsCollector.length); assertEquals(1, result.drillSidewaysFacetsCollector[0].getMatchingDocs().size()); assertEquals( @@ -1497,7 +1497,7 @@ protected FacetsCollectorManager createDrillSidewaysFacetsCollectorManager() { .drillSidewaysFacetsCollector[0] .getMatchingDocs() .get(0) - .context + .context() .reader() .maxDoc()); } @@ -1563,15 +1563,7 @@ private static class DSResults { Facets facets; } - private static class CollectedResult { - final DocAndScore docAndScore; - final String id; - - CollectedResult(DocAndScore docAndScore, String id) { - this.docAndScore = docAndScore; - this.id = id; - } - } + private record CollectedResult(DocAndScore docAndScore, String id) {} private abstract static class SimpleLeafCollector implements LeafCollector { protected Scorable scorer; @@ -1667,10 +1659,11 @@ private static final class DrillSidewaysAssertingIndexSearcher extends IndexSear } @Override - protected void search(List leaves, Weight weight, Collector collector) + protected void search( + LeafReaderContextPartition[] partitions, Weight weight, Collector collector) throws IOException { AssertingCollector assertingCollector = AssertingCollector.wrap(collector); - super.search(leaves, weight, assertingCollector); + super.search(partitions, weight, assertingCollector); assert assertingCollector.hasFinishedCollectingPreviousLeaf; } } @@ -1871,7 +1864,7 @@ void verifyEquals( if (VERBOSE) { System.out.println(" verify totHits=" + expected.hits.size()); } - assertEquals(expected.hits.size(), actual.resultCount.value); + assertEquals(expected.hits.size(), actual.resultCount.value()); assertEquals(expected.hits.size(), actual.results.size()); for (int i = 0; i < expected.hits.size(); i++) { if (VERBOSE) { @@ -1999,7 +1992,7 @@ public void testEmptyIndex() throws Exception { ddq.add("Author", "Lisa"); DrillSidewaysResult r = ds.search(ddq, 10); // this used to fail on IllegalArgEx - assertEquals(0, 
r.hits.totalHits.value); + assertEquals(0, r.hits.totalHits.value()); r = ds.search( @@ -2009,7 +2002,7 @@ public void testEmptyIndex() throws Exception { 10, new Sort(new SortField("foo", SortField.Type.INT)), false); // this used to fail on IllegalArgEx - assertEquals(0, r.hits.totalHits.value); + assertEquals(0, r.hits.totalHits.value()); writer.close(); IOUtils.close(taxoWriter, searcher.getIndexReader(), taxoReader, dir, taxoDir); @@ -2083,7 +2076,7 @@ protected FacetsCollectorManager createDrillDownFacetsCollectorManager() { ddq.add("Author", "Bob"); DrillSidewaysResult r = ds.search(null, ddq, 10); Facets facets = r.facets; - assertEquals(3, r.hits.totalHits.value); + assertEquals(3, r.hits.totalHits.value()); assertEquals( "dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n", facets.getTopChildren(10, "Author").toString()); @@ -2112,7 +2105,7 @@ protected FacetsCollectorManager createDrillDownFacetsCollectorManager() { ddq.add("Author", "Lisa"); ddq.add("Publish Date", "2010"); r = ds.search(null, ddq, 10); - assertEquals(1, r.hits.totalHits.value); + assertEquals(1, r.hits.totalHits.value()); // Should be able to count on both fields since they're both drill sideways cases assertEquals( "dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n", @@ -2164,7 +2157,7 @@ public void testScorer() throws Exception { ddq.add("author", bq.build()); ddq.add("dim", bq.build()); DrillSidewaysResult r = ds.search(null, ddq, 10); - assertEquals(0, r.hits.totalHits.value); + assertEquals(0, r.hits.totalHits.value()); writer.close(); IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir); @@ -2278,7 +2271,7 @@ public void testDrillSidewaysSearchUseCorrectIterator() throws Exception { drillDownQuery.add("dim1", "dim1"); var result = drill.search(drillDownQuery, 99); // We expect to match exactly one document from the query above - assertEquals(1, result.hits.totalHits.value); + assertEquals(1, result.hits.totalHits.value()); indexReader.close(); taxonomyReader.close(); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestFacetQuery.java b/lucene/facet/src/test/org/apache/lucene/facet/TestFacetQuery.java index da012e3f0a8c..16ecfd09719f 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestFacetQuery.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestFacetQuery.java @@ -95,7 +95,7 @@ public static void closeTestIndex() throws IOException { @Test public void testSingleValued() throws Exception { TopDocs topDocs = searcher.search(new FacetQuery("Author", "Mark Twain"), 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); } @Test @@ -105,6 +105,6 @@ public void testMultiValued() throws Exception { new MultiFacetQuery( "Author", new String[] {"Mark Twain"}, new String[] {"Kurt Vonnegut"}), 10); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestFacetUtils.java b/lucene/facet/src/test/org/apache/lucene/facet/TestFacetUtils.java index 8c78d9b01fd7..0f018ea9a8f9 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestFacetUtils.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestFacetUtils.java @@ -79,8 +79,8 @@ public void testBasic() throws IOException { DocIdSetIterator disi; for (FacetsCollector.MatchingDocs m : fc.getMatchingDocs()) { - NumericDocValues numericDV = DocValues.getNumeric(m.context.reader(), "val"); - Bits liveDocs = 
m.context.reader().getLiveDocs(); + NumericDocValues numericDV = DocValues.getNumeric(m.context().reader(), "val"); + Bits liveDocs = m.context().reader().getLiveDocs(); // Only use the liveDocsDISI if liveDocs is not null disi = (liveDocs == null) ? numericDV : FacetUtils.liveDocsDISI(numericDV, liveDocs); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestMultipleIndexFields.java b/lucene/facet/src/test/org/apache/lucene/facet/TestMultipleIndexFields.java index 706c09a98370..9ba77a871538 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestMultipleIndexFields.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestMultipleIndexFields.java @@ -331,9 +331,9 @@ private void assertCorrectResults(Facets facets) throws IOException { } private FacetsCollector performSearch(IndexSearcher searcher) throws IOException { - FacetsCollector fc = new FacetsCollector(); - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); - return fc; + return FacetsCollectorManager.search( + searcher, new MatchAllDocsQuery(), 10, new FacetsCollectorManager()) + .facetsCollector(); } private void seedIndex(TaxonomyWriter tw, RandomIndexWriter iw, FacetsConfig config) diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java b/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java index 9eb45ba2c3ac..93b7963ff5eb 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java @@ -85,7 +85,7 @@ public void testRandomSampling() throws Exception { // There should be no results at all for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) { - assertEquals(0, doc.totalHits); + assertEquals(0, doc.totalHits()); } // Now start searching and retrieve results. @@ -106,17 +106,17 @@ public void testRandomSampling() throws Exception { // System.out.println("numSegments=" + numSampledDocs.length); for (int i = 0; i < numSampledDocs.length; i++) { MatchingDocs md = matchingDocs.get(i); - final DocIdSetIterator iter = md.bits.iterator(); + final DocIdSetIterator iter = md.bits().iterator(); while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) ++numSampledDocs[i]; totalSampledDocs += numSampledDocs[i]; - totalHits += md.totalHits; + totalHits += md.totalHits(); } // compute the chi-square value for the sampled documents' distribution float chi_square = 0; for (int i = 0; i < numSampledDocs.length; i++) { MatchingDocs md = matchingDocs.get(i); - float ei = (float) md.totalHits / totalHits; + float ei = (float) md.totalHits() / totalHits; if (ei > 0.0f) { float oi = (float) numSampledDocs[i] / totalSampledDocs; chi_square += (float) (Math.pow(ei - oi, 2) / ei); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/range/TestDynamicRangeUtil.java b/lucene/facet/src/test/org/apache/lucene/facet/range/TestDynamicRangeUtil.java new file mode 100644 index 000000000000..db78b03e6e3a --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/range/TestDynamicRangeUtil.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet.range; + +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestDynamicRangeUtil extends LuceneTestCase { + public void testComputeDynamicNumericRangesBasic() { + List<DynamicRangeUtil.DynamicRangeInfo> expectedRangeInfoList = new ArrayList<>(); + long[] values = new long[1000]; + long[] weights = new long[1000]; + + long totalWeight = 0; + for (int i = 0; i < 1000; i++) { + values[i] = i + 1; + weights[i] = i; + totalWeight += i; + } + + expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(501, 125250L, 1L, 501L, 251D)); + expectedRangeInfoList.add( + new DynamicRangeUtil.DynamicRangeInfo(207, 125028L, 502L, 708L, 605D)); + expectedRangeInfoList.add( + new DynamicRangeUtil.DynamicRangeInfo(159, 125133L, 709L, 867L, 788D)); + expectedRangeInfoList.add( + new DynamicRangeUtil.DynamicRangeInfo(133, 124089L, 868L, 1000L, 934D)); + assertDynamicNumericRangeResults(values, weights, 4, totalWeight, expectedRangeInfoList); + } + + public void testComputeDynamicNumericRangesWithSameValues() { + List<DynamicRangeUtil.DynamicRangeInfo> expectedRangeInfoList = new ArrayList<>(); + long totalWeight = 0; + long[] values = new long[100]; + long[] weights = new long[100]; + for (int i = 0; i < 100; i++) { + values[i] = 50; + weights[i] = i; + totalWeight += i; + } + + expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(51, 1275L, 50L, 50L, 50D)); + expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(21, 1281L, 50L, 50L, 50D)); + expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(16, 1272L, 50L, 50L, 50D)); + expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(12, 1122L, 50L, 50L, 50D)); + + assertDynamicNumericRangeResults(values, weights, 4, totalWeight, expectedRangeInfoList); + } + + public void testComputeDynamicNumericRangesWithOneValue() { + long[] values = new long[] {50}; + long[] weights = new long[] {1}; + List<DynamicRangeUtil.DynamicRangeInfo> expectedRangeInfoList = new ArrayList<>(); + + expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(1, 1L, 50L, 50L, 50D)); + assertDynamicNumericRangeResults(values, weights, 4, 1, expectedRangeInfoList); + } + + public void testComputeDynamicNumericRangesWithOneLargeWeight() { + List<DynamicRangeUtil.DynamicRangeInfo> expectedRangeInfoList = new ArrayList<>(); + long[] values = new long[] {45, 32, 52, 14, 455, 342, 53}; + long[] weights = new long[] {143, 23, 1, 52343, 53, 12, 2534}; + + // value 14 has its own bin since its weight is large, and the rest of the values fall into the other bin + expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(1, 52343, 14L, 14L, 14D)); + expectedRangeInfoList.add( + new DynamicRangeUtil.DynamicRangeInfo(6, 2766, 32L, 455L, 163.16666666666666D)); + assertDynamicNumericRangeResults(values, weights, 4, 55109, expectedRangeInfoList); + } + + private static void assertDynamicNumericRangeResults( + long[] values, + long[] weights, + int topN, + long totalWeight, + List<DynamicRangeUtil.DynamicRangeInfo> expectedDynamicRangeResult) { + List<DynamicRangeUtil.DynamicRangeInfo> mockDynamicRangeResult = + DynamicRangeUtil.computeDynamicNumericRanges( + values, weights, values.length, totalWeight, topN); +
assertTrue(compareDynamicRangeResult(mockDynamicRangeResult, expectedDynamicRangeResult)); + } + + private static boolean compareDynamicRangeResult( + List<DynamicRangeUtil.DynamicRangeInfo> mockResult, + List<DynamicRangeUtil.DynamicRangeInfo> expectedResult) { + return mockResult.size() == expectedResult.size() && mockResult.containsAll(expectedResult); + } +} diff --git a/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeFacetCounts.java index d7cb507a9110..d5f57bd827c1 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeFacetCounts.java @@ -469,7 +469,7 @@ public void testMixedRangeAndNonRangeTaxonomy() throws Exception { final TaxonomyReader tr = new DirectoryTaxonomyReader(tw); - IndexSearcher s = newSearcher(r, false, false); + IndexSearcher s = newSearcher(r, false, false, Concurrency.INTER_SEGMENT); // DrillSideways requires the entire range of docs to be scored at once, so it doesn't support // timeouts whose implementation scores one window of doc IDs at a time. s.setTimeout(null); @@ -525,7 +525,7 @@ protected boolean scoreSubDocsAtOnce() { DrillDownQuery ddq = new DrillDownQuery(config); DrillSidewaysResult dsr = ds.search(null, ddq, 10); - assertEquals(100, dsr.hits.totalHits.value); + assertEquals(100, dsr.hits.totalHits.value()); assertEquals( "dim=dim path=[] value=100 childCount=2\n b (75)\n a (25)\n", dsr.facets.getTopChildren(10, "dim").toString()); @@ -538,7 +538,7 @@ protected boolean scoreSubDocsAtOnce() { ddq.add("dim", "b"); dsr = ds.search(null, ddq, 10); - assertEquals(75, dsr.hits.totalHits.value); + assertEquals(75, dsr.hits.totalHits.value()); assertEquals( "dim=dim path=[] value=100 childCount=2\n b (75)\n a (25)\n", dsr.facets.getTopChildren(10, "dim").toString()); @@ -551,7 +551,7 @@ protected boolean scoreSubDocsAtOnce() { ddq.add("field", LongPoint.newRangeQuery("field", 0L, 10L)); dsr = ds.search(null, ddq, 10); - assertEquals(11, dsr.hits.totalHits.value); + assertEquals(11, dsr.hits.totalHits.value()); assertEquals( "dim=dim path=[] value=11 childCount=2\n b (8)\n a (3)\n", dsr.facets.getTopChildren(10, "dim").toString()); @@ -1652,7 +1652,7 @@ public void testCustomDoubleValuesSource() throws Exception { IndexReader r = writer.getReader(); - IndexSearcher s = newSearcher(r, false, false); + IndexSearcher s = newSearcher(r, false, false, Concurrency.INTER_SEGMENT); // DrillSideways requires the entire range of docs to be scored at once, so it doesn't support // timeouts whose implementation scores one window of doc IDs at a time.
s.setTimeout(null); @@ -1709,7 +1709,7 @@ public void testCustomDoubleValuesSource() throws Exception { } // Test simple drill-down: - assertEquals(1, s.search(ddq, 10).totalHits.value); + assertEquals(1, s.search(ddq, 10).totalHits.value()); // Test drill-sideways after drill-down DrillSideways ds = @@ -1737,7 +1737,7 @@ protected boolean scoreSubDocsAtOnce() { }; DrillSidewaysResult dsr = ds.search(ddq, 10); - assertEquals(1, dsr.hits.totalHits.value); + assertEquals(1, dsr.hits.totalHits.value()); assertEquals( "dim=field path=[] value=3 childCount=6\n < 1 (0)\n < 2 (1)\n < 5 (3)\n < 10 (3)\n < 20 (3)\n < 50 (3)\n", dsr.facets.getAllChildren("field").toString()); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/rangeonrange/TestRangeOnRangeFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/rangeonrange/TestRangeOnRangeFacetCounts.java index 4698e9101681..9cdadc8ae300 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/rangeonrange/TestRangeOnRangeFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/rangeonrange/TestRangeOnRangeFacetCounts.java @@ -492,7 +492,7 @@ public void testSingleDimMixedRangeAndNonRangeTaxonomy() throws Exception { final TaxonomyReader tr = new DirectoryTaxonomyReader(tw); - IndexSearcher s = newSearcher(r, false, false); + IndexSearcher s = newSearcher(r, false, false, Concurrency.INTER_SEGMENT); // DrillSideways requires the entire range of docs to be scored at once, so it doesn't support // timeouts whose implementation scores one window of doc IDs at a time. s.setTimeout(null); @@ -549,7 +549,7 @@ protected boolean scoreSubDocsAtOnce() { DrillDownQuery ddq = new DrillDownQuery(config); DrillSideways.DrillSidewaysResult dsr = ds.search(null, ddq, 10); - assertEquals(100, dsr.hits.totalHits.value); + assertEquals(100, dsr.hits.totalHits.value()); assertEquals( "dim=dim path=[] value=100 childCount=2\n b (75)\n a (25)\n", dsr.facets.getTopChildren(10, "dim").toString()); @@ -562,7 +562,7 @@ protected boolean scoreSubDocsAtOnce() { ddq.add("dim", "b"); dsr = ds.search(null, ddq, 10); - assertEquals(75, dsr.hits.totalHits.value); + assertEquals(75, dsr.hits.totalHits.value()); assertEquals( "dim=dim path=[] value=100 childCount=2\n b (75)\n a (25)\n", dsr.facets.getTopChildren(10, "dim").toString()); @@ -605,7 +605,7 @@ public void testMultiDimMixedRangeAndNonRangeTaxonomy() throws Exception { final TaxonomyReader tr = new DirectoryTaxonomyReader(tw); - IndexSearcher s = newSearcher(r, false, false); + IndexSearcher s = newSearcher(r, false, false, Concurrency.INTER_SEGMENT); // DrillSideways requires the entire range of docs to be scored at once, so it doesn't support // timeouts whose implementation scores one window of doc IDs at a time. 
s.setTimeout(null); @@ -666,7 +666,7 @@ protected boolean scoreSubDocsAtOnce() { DrillDownQuery ddq = new DrillDownQuery(config); DrillSideways.DrillSidewaysResult dsr = ds.search(null, ddq, 10); - assertEquals(100, dsr.hits.totalHits.value); + assertEquals(100, dsr.hits.totalHits.value()); assertEquals( "dim=dim path=[] value=100 childCount=2\n b (75)\n a (25)\n", dsr.facets.getTopChildren(10, "dim").toString()); @@ -679,7 +679,7 @@ protected boolean scoreSubDocsAtOnce() { ddq.add("dim", "b"); dsr = ds.search(null, ddq, 10); - assertEquals(75, dsr.hits.totalHits.value); + assertEquals(75, dsr.hits.totalHits.value()); assertEquals( "dim=dim path=[] value=100 childCount=2\n b (75)\n a (25)\n", dsr.facets.getTopChildren(10, "dim").toString()); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java index 4a8a5379b949..dd86bd441595 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java @@ -163,7 +163,7 @@ public void testBasic() throws Exception { q.add("a", "foo"); q.add("b", "baz"); TopDocs hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); } finally { if (exec != null) exec.shutdownNow(); } @@ -410,17 +410,17 @@ public void testBasicHierarchical() throws Exception { q.add("a", "foo"); q.add("b", "baz"); TopDocs hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("c", "buzz", "bif"); hits = searcher.search(q, 2); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("c", "buzz", "biz", "bar"); hits = searcher.search(q, 2); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); } finally { if (exec != null) exec.shutdownNow(); } @@ -658,7 +658,7 @@ public void testBasicSingleValued() throws Exception { q.add("a", "foo"); q.add("b", "bar"); TopDocs hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); } finally { if (exec != null) exec.shutdownNow(); } @@ -710,12 +710,12 @@ public void testHierarchicalBasicSingleValues() throws Exception { DrillDownQuery q = new DrillDownQuery(config); q.add("c", "buzz"); TopDocs hits = searcher.search(q, 1); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("c", "buzz", "bar"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); } finally { if (exec != null) exec.shutdownNow(); } @@ -756,52 +756,52 @@ public void testDrillDownOptions() throws Exception { DrillDownQuery q = new DrillDownQuery(config); q.add("c"); TopDocs hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("c", "foo"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("d"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("d", "foo"); hits = searcher.search(q, 1); - 
assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("e"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("e", "foo"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("f"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("f", "foo"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("g"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("g", "foo"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); } } } @@ -844,102 +844,102 @@ public void testHierarchicalDrillDownOptions() throws Exception { DrillDownQuery q = new DrillDownQuery(config); q.add("c"); TopDocs hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("c", "biz"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("c", "biz", "baz"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("c", "foo"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("d"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("d", "foo"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("d", "biz"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("d", "biz", "baz"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("e"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("e", "foo"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("e", "biz"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("e", "biz", "baz"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("f"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("f", "foo"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("f", "biz"); hits = searcher.search(q, 1); 
- assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("f", "biz", "baz"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("g"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("g", "foo"); hits = searcher.search(q, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("g", "biz"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); q = new DrillDownQuery(config); q.add("g", "biz", "baz"); hits = searcher.search(q, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); } } } @@ -1362,9 +1362,13 @@ public void testRandom() throws Exception { if (VERBOSE) { System.out.println("\nTEST: iter content=" + searchToken); } - FacetsCollector fc = new FacetsCollector(); - FacetsCollector.search( - searcher, new TermQuery(new Term("content", searchToken)), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search( + searcher, + new TermQuery(new Term("content", searchToken)), + 10, + new FacetsCollectorManager()) + .facetsCollector(); Facets facets; if (exec != null) { facets = new ConcurrentSortedSetDocValuesFacetCounts(state, fc, exec); @@ -1503,9 +1507,13 @@ public void testRandomHierarchicalFlatMix() throws Exception { if (VERBOSE) { System.out.println("\nTEST: iter content=" + searchToken); } - FacetsCollector fc = new FacetsCollector(); - FacetsCollector.search( - searcher, new TermQuery(new Term("content", searchToken)), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search( + searcher, + new TermQuery(new Term("content", searchToken)), + 10, + new FacetsCollectorManager()) + .facetsCollector(); Facets facets; if (exec != null) { facets = new ConcurrentSortedSetDocValuesFacetCounts(state, fc, exec); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java index 47750b773b2b..d80027f52701 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java @@ -57,12 +57,8 @@ public class TestOrdinalData extends FacetTestCase { "Bob", 42L, "Lisa", 35L); - private static class OrdinalDataAppender implements BiConsumer { - private final Map scores; - - private OrdinalDataAppender(Map scores) { - this.scores = scores; - } + private record OrdinalDataAppender(Map scores) + implements BiConsumer { @Override public void accept(FacetLabel facetLabel, Document doc) { @@ -157,7 +153,7 @@ private void validateSearchResults(IndexSearcher searcher, Map q Query q = queryAndCount.getKey(); int count = queryAndCount.getValue(); TopDocs td = searcher.search(q, Integer.MAX_VALUE); - assertEquals(count, td.totalHits.value); + assertEquals(count, td.totalHits.value()); } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalMappingLeafReader.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalMappingLeafReader.java index 27badb7b0e3f..3f04363015dd 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalMappingLeafReader.java +++ 
b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalMappingLeafReader.java @@ -24,6 +24,7 @@ import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; @@ -89,8 +90,10 @@ private void verifyResults(Directory indexDir, Directory taxoDir) throws IOExcep DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); IndexSearcher searcher = newSearcher(indexReader); - FacetsCollector collector = new FacetsCollector(); - FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, collector); + FacetsCollector collector = + FacetsCollectorManager.search( + searcher, new MatchAllDocsQuery(), 10, new FacetsCollectorManager()) + .facetsCollector(); // tag facets Facets tagFacets = new FastTaxonomyFacetCounts("$tags", taxoReader, facetConfig, collector); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestSearcherTaxonomyManager.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestSearcherTaxonomyManager.java index 10f56e63ad8a..3f319acdb780 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestSearcherTaxonomyManager.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestSearcherTaxonomyManager.java @@ -189,10 +189,10 @@ public void run() { try { // System.out.println("search maxOrd=" + pair.taxonomyReader.getSize()); FacetsCollector sfc = - pair.searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager()); - Facets facets = getTaxonomyFacetCounts(pair.taxonomyReader, config, sfc); + pair.searcher().search(new MatchAllDocsQuery(), new FacetsCollectorManager()); + Facets facets = getTaxonomyFacetCounts(pair.taxonomyReader(), config, sfc); FacetResult result = facets.getTopChildren(10, "field"); - if (pair.searcher.getIndexReader().numDocs() > 0) { + if (pair.searcher().getIndexReader().numDocs() > 0) { // System.out.println(pair.taxonomyReader.getSize()); assertTrue(result.childCount > 0); assertTrue(result.labelValues.length > 0); @@ -244,10 +244,10 @@ public void testDirectory() throws Exception { try { // System.out.println("search maxOrd=" + pair.taxonomyReader.getSize()); FacetsCollector sfc = - pair.searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager()); - Facets facets = getTaxonomyFacetCounts(pair.taxonomyReader, config, sfc); + pair.searcher().search(new MatchAllDocsQuery(), new FacetsCollectorManager()); + Facets facets = getTaxonomyFacetCounts(pair.taxonomyReader(), config, sfc); FacetResult result = facets.getTopChildren(10, "field"); - if (pair.searcher.getIndexReader().numDocs() > 0) { + if (pair.searcher().getIndexReader().numDocs() > 0) { // System.out.println(pair.taxonomyReader.getSize()); assertTrue(result.childCount > 0); assertTrue(result.labelValues.length > 0); @@ -309,7 +309,7 @@ public void testReplaceTaxonomyDirectory() throws Exception { SearcherTaxonomyManager mgr = new SearcherTaxonomyManager(indexDir, taxoDir, null); SearcherAndTaxonomy pair = mgr.acquire(); try { - assertEquals(1, pair.taxonomyReader.getSize()); + assertEquals(1, pair.taxonomyReader().getSize()); } finally { mgr.release(pair); } @@ -323,7 +323,7 @@ public void testReplaceTaxonomyDirectory() throws Exception { mgr.maybeRefresh(); pair = mgr.acquire(); try { - assertEquals(3, 
pair.taxonomyReader.getSize()); + assertEquals(3, pair.taxonomyReader().getSize()); } finally { mgr.release(pair); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java index c452a01292e9..7bf7e980d5f4 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java @@ -242,10 +242,9 @@ public void testIntSumAssociation() throws Exception { public void testIntAssociationRandom() throws Exception { - FacetsCollector fc = new FacetsCollector(); - IndexSearcher searcher = newSearcher(reader); - searcher.search(new TermQuery(new Term("match", "yes")), fc); + FacetsCollector fc = + searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager()); Map expected; Facets facets; @@ -331,11 +330,11 @@ public void testFloatSumAssociation() throws Exception { } public void testFloatAssociationRandom() throws Exception { - - FacetsCollector fc = new FacetsCollector(); - - IndexSearcher searcher = newSearcher(reader); - searcher.search(new TermQuery(new Term("match", "yes")), fc); + // disabling search concurrency because validateFloats relies on ordering which requires + // sequential execution + IndexSearcher searcher = newSearcher(reader, true, true, false); + FacetsCollector fc = + searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager()); Map expected; Facets facets; @@ -619,7 +618,7 @@ private void validateFloats( float value = e.getValue(); // We can expect the floats to be exactly equal here since we're ensuring that we sum them // in the same order when determining expected values and when computing facets. See - // LUCENE-10530: + // LUCENE-10530. This though requires sequential execution. 
assertEquals(value, facets.getSpecificValue(dim, e.getKey()).floatValue(), 0f); aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); } @@ -654,7 +653,7 @@ private void assertFloatFacetResultsEqual(List expected, List prepareDocuments() { } private List allDocIds(MatchingDocs m, boolean decreasingDocIds) throws IOException { - DocIdSetIterator disi = m.bits.iterator(); + DocIdSetIterator disi = m.bits().iterator(); List docIds = new ArrayList<>(); while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { docIds.add(disi.docID()); @@ -106,7 +106,7 @@ private List lookupFacetLabels( for (MatchingDocs m : matchingDocs) { TaxonomyFacetLabels.FacetLabelReader facetLabelReader = - taxoLabels.getFacetLabelReader(m.context); + taxoLabels.getFacetLabelReader(m.context()); List docIds = allDocIds(m, decreasingDocIds); FacetLabel facetLabel; for (Integer docId : docIds) { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java index 67113ea38d8f..3cea228fc849 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java @@ -309,16 +309,19 @@ public void testScoreAggregator() throws Exception { DirectoryReader r = DirectoryReader.open(iw); DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); - FacetsCollector fc = new FacetsCollector(true); BoostQuery csq = new BoostQuery(new ConstantScoreQuery(new MatchAllDocsQuery()), 2f); - TopDocs td = FacetsCollector.search(newSearcher(r), csq, 10, fc); + FacetsCollectorManager.FacetsResult facetsResult = + FacetsCollectorManager.search(newSearcher(r), csq, 10, new FacetsCollectorManager(true)); + TopDocs td = facetsResult.topDocs(); + FacetsCollector fc = facetsResult.facetsCollector(); + assertTrue(fc.getKeepScores()); // Test SUM: Facets facets = new TaxonomyFacetFloatAssociations( taxoReader, config, fc, AssociationAggregationFunction.SUM, DoubleValuesSource.SCORES); - int expected = (int) (csq.getBoost() * td.totalHits.value); + int expected = (int) (csq.getBoost() * td.totalHits.value()); assertEquals(expected, facets.getSpecificValue("dim", "a").intValue()); // Test MAX: @@ -403,11 +406,13 @@ public void testWithScore() throws Exception { DirectoryReader r = DirectoryReader.open(iw); DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); - FacetsCollector fc = new FacetsCollector(true); // score documents by their 'price' field - makes asserting the correct counts for the // categories easier Query q = new FunctionQuery(new LongFieldSource("price")); - FacetsCollector.search(newSearcher(r), q, 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search(newSearcher(r), q, 10, new FacetsCollectorManager(true)) + .facetsCollector(); + assertTrue(fc.getKeepScores()); // Test SUM: Facets facets = @@ -540,8 +545,11 @@ public void testCountAndSumScore() throws Exception { DirectoryReader r = DirectoryReader.open(iw); DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); - FacetsCollector fc = new FacetsCollector(true); - FacetsCollector.search(newSearcher(r), new MatchAllDocsQuery(), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search( + newSearcher(r), new MatchAllDocsQuery(), 10, new FacetsCollectorManager(true)) + .facetsCollector(); + assertTrue(fc.getKeepScores()); Facets facets1 = 
getTaxonomyFacetCounts(taxoReader, config, fc); Facets facets2 = @@ -595,8 +603,13 @@ public void testRandom() throws Exception { if (VERBOSE) { System.out.println("\nTEST: iter content=" + searchToken); } - FacetsCollector fc = new FacetsCollector(); - FacetsCollector.search(searcher, new TermQuery(new Term("content", searchToken)), 10, fc); + FacetsCollector fc = + FacetsCollectorManager.search( + searcher, + new TermQuery(new Term("content", searchToken)), + 10, + new FacetsCollectorManager()) + .facetsCollector(); checkResults( numDims, diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java index aedd07647511..64de01fa0c6f 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java @@ -86,7 +86,7 @@ private void testAlwaysRefreshDirectoryTaxonomyReader( * the call flow here initializes {@link DirectoryTaxonomyReader#taxoArrays}. These reused * `taxoArrays` form the basis of the inconsistency * */ - getTaxonomyFacetCounts(pair.taxonomyReader, config, sfc); + getTaxonomyFacetCounts(pair.taxonomyReader(), config, sfc); // now try to go back to checkpoint 1 and refresh the SearcherTaxonomyManager @@ -112,8 +112,8 @@ private void testAlwaysRefreshDirectoryTaxonomyReader( } else { mgr.maybeRefresh(); pair = mgr.acquire(); - assertEquals(new FacetLabel("a"), pair.taxonomyReader.getPath(1)); - assertEquals(-1, pair.taxonomyReader.getOrdinal(new FacetLabel("b"))); + assertEquals(new FacetLabel("a"), pair.taxonomyReader().getPath(1)); + assertEquals(-1, pair.taxonomyReader().getOrdinal(new FacetLabel("b"))); } mgr.release(pair); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java index 7350f0bffdb0..e8b2f9076b6f 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java @@ -540,7 +540,7 @@ public void testHugeLabel() throws Exception { IndexSearcher searcher = new IndexSearcher(indexReader); DrillDownQuery ddq = new DrillDownQuery(new FacetsConfig()); ddq.add("dim", bigs); - assertEquals(1, searcher.search(ddq, 10).totalHits.value); + assertEquals(1, searcher.search(ddq, 10).totalHits.value()); IOUtils.close(indexReader, taxoReader, indexDir, taxoDir); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCharBlockArray.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCharBlockArray.java index f918ed5e1503..3c3ae4938a00 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCharBlockArray.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCharBlockArray.java @@ -16,14 +16,10 @@ */ package org.apache.lucene.facet.taxonomy.writercache; -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; import java.nio.ByteBuffer; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import 
java.nio.file.Path; import org.apache.lucene.facet.FacetTestCase; public class TestCharBlockArray extends FacetTestCase { @@ -89,19 +85,6 @@ public void testArray() throws Exception { } assertEqualsInternal("GrowingCharArray<->StringBuilder mismatch.", builder, array); - - Path tempDir = createTempDir("growingchararray"); - Path f = tempDir.resolve("GrowingCharArrayTest.tmp"); - BufferedOutputStream out = new BufferedOutputStream(Files.newOutputStream(f)); - array.flush(out); - out.flush(); - out.close(); - - BufferedInputStream in = new BufferedInputStream(Files.newInputStream(f)); - array = CharBlockArray.open(in); - assertEqualsInternal( - "GrowingCharArray<->StringBuilder mismatch after flush/load.", builder, array); - in.close(); } private static void assertEqualsInternal( diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java index 92c420c58372..779b62291a69 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java @@ -295,13 +295,12 @@ public TopGroups getTopGroups( "cannot sort by relevance within group: needsScores=false"); } collector = - new TopScoreDocCollectorManager(maxDocsPerGroup, null, Integer.MAX_VALUE, false) + new TopScoreDocCollectorManager(maxDocsPerGroup, null, Integer.MAX_VALUE) .newCollector(); } else { // Sort by fields collector = - new TopFieldCollectorManager( - withinGroupSort, maxDocsPerGroup, null, Integer.MAX_VALUE, false) + new TopFieldCollectorManager(withinGroupSort, maxDocsPerGroup, null, Integer.MAX_VALUE) .newCollector(); // TODO: disable exact counts? } diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupDocs.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupDocs.java index 9c0eccf9512b..ea0dbab85297 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupDocs.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupDocs.java @@ -22,45 +22,20 @@ /** * Represents one group in the results. * + * @param groupValue The groupField value for all docs in this group; this may be null if hits did + * not have the groupField. + * @param maxScore Max score in this group + * @param score Overall aggregated score of this group (currently only set by join queries). + * @param scoreDocs Hits; this may be {@link org.apache.lucene.search.FieldDoc} instances if the + * withinGroupSort sorted by fields. + * @param totalHits Total hits within this group + * @param groupSortValues Matches the groupSort passed to {@link FirstPassGroupingCollector}. * @lucene.experimental */ -public class GroupDocs { - /** - * The groupField value for all docs in this group; this may be null if hits did not have the - * groupField. - */ - public final T groupValue; - - /** Max score in this group */ - public final float maxScore; - - /** Overall aggregated score of this group (currently only set by join queries). */ - public final float score; - - /** - * Hits; this may be {@link org.apache.lucene.search.FieldDoc} instances if the withinGroupSort - * sorted by fields. - */ - public final ScoreDoc[] scoreDocs; - - /** Total hits within this group */ - public final TotalHits totalHits; - - /** Matches the groupSort passed to {@link FirstPassGroupingCollector}. 
*/ - public final Object[] groupSortValues; - - public GroupDocs( - float score, - float maxScore, - TotalHits totalHits, - ScoreDoc[] scoreDocs, - T groupValue, - Object[] groupSortValues) { - this.score = score; - this.maxScore = maxScore; - this.totalHits = totalHits; - this.scoreDocs = scoreDocs; - this.groupValue = groupValue; - this.groupSortValues = groupSortValues; - } -} +public record GroupDocs<T>( + float score, + float maxScore, + TotalHits totalHits, + ScoreDoc[] scoreDocs, + T groupValue, + Object[] groupSortValues) {} diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupFacetCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupFacetCollector.java index 4e56a12c9024..e598ebdcd12a 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupFacetCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupFacetCollector.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Comparator; import java.util.LinkedList; import java.util.List; import java.util.NavigableSet; @@ -126,28 +125,6 @@ public ScoreMode scoreMode() { */ public static class GroupedFacetResult { - private static final Comparator<FacetEntry> orderByCountAndValue = - new Comparator<FacetEntry>() { - - @Override - public int compare(FacetEntry a, FacetEntry b) { - int cmp = b.count - a.count; // Highest count first! - if (cmp != 0) { - return cmp; - } - return a.value.compareTo(b.value); - } - }; - - private static final Comparator<FacetEntry> orderByValue = - new Comparator<FacetEntry>() { - - @Override - public int compare(FacetEntry a, FacetEntry b) { - return a.value.compareTo(b.value); - } - }; - private final int maxSize; private final NavigableSet<FacetEntry> facetEntries; private final int totalMissingCount; @@ -157,7 +134,17 @@ public int compare(FacetEntry a, FacetEntry b) { public GroupedFacetResult( int size, int minCount, boolean orderByCount, int totalCount, int totalMissingCount) { - this.facetEntries = new TreeSet<>(orderByCount ? orderByCountAndValue : orderByValue); + this.facetEntries = + new TreeSet<>( + orderByCount + ? (a, b) -> { + int cmp = b.count - a.count; // Highest count first! + if (cmp != 0) { + return cmp; + } + return a.value.compareTo(b.value); + } + : (a, b) -> a.value.compareTo(b.value)); this.totalMissingCount = totalMissingCount; this.totalCount = totalCount; maxSize = size; @@ -229,54 +216,12 @@ public int getTotalMissingCount() { } /** Represents a facet entry with a value and a count. */ - public static class FacetEntry { - - private final BytesRef value; - private final int count; - - public FacetEntry(BytesRef value, int count) { - this.value = value; - this.count = count; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - FacetEntry that = (FacetEntry) o; - - if (count != that.count) return false; - if (!value.equals(that.value)) return false; - - return true; - } - - @Override - public int hashCode() { - int result = value.hashCode(); - result = 31 * result + count; - return result; - } + public record FacetEntry(BytesRef value, int count) { @Override public String toString() { return "FacetEntry{" + "value=" + value.utf8ToString() + ", count=" + count + '}'; } - - /** - * @return The value of this facet entry - */ - public BytesRef getValue() { - return value; - } - - /** - * @return The count (number of groups) of this facet entry.
- */ - public int getCount() { - return count; - } } /** diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/TermGroupFacetCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/TermGroupFacetCollector.java index e49e517faa88..33052db43905 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/TermGroupFacetCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/TermGroupFacetCollector.java @@ -431,14 +431,5 @@ protected void nextTerm() throws IOException { } } - private static class GroupedFacetHit { - - final BytesRef groupValue; - final BytesRef facetValue; - - GroupedFacetHit(BytesRef groupValue, BytesRef facetValue) { - this.groupValue = groupValue; - this.facetValue = facetValue; - } - } + private record GroupedFacetHit(BytesRef groupValue, BytesRef facetValue) {} } diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java index 3ceb388ca80b..0a1c607ab78b 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java @@ -161,7 +161,7 @@ public static TopGroups merge( float totalMaxScore = Float.NaN; for (int groupIDX = 0; groupIDX < numGroups; groupIDX++) { - final T groupValue = shardGroups[0].groups[groupIDX].groupValue; + final T groupValue = shardGroups[0].groups[groupIDX].groupValue(); // System.out.println(" merge groupValue=" + groupValue + " sortValues=" + // Arrays.toString(shardGroups[0].groups[groupIDX].groupSortValues)); float maxScore = Float.NaN; @@ -172,11 +172,11 @@ public static TopGroups merge( final TopGroups shard = shardGroups[shardIDX]; final GroupDocs shardGroupDocs = shard.groups[groupIDX]; if (groupValue == null) { - if (shardGroupDocs.groupValue != null) { + if (shardGroupDocs.groupValue() != null) { throw new IllegalArgumentException( "group values differ across shards; you must pass same top groups to all shards' second-pass collector"); } - } else if (!groupValue.equals(shardGroupDocs.groupValue)) { + } else if (!groupValue.equals(shardGroupDocs.groupValue())) { throw new IllegalArgumentException( "group values differ across shards; you must pass same top groups to all shards' second-pass collector"); } @@ -188,21 +188,22 @@ public static TopGroups merge( */ if (docSort.equals(Sort.RELEVANCE)) { - shardTopDocs[shardIDX] = new TopDocs(shardGroupDocs.totalHits, shardGroupDocs.scoreDocs); + shardTopDocs[shardIDX] = + new TopDocs(shardGroupDocs.totalHits(), shardGroupDocs.scoreDocs()); } else { shardTopDocs[shardIDX] = new TopFieldDocs( - shardGroupDocs.totalHits, shardGroupDocs.scoreDocs, docSort.getSort()); + shardGroupDocs.totalHits(), shardGroupDocs.scoreDocs(), docSort.getSort()); } for (int i = 0; i < shardTopDocs[shardIDX].scoreDocs.length; i++) { shardTopDocs[shardIDX].scoreDocs[i].shardIndex = shardIDX; } - maxScore = nonNANmax(maxScore, shardGroupDocs.maxScore); - assert shardGroupDocs.totalHits.relation == Relation.EQUAL_TO; - totalHits += shardGroupDocs.totalHits.value; - scoreSum += shardGroupDocs.score; + maxScore = nonNANmax(maxScore, shardGroupDocs.maxScore()); + assert shardGroupDocs.totalHits().relation() == Relation.EQUAL_TO; + totalHits += shardGroupDocs.totalHits().value(); + scoreSum += shardGroupDocs.score(); } final TopDocs mergedTopDocs; @@ -255,7 +256,7 @@ public static TopGroups merge( new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), 
mergedScoreDocs, groupValue, - shardGroups[0].groups[groupIDX].groupSortValues); + shardGroups[0].groups[groupIDX].groupSortValues()); totalMaxScore = nonNANmax(totalMaxScore, maxScore); } diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/TopGroupsCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/TopGroupsCollector.java index 00940be58ecc..6a54f7b59af0 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/TopGroupsCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/TopGroupsCollector.java @@ -129,7 +129,7 @@ private static class TopDocsReducer extends GroupReducer new TopDocsAndMaxScoreCollector( true, - new TopScoreDocCollectorManager(maxDocsPerGroup, null, Integer.MAX_VALUE, false) + new TopScoreDocCollectorManager(maxDocsPerGroup, null, Integer.MAX_VALUE) .newCollector(), null); } else { @@ -137,7 +137,7 @@ private static class TopDocsReducer extends GroupReducer { TopFieldCollector topDocsCollector = new TopFieldCollectorManager( - withinGroupSort, maxDocsPerGroup, null, Integer.MAX_VALUE, false) + withinGroupSort, maxDocsPerGroup, null, Integer.MAX_VALUE) .newCollector(); // TODO: disable exact counts? MaxScoreCollector maxScoreCollector = getMaxScores ? new MaxScoreCollector() : null; return new TopDocsAndMaxScoreCollector(false, topDocsCollector, maxScoreCollector); diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java index 5aa0c4b0bfe1..c64ffbf52f0b 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java @@ -67,10 +67,10 @@ public void testSortByRelevance() throws IOException { Query filtered = new BooleanQuery.Builder() .add(topLevel, BooleanClause.Occur.MUST) - .add(filterQuery(topGroups.groups[i].groupValue), BooleanClause.Occur.FILTER) + .add(filterQuery(topGroups.groups[i].groupValue()), BooleanClause.Occur.FILTER) .build(); TopDocs td = searcher.search(filtered, 10); - assertScoreDocsEquals(topGroups.groups[i].scoreDocs, td.scoreDocs); + assertScoreDocsEquals(topGroups.groups[i].scoreDocs(), td.scoreDocs); if (i == 0) { assertEquals(td.scoreDocs[0].doc, topDoc.scoreDocs[0].doc); assertEquals(td.scoreDocs[0].score, topDoc.scoreDocs[0].score, 0); @@ -105,10 +105,10 @@ public void testSortGroups() throws IOException { Query filtered = new BooleanQuery.Builder() .add(topLevel, BooleanClause.Occur.MUST) - .add(filterQuery(topGroups.groups[i].groupValue), BooleanClause.Occur.FILTER) + .add(filterQuery(topGroups.groups[i].groupValue()), BooleanClause.Occur.FILTER) .build(); TopDocs td = searcher.search(filtered, 10); - assertScoreDocsEquals(topGroups.groups[i].scoreDocs, td.scoreDocs); + assertScoreDocsEquals(topGroups.groups[i].scoreDocs(), td.scoreDocs); // The top group should have sort values equal to the sort values of the top doc of // a top-level search sorted by the same Sort; subsequent groups should have sort values // that compare lower than their predecessor. 
@@ -116,7 +116,7 @@ public void testSortGroups() throws IOException { assertSortsBefore(topGroups.groups[i - 1], topGroups.groups[i]); } else { assertArrayEquals( - ((FieldDoc) topDoc.scoreDocs[0]).fields, topGroups.groups[0].groupSortValues); + ((FieldDoc) topDoc.scoreDocs[0]).fields, topGroups.groups[0].groupSortValues()); } } @@ -148,19 +148,19 @@ public void testSortWithinGroups() throws IOException { // top score returned by a simple search with no grouping; subsequent groups should // all have equal or lower maxScores if (i == 0) { - assertEquals(topDoc.scoreDocs[0].score, topGroups.groups[0].maxScore, 0); + assertEquals(topDoc.scoreDocs[0].score, topGroups.groups[0].maxScore(), 0); } else { - assertTrue(topGroups.groups[i].maxScore <= topGroups.groups[i - 1].maxScore); + assertTrue(topGroups.groups[i].maxScore() <= topGroups.groups[i - 1].maxScore()); } // Groups themselves are ordered by a defined Sort, and each should give the same result as // the top-level query, filtered by the group value, with the same Sort Query filtered = new BooleanQuery.Builder() .add(topLevel, BooleanClause.Occur.MUST) - .add(filterQuery(topGroups.groups[i].groupValue), BooleanClause.Occur.FILTER) + .add(filterQuery(topGroups.groups[i].groupValue()), BooleanClause.Occur.FILTER) .build(); TopDocs td = searcher.search(filtered, 10, sort); - assertScoreDocsEquals(td.scoreDocs, topGroups.groups[i].scoreDocs); + assertScoreDocsEquals(td.scoreDocs, topGroups.groups[i].scoreDocs()); } shard.close(); @@ -350,10 +350,11 @@ public void testShardedGrouping() throws IOException { assertEquals(singletonTopGroups.totalGroupCount, mergedTopGroups.totalGroupCount); assertEquals(singletonTopGroups.groups.length, mergedTopGroups.groups.length); for (int i = 0; i < singletonTopGroups.groups.length; i++) { - assertEquals(singletonTopGroups.groups[i].groupValue, mergedTopGroups.groups[i].groupValue); assertEquals( - singletonTopGroups.groups[i].scoreDocs.length, - mergedTopGroups.groups[i].scoreDocs.length); + singletonTopGroups.groups[i].groupValue(), mergedTopGroups.groups[i].groupValue()); + assertEquals( + singletonTopGroups.groups[i].scoreDocs().length, + mergedTopGroups.groups[i].scoreDocs().length); } control.close(); @@ -379,8 +380,8 @@ private void indexRandomDocs(RandomIndexWriter w) throws IOException { } private void assertSortsBefore(GroupDocs first, GroupDocs second) { - Object[] groupSortValues = second.groupSortValues; - Object[] prevSortValues = first.groupSortValues; + Object[] groupSortValues = second.groupSortValues(); + Object[] prevSortValues = first.groupSortValues(); assertTrue(((BytesRef) prevSortValues[0]).compareTo((BytesRef) groupSortValues[0]) <= 0); if (prevSortValues[0].equals(groupSortValues[0])) { assertTrue((long) prevSortValues[1] <= (long) groupSortValues[1]); diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestBlockGrouping.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestBlockGrouping.java index 0ef446101458..18c02422f840 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestBlockGrouping.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestBlockGrouping.java @@ -57,11 +57,12 @@ public void testSimple() throws IOException { // We're sorting by score, so the score of the top group should be the same as the // score of the top document from the same query with no grouping TopDocs topDoc = searcher.search(topLevel, 1); - assertEquals(topDoc.scoreDocs[0].score, tg.groups[0].scoreDocs[0].score, 0); - 
assertEquals(topDoc.scoreDocs[0].doc, tg.groups[0].scoreDocs[0].doc); + assertEquals(topDoc.scoreDocs[0].score, tg.groups[0].scoreDocs()[0].score, 0); + assertEquals(topDoc.scoreDocs[0].doc, tg.groups[0].scoreDocs()[0].doc); for (int i = 0; i < tg.groups.length; i++) { - String bookName = searcher.storedFields().document(tg.groups[i].scoreDocs[0].doc).get("book"); + String bookName = + searcher.storedFields().document(tg.groups[i].scoreDocs()[0].doc).get("book"); // The contents of each group should be equal to the results of a search for // that group alone Query filtered = @@ -70,7 +71,7 @@ public void testSimple() throws IOException { .add(new TermQuery(new Term("book", bookName)), BooleanClause.Occur.FILTER) .build(); TopDocs td = searcher.search(filtered, 10); - assertScoreDocsEquals(td.scoreDocs, tg.groups[i].scoreDocs); + assertScoreDocsEquals(td.scoreDocs, tg.groups[i].scoreDocs()); } shard.close(); @@ -96,10 +97,11 @@ public void testTopLevelSort() throws IOException { // The sort value of the top doc in the top group should be the same as the sort value // of the top result from the same search done with no grouping TopDocs topDoc = searcher.search(topLevel, 1, sort); - assertEquals(((FieldDoc) topDoc.scoreDocs[0]).fields[0], tg.groups[0].groupSortValues[0]); + assertEquals(((FieldDoc) topDoc.scoreDocs[0]).fields[0], tg.groups[0].groupSortValues()[0]); for (int i = 0; i < tg.groups.length; i++) { - String bookName = searcher.storedFields().document(tg.groups[i].scoreDocs[0].doc).get("book"); + String bookName = + searcher.storedFields().document(tg.groups[i].scoreDocs()[0].doc).get("book"); // The contents of each group should be equal to the results of a search for // that group alone, sorted by score Query filtered = @@ -108,7 +110,7 @@ public void testTopLevelSort() throws IOException { .add(new TermQuery(new Term("book", bookName)), BooleanClause.Occur.FILTER) .build(); TopDocs td = searcher.search(filtered, 10); - assertScoreDocsEquals(td.scoreDocs, tg.groups[i].scoreDocs); + assertScoreDocsEquals(td.scoreDocs, tg.groups[i].scoreDocs()); if (i > 1) { assertSortsBefore(tg.groups[i - 1], tg.groups[i]); } @@ -137,10 +139,11 @@ public void testWithinGroupSort() throws IOException { // We're sorting by score, so the score of the top group should be the same as the // score of the top document from the same query with no grouping TopDocs topDoc = searcher.search(topLevel, 1); - assertEquals(topDoc.scoreDocs[0].score, (float) tg.groups[0].groupSortValues[0], 0); + assertEquals(topDoc.scoreDocs[0].score, (float) tg.groups[0].groupSortValues()[0], 0); for (int i = 0; i < tg.groups.length; i++) { - String bookName = searcher.storedFields().document(tg.groups[i].scoreDocs[0].doc).get("book"); + String bookName = + searcher.storedFields().document(tg.groups[i].scoreDocs()[0].doc).get("book"); // The contents of each group should be equal to the results of a search for // that group alone, sorted by length Query filtered = @@ -149,12 +152,12 @@ public void testWithinGroupSort() throws IOException { .add(new TermQuery(new Term("book", bookName)), BooleanClause.Occur.FILTER) .build(); TopDocs td = searcher.search(filtered, 10, sort); - assertFieldDocsEquals(td.scoreDocs, tg.groups[i].scoreDocs); + assertFieldDocsEquals(td.scoreDocs, tg.groups[i].scoreDocs()); // We're sorting by score, so the group sort value for each group should be a float, // and the value for the previous group should be higher or equal to the value for this one if (i > 0) { - float prevScore = (float) tg.groups[i - 
1].groupSortValues[0]; - float thisScore = (float) tg.groups[i].groupSortValues[0]; + float prevScore = (float) tg.groups[i - 1].groupSortValues()[0]; + float thisScore = (float) tg.groups[i].groupSortValues()[0]; assertTrue(prevScore >= thisScore); } } @@ -210,8 +213,8 @@ private static String randomText() { } private void assertSortsBefore(GroupDocs first, GroupDocs second) { - Object[] groupSortValues = second.groupSortValues; - Object[] prevSortValues = first.groupSortValues; + Object[] groupSortValues = second.groupSortValues(); + Object[] prevSortValues = first.groupSortValues(); assertTrue(((Long) prevSortValues[0]).compareTo((Long) groupSortValues[0]) <= 0); } diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestDistinctValuesCollector.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestDistinctValuesCollector.java index 012c09cb0879..3bf1a4024b4b 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestDistinctValuesCollector.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestDistinctValuesCollector.java @@ -485,24 +485,11 @@ private IndexContext createIndexContext() throws Exception { contentStrings.toArray(new String[contentStrings.size()])); } - private static class IndexContext { - - final Directory directory; - final DirectoryReader indexReader; - final Map>> searchTermToGroupCounts; - final String[] contentStrings; - - IndexContext( - Directory directory, - DirectoryReader indexReader, - Map>> searchTermToGroupCounts, - String[] contentStrings) { - this.directory = directory; - this.indexReader = indexReader; - this.searchTermToGroupCounts = searchTermToGroupCounts; - this.contentStrings = contentStrings; - } - } + private record IndexContext( + Directory directory, + DirectoryReader indexReader, + Map>> searchTermToGroupCounts, + String[] contentStrings) {} private static class NullComparator implements Comparator> { diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGroupFacetCollector.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGroupFacetCollector.java index 52e896bb6eca..96aa92c4a9bd 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGroupFacetCollector.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGroupFacetCollector.java @@ -123,15 +123,15 @@ public void testSimple() throws Exception { entries = airportResult.getFacetEntries(0, limit); assertEquals(2, entries.size()); - assertEquals("ams", entries.get(0).getValue().utf8ToString()); - assertEquals(2, entries.get(0).getCount()); - assertEquals("dus", entries.get(1).getValue().utf8ToString()); - assertEquals(1, entries.get(1).getCount()); + assertEquals("ams", entries.get(0).value().utf8ToString()); + assertEquals(2, entries.get(0).count()); + assertEquals("dus", entries.get(1).value().utf8ToString()); + assertEquals(1, entries.get(1).count()); entries = airportResult.getFacetEntries(1, limit); assertEquals(1, entries.size()); - assertEquals("dus", entries.get(0).getValue().utf8ToString()); - assertEquals(1, entries.get(0).getCount()); + assertEquals("dus", entries.get(0).value().utf8ToString()); + assertEquals(1, entries.get(0).count()); } GroupFacetCollector groupedDurationFacetCollector = @@ -145,10 +145,10 @@ public void testSimple() throws Exception { entries = durationResult.getFacetEntries(0, 10); assertEquals(2, entries.size()); - assertEquals("10", entries.get(0).getValue().utf8ToString()); - assertEquals(2, 
entries.get(0).getCount()); - assertEquals("5", entries.get(1).getValue().utf8ToString()); - assertEquals(2, entries.get(1).getCount()); + assertEquals("10", entries.get(0).value().utf8ToString()); + assertEquals(2, entries.get(0).count()); + assertEquals("5", entries.get(1).value().utf8ToString()); + assertEquals(2, entries.get(1).count()); // 5 doc = new Document(); @@ -193,17 +193,17 @@ public void testSimple() throws Exception { if (useDv) { assertEquals(6, airportResult.getTotalCount()); assertEquals(0, airportResult.getTotalMissingCount()); - assertEquals("bru", entries.get(0).getValue().utf8ToString()); - assertEquals(2, entries.get(0).getCount()); - assertEquals("", entries.get(1).getValue().utf8ToString()); - assertEquals(1, entries.get(1).getCount()); + assertEquals("bru", entries.get(0).value().utf8ToString()); + assertEquals(2, entries.get(0).count()); + assertEquals("", entries.get(1).value().utf8ToString()); + assertEquals(1, entries.get(1).count()); } else { assertEquals(5, airportResult.getTotalCount()); assertEquals(1, airportResult.getTotalMissingCount()); - assertEquals("bru", entries.get(0).getValue().utf8ToString()); - assertEquals(2, entries.get(0).getCount()); - assertEquals("dus", entries.get(1).getValue().utf8ToString()); - assertEquals(1, entries.get(1).getCount()); + assertEquals("bru", entries.get(0).value().utf8ToString()); + assertEquals(2, entries.get(0).count()); + assertEquals("dus", entries.get(1).value().utf8ToString()); + assertEquals(1, entries.get(1).count()); } groupedDurationFacetCollector = @@ -216,8 +216,8 @@ public void testSimple() throws Exception { entries = durationResult.getFacetEntries(1, 1); assertEquals(1, entries.size()); - assertEquals("5", entries.get(0).getValue().utf8ToString()); - assertEquals(2, entries.get(0).getCount()); + assertEquals("5", entries.get(0).value().utf8ToString()); + assertEquals(2, entries.get(0).count()); // 9 doc = new Document(); @@ -245,24 +245,24 @@ public void testSimple() throws Exception { assertEquals(8, airportResult.getTotalCount()); assertEquals(0, airportResult.getTotalMissingCount()); assertEquals(4, entries.size()); - assertEquals("", entries.get(0).getValue().utf8ToString()); - assertEquals(1, entries.get(0).getCount()); - assertEquals("ams", entries.get(1).getValue().utf8ToString()); - assertEquals(2, entries.get(1).getCount()); - assertEquals("bru", entries.get(2).getValue().utf8ToString()); - assertEquals(3, entries.get(2).getCount()); - assertEquals("dus", entries.get(3).getValue().utf8ToString()); - assertEquals(2, entries.get(3).getCount()); + assertEquals("", entries.get(0).value().utf8ToString()); + assertEquals(1, entries.get(0).count()); + assertEquals("ams", entries.get(1).value().utf8ToString()); + assertEquals(2, entries.get(1).count()); + assertEquals("bru", entries.get(2).value().utf8ToString()); + assertEquals(3, entries.get(2).count()); + assertEquals("dus", entries.get(3).value().utf8ToString()); + assertEquals(2, entries.get(3).count()); } else { assertEquals(7, airportResult.getTotalCount()); assertEquals(1, airportResult.getTotalMissingCount()); assertEquals(3, entries.size()); - assertEquals("ams", entries.get(0).getValue().utf8ToString()); - assertEquals(2, entries.get(0).getCount()); - assertEquals("bru", entries.get(1).getValue().utf8ToString()); - assertEquals(3, entries.get(1).getCount()); - assertEquals("dus", entries.get(2).getValue().utf8ToString()); - assertEquals(2, entries.get(2).getCount()); + assertEquals("ams", entries.get(0).value().utf8ToString()); + 
assertEquals(2, entries.get(0).count()); + assertEquals("bru", entries.get(1).value().utf8ToString()); + assertEquals(3, entries.get(1).count()); + assertEquals("dus", entries.get(2).value().utf8ToString()); + assertEquals(2, entries.get(2).count()); } groupedDurationFacetCollector = @@ -275,10 +275,10 @@ public void testSimple() throws Exception { entries = durationResult.getFacetEntries(0, 10); assertEquals(2, entries.size()); - assertEquals("10", entries.get(0).getValue().utf8ToString()); - assertEquals(3, entries.get(0).getCount()); - assertEquals("15", entries.get(1).getValue().utf8ToString()); - assertEquals(2, entries.get(1).getCount()); + assertEquals("10", entries.get(0).value().utf8ToString()); + assertEquals(3, entries.get(0).count()); + assertEquals("15", entries.get(1).value().utf8ToString()); + assertEquals(2, entries.get(1).count()); w.close(); indexSearcher.getIndexReader().close(); @@ -365,10 +365,10 @@ public void testMVGroupedFacetingWithDeletes() throws Exception { List entries = airportResult.getFacetEntries(0, 10); assertEquals(2, entries.size()); - assertEquals("ams", entries.get(0).getValue().utf8ToString()); - assertEquals(2, entries.get(0).getCount()); - assertEquals("dus", entries.get(1).getValue().utf8ToString()); - assertEquals(1, entries.get(1).getCount()); + assertEquals("ams", entries.get(0).value().utf8ToString()); + assertEquals(2, entries.get(0).count()); + assertEquals("dus", entries.get(1).value().utf8ToString()); + assertEquals(1, entries.get(1).count()); indexSearcher.getIndexReader().close(); dir.close(); @@ -428,7 +428,7 @@ public void testRandom() throws Exception { groupFacetCollector.mergeSegmentResults(size, minCount, orderByCount); List expectedFacetEntries = - expectedFacetResult.getFacetEntries(); + expectedFacetResult.facetEntries(); List actualFacetEntries = actualFacetResult.getFacetEntries(offset, limit); @@ -448,16 +448,16 @@ public void testRandom() throws Exception { System.out.println("Order by count: " + orderByCount); System.out.println("\n=== Expected: \n"); - System.out.println("Total count " + expectedFacetResult.getTotalCount()); - System.out.println("Total missing count " + expectedFacetResult.getTotalMissingCount()); + System.out.println("Total count " + expectedFacetResult.totalCount()); + System.out.println("Total missing count " + expectedFacetResult.totalMissingCount()); int counter = 0; for (TermGroupFacetCollector.FacetEntry expectedFacetEntry : expectedFacetEntries) { System.out.printf( Locale.ROOT, "%d. Expected facet value %s with count %d%n", counter++, - expectedFacetEntry.getValue().utf8ToString(), - expectedFacetEntry.getCount()); + expectedFacetEntry.value().utf8ToString(), + expectedFacetEntry.count()); } System.out.println("\n=== Actual: \n"); @@ -469,16 +469,16 @@ public void testRandom() throws Exception { Locale.ROOT, "%d. 
Actual facet value %s with count %d%n", counter++, - actualFacetEntry.getValue().utf8ToString(), - actualFacetEntry.getCount()); + actualFacetEntry.value().utf8ToString(), + actualFacetEntry.count()); } System.out.println( "\n==================================================================================="); } - assertEquals(expectedFacetResult.getTotalCount(), actualFacetResult.getTotalCount()); + assertEquals(expectedFacetResult.totalCount(), actualFacetResult.getTotalCount()); assertEquals( - expectedFacetResult.getTotalMissingCount(), actualFacetResult.getTotalMissingCount()); + expectedFacetResult.totalMissingCount(), actualFacetResult.getTotalMissingCount()); assertEquals(expectedFacetEntries.size(), actualFacetEntries.size()); for (int i = 0; i < expectedFacetEntries.size(); i++) { TermGroupFacetCollector.FacetEntry expectedFacetEntry = expectedFacetEntries.get(i); @@ -487,20 +487,15 @@ public void testRandom() throws Exception { "i=" + i + ": " - + expectedFacetEntry.getValue().utf8ToString() + + expectedFacetEntry.value().utf8ToString() + " != " - + actualFacetEntry.getValue().utf8ToString(), - expectedFacetEntry.getValue(), - actualFacetEntry.getValue()); + + actualFacetEntry.value().utf8ToString(), + expectedFacetEntry.value(), + actualFacetEntry.value()); assertEquals( - "i=" - + i - + ": " - + expectedFacetEntry.getCount() - + " != " - + actualFacetEntry.getCount(), - expectedFacetEntry.getCount(), - actualFacetEntry.getCount()); + "i=" + i + ": " + expectedFacetEntry.count() + " != " + actualFacetEntry.count(), + expectedFacetEntry.count(), + actualFacetEntry.count()); } } @@ -736,12 +731,12 @@ private GroupedFacetResult createExpectedFacetResult( entries.sort( (a, b) -> { if (orderByCount) { - int cmp = b.getCount() - a.getCount(); + int cmp = b.count() - a.count(); if (cmp != 0) { return cmp; } } - return a.getValue().compareTo(b.getValue()); + return a.value().compareTo(b.value()); }); int endOffset = offset + limit; @@ -805,31 +800,8 @@ public IndexContext( } } - private static class GroupedFacetResult { - - final int totalCount; - final int totalMissingCount; - final List facetEntries; - - private GroupedFacetResult( - int totalCount, - int totalMissingCount, - List facetEntries) { - this.totalCount = totalCount; - this.totalMissingCount = totalMissingCount; - this.facetEntries = facetEntries; - } - - public int getTotalCount() { - return totalCount; - } - - public int getTotalMissingCount() { - return totalMissingCount; - } - - public List getFacetEntries() { - return facetEntries; - } - } + private record GroupedFacetResult( + int totalCount, + int totalMissingCount, + List facetEntries) {} } diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java index 38493a01bb69..ff6ddc192d1c 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java @@ -46,6 +46,7 @@ import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource; import org.apache.lucene.search.CachingCollector; import org.apache.lucene.search.Collector; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiCollector; @@ -164,29 +165,29 @@ public void testBasic() throws Exception { // value GroupDocs group = groups.groups[0]; compareGroupValue("author3", 
group); - assertEquals(2, group.scoreDocs.length); - assertEquals(5, group.scoreDocs[0].doc); - assertEquals(4, group.scoreDocs[1].doc); - assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); + assertEquals(2, group.scoreDocs().length); + assertEquals(5, group.scoreDocs()[0].doc); + assertEquals(4, group.scoreDocs()[1].doc); + assertTrue(group.scoreDocs()[0].score > group.scoreDocs()[1].score); group = groups.groups[1]; compareGroupValue("author1", group); - assertEquals(3, group.scoreDocs.length); - assertEquals(0, group.scoreDocs[0].doc); - assertEquals(1, group.scoreDocs[1].doc); - assertEquals(2, group.scoreDocs[2].doc); - assertTrue(group.scoreDocs[0].score >= group.scoreDocs[1].score); - assertTrue(group.scoreDocs[1].score >= group.scoreDocs[2].score); + assertEquals(3, group.scoreDocs().length); + assertEquals(0, group.scoreDocs()[0].doc); + assertEquals(1, group.scoreDocs()[1].doc); + assertEquals(2, group.scoreDocs()[2].doc); + assertTrue(group.scoreDocs()[0].score >= group.scoreDocs()[1].score); + assertTrue(group.scoreDocs()[1].score >= group.scoreDocs()[2].score); group = groups.groups[2]; compareGroupValue("author2", group); - assertEquals(1, group.scoreDocs.length); - assertEquals(3, group.scoreDocs[0].doc); + assertEquals(1, group.scoreDocs().length); + assertEquals(3, group.scoreDocs()[0].doc); group = groups.groups[3]; compareGroupValue(null, group); - assertEquals(1, group.scoreDocs.length); - assertEquals(6, group.scoreDocs[0].doc); + assertEquals(1, group.scoreDocs().length); + assertEquals(6, group.scoreDocs()[0].doc); indexSearcher.getIndexReader().close(); dir.close(); @@ -292,22 +293,22 @@ private AllGroupsCollector createAllGroupsCollector( private void compareGroupValue(String expected, GroupDocs group) { if (expected == null) { - if (group.groupValue == null) { + if (group.groupValue() == null) { return; - } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) { + } else if (group.groupValue().getClass().isAssignableFrom(MutableValueStr.class)) { return; - } else if (((BytesRef) group.groupValue).length == 0) { + } else if (((BytesRef) group.groupValue()).length == 0) { return; } fail(); } - if (group.groupValue.getClass().isAssignableFrom(BytesRef.class)) { - assertEquals(new BytesRef(expected), group.groupValue); - } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) { + if (group.groupValue().getClass().isAssignableFrom(BytesRef.class)) { + assertEquals(new BytesRef(expected), group.groupValue()); + } else if (group.groupValue().getClass().isAssignableFrom(MutableValueStr.class)) { MutableValueStr v = new MutableValueStr(); v.value.copyChars(expected); - assertEquals(v, group.groupValue); + assertEquals(v, group.groupValue()); } else { fail(); } @@ -356,15 +357,17 @@ private TopGroups getTopGroups(TopGroupsCollector c, int withinGroupOf List> groups = new ArrayList<>(mvalTopGroups.groups.length); for (GroupDocs mvalGd : mvalTopGroups.groups) { BytesRef groupValue = - mvalGd.groupValue.exists() ? ((MutableValueStr) mvalGd.groupValue).value.get() : null; + mvalGd.groupValue().exists() + ? 
((MutableValueStr) mvalGd.groupValue()).value.get() + : null; groups.add( new GroupDocs<>( Float.NaN, - mvalGd.maxScore, - mvalGd.totalHits, - mvalGd.scoreDocs, + mvalGd.maxScore(), + mvalGd.totalHits(), + mvalGd.scoreDocs(), groupValue, - mvalGd.groupSortValues)); + mvalGd.groupSortValues())); } // NOTE: currenlty using diamond operator on MergedIterator (without explicit Term class) // causes @@ -1136,12 +1139,12 @@ public void testRandom() throws Exception { for (GroupDocs gd : expectedGroups.groups) { System.out.println( " group=" - + (gd.groupValue == null ? "null" : gd.groupValue) + + (gd.groupValue() == null ? "null" : gd.groupValue()) + " totalHits=" - + gd.totalHits.value + + gd.totalHits().value() + " scoreDocs.len=" - + gd.scoreDocs.length); - for (ScoreDoc sd : gd.scoreDocs) { + + gd.scoreDocs().length); + for (ScoreDoc sd : gd.scoreDocs()) { System.out.println(" id=" + sd.doc + " score=" + sd.score); } } @@ -1155,10 +1158,10 @@ public void testRandom() throws Exception { for (GroupDocs gd : groupsResult.groups) { System.out.println( " group=" - + (gd.groupValue == null ? "null" : gd.groupValue) + + (gd.groupValue() == null ? "null" : gd.groupValue()) + " totalHits=" - + gd.totalHits.value); - for (ScoreDoc sd : gd.scoreDocs) { + + gd.totalHits().value()); + for (ScoreDoc sd : gd.scoreDocs()) { System.out.println(" id=" + docIDToID[sd.doc] + " score=" + sd.score); } } @@ -1180,10 +1183,10 @@ public void testRandom() throws Exception { for (GroupDocs gd : topGroupsShards.groups) { System.out.println( " group=" - + (gd.groupValue == null ? "null" : gd.groupValue) + + (gd.groupValue() == null ? "null" : gd.groupValue()) + " totalHits=" - + gd.totalHits.value); - for (ScoreDoc sd : gd.scoreDocs) { + + gd.totalHits().value()); + for (ScoreDoc sd : gd.scoreDocs()) { System.out.println(" id=" + docIDToID[sd.doc] + " score=" + sd.score); } } @@ -1244,10 +1247,10 @@ public void testRandom() throws Exception { for (GroupDocs gd : groupsResultBlocks.groups) { System.out.println( " group=" - + (gd.groupValue == null ? "null" : gd.groupValue.utf8ToString()) + + (gd.groupValue() == null ? 
"null" : gd.groupValue().utf8ToString()) + " totalHits=" - + gd.totalHits.value); - for (ScoreDoc sd : gd.scoreDocs) { + + gd.totalHits().value()); + for (ScoreDoc sd : gd.scoreDocs()) { System.out.println(" id=" + docIDToIDBlocks[sd.doc] + " score=" + sd.score); if (first) { System.out.println("explain: " + sBlocks.explain(query, sd.doc)); @@ -1277,7 +1280,7 @@ public void testRandom() throws Exception { if (expectedGroups != null) { // Fixup scores for reader2 for (GroupDocs groupDocsHits : expectedGroups.groups) { - for (ScoreDoc hit : groupDocsHits.scoreDocs) { + for (ScoreDoc hit : groupDocsHits.scoreDocs()) { final GroupDoc gd = groupDocsByID[hit.doc]; assertEquals(gd.id, hit.doc); // System.out.println("fixup score " + hit.score + " to " + gd.score2 + " vs " + @@ -1291,12 +1294,12 @@ public void testRandom() throws Exception { for (int groupSortIDX = 0; groupSortIDX < sortFields.length; groupSortIDX++) { if (sortFields[groupSortIDX].getType() == SortField.Type.SCORE) { for (GroupDocs groupDocsHits : expectedGroups.groups) { - if (groupDocsHits.groupSortValues != null) { + if (groupDocsHits.groupSortValues() != null) { // System.out.println("remap " + groupDocsHits.groupSortValues[groupSortIDX] + " // to " + termScoreMap.get(groupDocsHits.groupSortValues[groupSortIDX])); - groupDocsHits.groupSortValues[groupSortIDX] = - termScoreMap.get(groupDocsHits.groupSortValues[groupSortIDX]); - assertNotNull(groupDocsHits.groupSortValues[groupSortIDX]); + groupDocsHits.groupSortValues()[groupSortIDX] = + termScoreMap.get(groupDocsHits.groupSortValues()[groupSortIDX]); + assertNotNull(groupDocsHits.groupSortValues()[groupSortIDX]); } } } @@ -1306,7 +1309,7 @@ public void testRandom() throws Exception { for (int docSortIDX = 0; docSortIDX < docSortFields.length; docSortIDX++) { if (docSortFields[docSortIDX].getType() == SortField.Type.SCORE) { for (GroupDocs groupDocsHits : expectedGroups.groups) { - for (ScoreDoc _hit : groupDocsHits.scoreDocs) { + for (ScoreDoc _hit : groupDocsHits.scoreDocs()) { FieldDoc hit = (FieldDoc) _hit; if (hit.fields != null) { hit.fields[docSortIDX] = termScoreMap.get(hit.fields[docSortIDX]); @@ -1332,8 +1335,8 @@ public void testRandom() throws Exception { private void verifyShards(int[] docStarts, TopGroups topGroups) { for (GroupDocs group : topGroups.groups) { - for (int hitIDX = 0; hitIDX < group.scoreDocs.length; hitIDX++) { - final ScoreDoc sd = group.scoreDocs[hitIDX]; + for (int hitIDX = 0; hitIDX < group.scoreDocs().length; hitIDX++) { + final ScoreDoc sd = group.scoreDocs()[hitIDX]; assertEquals( "doc=" + sd.doc + " wrong shard", ReaderUtil.subIndex(sd.doc, docStarts), @@ -1465,11 +1468,11 @@ private TopGroups searchShards( for (GroupDocs group : shardTopGroups[shardIDX].groups) { System.out.println( " [" - + groupToString(group.groupValue) + + groupToString(group.groupValue()) + "] groupSort=" - + Arrays.toString(group.groupSortValues) + + Arrays.toString(group.groupSortValues()) + " numDocs=" - + group.scoreDocs.length); + + group.scoreDocs().length); } } } @@ -1487,11 +1490,11 @@ private TopGroups searchShards( for (GroupDocs group : mergedGroups.groups) { System.out.println( " [" - + groupToString(group.groupValue) + + groupToString(group.groupValue()) + "] groupSort=" - + Arrays.toString(group.groupSortValues) + + Arrays.toString(group.groupSortValues()) + " numDocs=" - + group.scoreDocs.length); + + group.scoreDocs().length); } } return mergedGroups; @@ -1540,23 +1543,23 @@ private void assertEquals( final GroupDocs actualGroup = 
actual.groups[groupIDX]; if (verifyGroupValues) { if (idvBasedImplsUsed) { - if (actualGroup.groupValue.length == 0) { - assertNull(expectedGroup.groupValue); + if (actualGroup.groupValue().length == 0) { + assertNull(expectedGroup.groupValue()); } else { - assertEquals(expectedGroup.groupValue, actualGroup.groupValue); + assertEquals(expectedGroup.groupValue(), actualGroup.groupValue()); } } else { - assertEquals(expectedGroup.groupValue, actualGroup.groupValue); + assertEquals(expectedGroup.groupValue(), actualGroup.groupValue()); } } - assertArrayEquals(expectedGroup.groupSortValues, actualGroup.groupSortValues); + assertArrayEquals(expectedGroup.groupSortValues(), actualGroup.groupSortValues()); // TODO // assertEquals(expectedGroup.maxScore, actualGroup.maxScore); - assertEquals(expectedGroup.totalHits.value, actualGroup.totalHits.value); + assertEquals(expectedGroup.totalHits().value(), actualGroup.totalHits().value()); - final ScoreDoc[] expectedFDs = expectedGroup.scoreDocs; - final ScoreDoc[] actualFDs = actualGroup.scoreDocs; + final ScoreDoc[] expectedFDs = expectedGroup.scoreDocs(); + final ScoreDoc[] actualFDs = actualGroup.scoreDocs(); assertEquals(expectedFDs.length, actualFDs.length); for (int docIDX = 0; docIDX < expectedFDs.length; docIDX++) { @@ -1579,7 +1582,7 @@ public ShardSearcher(LeafReaderContext ctx, IndexReaderContext parent) { } public void search(Weight weight, Collector collector) throws IOException { - searchLeaf(ctx, weight, collector); + searchLeaf(ctx, 0, DocIdSetIterator.NO_MORE_DOCS, weight, collector); } @Override diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGroupingSearch.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGroupingSearch.java index 95132d735a56..308b7d2c0702 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGroupingSearch.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGroupingSearch.java @@ -135,29 +135,29 @@ public void testBasic() throws Exception { // value GroupDocs group = groups.groups[0]; compareGroupValue("author3", group); - assertEquals(2, group.scoreDocs.length); - assertEquals(5, group.scoreDocs[0].doc); - assertEquals(4, group.scoreDocs[1].doc); - assertTrue(group.scoreDocs[0].score >= group.scoreDocs[1].score); + assertEquals(2, group.scoreDocs().length); + assertEquals(5, group.scoreDocs()[0].doc); + assertEquals(4, group.scoreDocs()[1].doc); + assertTrue(group.scoreDocs()[0].score >= group.scoreDocs()[1].score); group = groups.groups[1]; compareGroupValue("author1", group); - assertEquals(3, group.scoreDocs.length); - assertEquals(0, group.scoreDocs[0].doc); - assertEquals(1, group.scoreDocs[1].doc); - assertEquals(2, group.scoreDocs[2].doc); - assertTrue(group.scoreDocs[0].score >= group.scoreDocs[1].score); - assertTrue(group.scoreDocs[1].score >= group.scoreDocs[2].score); + assertEquals(3, group.scoreDocs().length); + assertEquals(0, group.scoreDocs()[0].doc); + assertEquals(1, group.scoreDocs()[1].doc); + assertEquals(2, group.scoreDocs()[2].doc); + assertTrue(group.scoreDocs()[0].score >= group.scoreDocs()[1].score); + assertTrue(group.scoreDocs()[1].score >= group.scoreDocs()[2].score); group = groups.groups[2]; compareGroupValue("author2", group); - assertEquals(1, group.scoreDocs.length); - assertEquals(3, group.scoreDocs[0].doc); + assertEquals(1, group.scoreDocs().length); + assertEquals(3, group.scoreDocs()[0].doc); group = groups.groups[3]; compareGroupValue(null, group); - assertEquals(1, 
group.scoreDocs.length); - assertEquals(6, group.scoreDocs[0].doc); + assertEquals(1, group.scoreDocs().length); + assertEquals(6, group.scoreDocs()[0].doc); Query lastDocInBlock = new TermQuery(new Term("groupend", "x")); groupingSearch = new GroupingSearch(lastDocInBlock); @@ -182,22 +182,22 @@ private void addGroupField(Document doc, String groupField, String value, boolea private void compareGroupValue(String expected, GroupDocs group) { if (expected == null) { - if (group.groupValue == null) { + if (group.groupValue() == null) { return; - } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) { + } else if (group.groupValue().getClass().isAssignableFrom(MutableValueStr.class)) { return; - } else if (((BytesRef) group.groupValue).length == 0) { + } else if (((BytesRef) group.groupValue()).length == 0) { return; } fail(); } - if (group.groupValue.getClass().isAssignableFrom(BytesRef.class)) { - assertEquals(new BytesRef(expected), group.groupValue); - } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) { + if (group.groupValue().getClass().isAssignableFrom(BytesRef.class)) { + assertEquals(new BytesRef(expected), group.groupValue()); + } else if (group.groupValue().getClass().isAssignableFrom(MutableValueStr.class)) { MutableValueStr v = new MutableValueStr(); v.value.copyChars(expected); - assertEquals(v, group.groupValue); + assertEquals(v, group.groupValue()); } else { fail(); } diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestTopGroups.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestTopGroups.java index 8740b639fa7c..f7fe36ef4013 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestTopGroups.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestTopGroups.java @@ -128,8 +128,8 @@ private void narrativeMergeTestImplementation( new TopGroups( sort.getSort() /* groupSort */, sort.getSort() /* withinGroupSort */, - group1.scoreDocs.length + group2.scoreDocs.length /* totalHitCount */, - group1.scoreDocs.length + group2.scoreDocs.length /* totalGroupedHitCount */, + group1.scoreDocs().length + group2.scoreDocs().length /* totalHitCount */, + group1.scoreDocs().length + group2.scoreDocs().length /* totalGroupedHitCount */, combineGroupDocs(group1, group2) /* groups */, (haveBlueWhale ? blueWhaleScore @@ -162,8 +162,8 @@ private void narrativeMergeTestImplementation( new TopGroups( sort.getSort() /* groupSort */, sort.getSort() /* withinGroupSort */, - group1.scoreDocs.length + group2.scoreDocs.length /* totalHitCount */, - group1.scoreDocs.length + group2.scoreDocs.length /* totalGroupedHitCount */, + group1.scoreDocs().length + group2.scoreDocs().length /* totalHitCount */, + group1.scoreDocs().length + group2.scoreDocs().length /* totalGroupedHitCount */, combineGroupDocs(group1, group2) /* groups */, (haveRedSquirrel ? redSquirrelScore @@ -191,16 +191,16 @@ private void narrativeMergeTestImplementation( assertEquals(2, mergedTopGroups.groups.length); { - assertEquals(blueGroupValue, mergedTopGroups.groups[0].groupValue); + assertEquals(blueGroupValue, mergedTopGroups.groups[0].groupValue()); final float expectedBlueMaxScore = (haveBlueWhale ? blueWhaleScore : (haveBlueDragonfly ? 
blueDragonflyScore : Float.NaN)); - checkMaxScore(expectedBlueMaxScore, mergedTopGroups.groups[0].maxScore); + checkMaxScore(expectedBlueMaxScore, mergedTopGroups.groups[0].maxScore()); } { - assertEquals(redGroupValue, mergedTopGroups.groups[1].groupValue); + assertEquals(redGroupValue, mergedTopGroups.groups[1].groupValue()); final float expectedRedMaxScore = (haveRedSquirrel ? redSquirrelScore : (haveRedAnt ? redAntScore : Float.NaN)); - checkMaxScore(expectedRedMaxScore, mergedTopGroups.groups[1].maxScore); + checkMaxScore(expectedRedMaxScore, mergedTopGroups.groups[1].maxScore()); } final float expectedMaxScore = diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index f60c7966f984..cb8c71a089f0 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -21,6 +21,7 @@ import java.util.Iterator; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; @@ -96,7 +97,7 @@ public int size() { terms.hasPayloads(), indexOptions, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, Collections.emptyMap(), 0, diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index c4cd50274f1c..b94f73baefb7 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -54,6 +54,7 @@ import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.FieldExistsQuery; +import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; @@ -163,6 +164,11 @@ protected void extract(Query query, float boost, Map t new SpanNearQuery(clauses, phraseQuery.getSlop() + positionGaps, inorder); extractWeightedSpanTerms(terms, sp, boost); } + } else if (query instanceof IndexOrDocValuesQuery) { + Query indexQuery = ((IndexOrDocValuesQuery) query).getIndexQuery(); + if (indexQuery != null) { + extract(indexQuery, boost, terms); + } } else if (query instanceof TermQuery || query instanceof SynonymQuery) { extractWeightedTerms(terms, query, boost); } else if (query instanceof SpanQuery) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java index 27281a91be7e..4cd2b07fc1ca 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java @@ -64,7 +64,7 @@ public String format(Passage[] passages, String content) { int pos = 0; for (Passage passage : passages) { // don't add ellipsis if its the first one, or if its connected. 
- if (passage.getStartOffset() > pos && pos > 0) { + if (!sb.isEmpty() && passage.getStartOffset() != pos) { sb.append(ellipsis); } pos = passage.getStartOffset(); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index dcc8e5af873f..8275a59754be 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -46,7 +46,7 @@ public FieldOffsetStrategy(UHComponents components) { } public String getField() { - return components.getField(); + return components.field(); } public abstract UnifiedHighlighter.OffsetSource getOffsetSource(); @@ -69,7 +69,7 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc final List offsetsEnums = new ArrayList<>(); // Handle Weight.matches approach - if (components.getHighlightFlags().contains(UnifiedHighlighter.HighlightFlag.WEIGHT_MATCHES)) { + if (components.highlightFlags().contains(UnifiedHighlighter.HighlightFlag.WEIGHT_MATCHES)) { createOffsetsEnumsWeightMatcher(leafReader, doc, offsetsEnums); @@ -77,8 +77,8 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc // Handle position insensitive terms (a subset of this.terms field): final BytesRef[] insensitiveTerms; - final PhraseHelper phraseHelper = components.getPhraseHelper(); - final BytesRef[] terms = components.getTerms(); + final PhraseHelper phraseHelper = components.phraseHelper(); + final BytesRef[] terms = components.terms(); if (phraseHelper.hasPositionSensitivity()) { insensitiveTerms = phraseHelper.getAllPositionInsensitiveTerms(); assert insensitiveTerms.length <= terms.length @@ -96,7 +96,7 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc } // Handle automata - if (components.getAutomata().length > 0) { + if (components.automata().length > 0) { createOffsetsEnumsForAutomata(termsIndex, doc, offsetsEnums); } } @@ -118,8 +118,8 @@ protected void createOffsetsEnumsWeightMatcher( new FilterLeafReader(_leafReader) { @Override public Terms terms(String field) throws IOException { - if (components.getFieldMatcher().test(field)) { - return super.terms(components.getField()); + if (components.fieldMatcher().test(field)) { + return super.terms(components.field()); } else { return super.terms(field); } @@ -142,14 +142,14 @@ public CacheHelper getReaderCacheHelper() { indexSearcher.setQueryCache(null); Matches matches = indexSearcher - .rewrite(components.getQuery()) + .rewrite(components.query()) .createWeight(indexSearcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f) .matches(leafReader.getContext(), docId); if (matches == null) { return; // doc doesn't match } for (String field : matches) { - if (components.getFieldMatcher().test(field)) { + if (components.fieldMatcher().test(field)) { MatchesIterator iterator = matches.getMatches(field); if (iterator == null) { continue; @@ -180,7 +180,7 @@ protected void createOffsetsEnumsForTerms( protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List results) throws IOException { - final LabelledCharArrayMatcher[] automata = components.getAutomata(); + final LabelledCharArrayMatcher[] automata = components.automata(); List> automataPostings = new ArrayList<>(automata.length); for (int i = 0; i < automata.length; i++) { automataPostings.add(new ArrayList<>()); diff --git 
a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index a4b090902ed4..749cb765b39d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -44,8 +44,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) { super(components, analyzer); - boolean storePayloads = - components.getPhraseHelper().hasPositionSensitivity(); // might be needed + boolean storePayloads = components.phraseHelper().hasPositionSensitivity(); // might be needed memoryIndex = new MemoryIndex(true, storePayloads); // true==store offsets memIndexLeafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable @@ -56,26 +55,26 @@ public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) { /** Build one {@link CharArrayMatcher} matching any term the query might match. */ private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) { // We don't know enough about the query to do this confidently - if (components.getTerms() == null || components.getAutomata() == null) { + if (components.terms() == null || components.automata() == null) { return null; } List allAutomata = new ArrayList<>(); - if (components.getTerms().length > 0) { + if (components.terms().length > 0) { // Filter out any long terms that would otherwise cause exceptions if we tried // to build an automaton on them List filteredTerms = - Arrays.stream(components.getTerms()) + Arrays.stream(components.terms()) .filter(b -> b.length < Automata.MAX_STRING_UNION_TERM_LENGTH) .toList(); allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms)); } - Collections.addAll(allAutomata, components.getAutomata()); - for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) { + Collections.addAll(allAutomata, components.automata()); + for (SpanQuery spanQuery : components.phraseHelper().getSpanQueries()) { Collections.addAll( allAutomata, MultiTermHighlighting.extractAutomata( - spanQuery, components.getFieldMatcher(), true)); // true==lookInSpan + spanQuery, components.fieldMatcher(), true)); // true==lookInSpan } if (allAutomata.size() == 1) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java index 05a9743a8cb8..e22286ced43d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java @@ -38,8 +38,8 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy { public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) { super(components, indexAnalyzer); - assert components.getPhraseHelper().hasPositionSensitivity() == false; - combinedAutomata = convertTermsToMatchers(components.getTerms(), components.getAutomata()); + assert components.phraseHelper().hasPositionSensitivity() == false; + combinedAutomata = convertTermsToMatchers(components.terms(), components.automata()); } // TODO this is inefficient; instead build a union automata just for terms part. 
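The next file, UHComponents.java, replaces a hand-written parameter class with a Java record, the same migration applied to IndexContext, GroupedFacetResult, and SubInfo elsewhere in this patch. A minimal sketch of the pattern and its effect on call sites, using simplified placeholder names (ComponentsClass, ComponentsRecord, and RecordMigrationDemo are illustrative only, not actual Lucene types):

    // Illustrative sketch only: placeholder types, not the real Lucene classes.
    // Before: an immutable holder written out by hand.
    final class ComponentsClass {
      private final String field;
      private final int maxLength;

      ComponentsClass(String field, int maxLength) {
        this.field = field;
        this.maxLength = maxLength;
      }

      String getField() { return field; }
      int getMaxLength() { return maxLength; }
    }

    // After: an equivalent record; the compiler generates the canonical constructor,
    // equals/hashCode/toString, and one accessor per component, named after the component.
    record ComponentsRecord(String field, int maxLength) {}

    class RecordMigrationDemo {
      public static void main(String[] args) {
        ComponentsClass before = new ComponentsClass("body", 10_000);
        ComponentsRecord after = new ComponentsRecord("body", 10_000);
        // Call sites move from JavaBean-style getters to record accessors:
        System.out.println(before.getField() + " -> " + after.field());
        System.out.println(before.getMaxLength() + " -> " + after.maxLength());
      }
    }

This is why the bulk of the hunks above and below mechanically rename accessor calls, e.g. getValue()/getCount() to value()/count() and getTerms()/getAutomata() to terms()/automata(), without changing behavior.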
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java index 96e40a4cbca9..a8d4d6179867 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java @@ -25,71 +25,19 @@ /** * A parameter object to hold the components a {@link FieldOffsetStrategy} needs. * + * @param terms Query: all terms we extracted (some may be position sensitive) + * @param phraseHelper Query: position-sensitive information + * @param automata Query: wildcards (i.e. multi-term query), not position sensitive + * @param hasUnrecognizedQueryPart Query: if part of the query (other than the extracted terms / + * automata) is a leaf we don't know * @lucene.internal */ -public class UHComponents { - private final String field; - private final Predicate fieldMatcher; - private final Query query; - // Query: all terms we extracted (some may be position sensitive) - private final BytesRef[] terms; - // Query: position-sensitive information - private final PhraseHelper phraseHelper; - // Query: wildcards (i.e. multi-term query), not position sensitive - private final LabelledCharArrayMatcher[] automata; - // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know - private final boolean hasUnrecognizedQueryPart; - - private final Set highlightFlags; - - public UHComponents( - String field, - Predicate fieldMatcher, - Query query, - BytesRef[] terms, - PhraseHelper phraseHelper, - LabelledCharArrayMatcher[] automata, - boolean hasUnrecognizedQueryPart, - Set highlightFlags) { - this.field = field; - this.fieldMatcher = fieldMatcher; - this.query = query; - this.terms = terms; - this.phraseHelper = phraseHelper; - this.automata = automata; - this.hasUnrecognizedQueryPart = hasUnrecognizedQueryPart; - this.highlightFlags = highlightFlags; - } - - public String getField() { - return field; - } - - public Predicate getFieldMatcher() { - return fieldMatcher; - } - - public Query getQuery() { - return query; - } - - public BytesRef[] getTerms() { - return terms; - } - - public PhraseHelper getPhraseHelper() { - return phraseHelper; - } - - public LabelledCharArrayMatcher[] getAutomata() { - return automata; - } - - public boolean hasUnrecognizedQueryPart() { - return hasUnrecognizedQueryPart; - } - - public Set getHighlightFlags() { - return highlightFlags; - } -} +public record UHComponents( + String field, + Predicate fieldMatcher, + Query query, + BytesRef[] terms, + PhraseHelper phraseHelper, + LabelledCharArrayMatcher[] automata, + boolean hasUnrecognizedQueryPart, + Set highlightFlags) {} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index 397239c7d66b..75728bbfee73 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -684,7 +684,7 @@ public enum OffsetSource { *

  <li>If there's a field info it has {@link * IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} then {@link OffsetSource#POSTINGS} * is returned. - *
  <li>If there's a field info and {@link FieldInfo#hasVectors()} then {@link + *
  <li>If there's a field info and {@link FieldInfo#hasTermVectors()} then {@link * OffsetSource#TERM_VECTORS} is returned (note we can't check here if the TV has offsets; * if there isn't then an exception will get thrown down the line). *
  • Fall-back: {@link OffsetSource#ANALYSIS} is returned. @@ -698,11 +698,11 @@ protected OffsetSource getOffsetSource(String field) { FieldInfo fieldInfo = getFieldInfo(field); if (fieldInfo != null) { if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) { - return fieldInfo.hasVectors() + return fieldInfo.hasTermVectors() ? OffsetSource.POSTINGS_WITH_TERM_VECTORS : OffsetSource.POSTINGS; } - if (fieldInfo.hasVectors()) { // unfortunately we can't also check if the TV has offsets + if (fieldInfo.hasTermVectors()) { // unfortunately we can't also check if the TV has offsets return OffsetSource.TERM_VECTORS; } } @@ -1252,19 +1252,17 @@ protected LabelledCharArrayMatcher[] getAutomata( } protected OffsetSource getOptimizedOffsetSource(UHComponents components) { - OffsetSource offsetSource = getOffsetSource(components.getField()); + OffsetSource offsetSource = getOffsetSource(components.field()); // null automata means unknown, so assume a possibility boolean mtqOrRewrite = - components.getAutomata() == null - || components.getAutomata().length > 0 - || components.getPhraseHelper().willRewrite() + components.automata() == null + || components.automata().length > 0 + || components.phraseHelper().willRewrite() || components.hasUnrecognizedQueryPart(); // null terms means unknown, so assume something to highlight - if (mtqOrRewrite == false - && components.getTerms() != null - && components.getTerms().length == 0) { + if (mtqOrRewrite == false && components.terms() != null && components.terms().length == 0) { return OffsetSource.NONE_NEEDED; // nothing to highlight } @@ -1295,9 +1293,9 @@ protected FieldOffsetStrategy getOffsetStrategy( OffsetSource offsetSource, UHComponents components) { switch (offsetSource) { case ANALYSIS: - if (!components.getPhraseHelper().hasPositionSensitivity() - && !components.getHighlightFlags().contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED) - && !components.getHighlightFlags().contains(HighlightFlag.WEIGHT_MATCHES)) { + if (!components.phraseHelper().hasPositionSensitivity() + && !components.highlightFlags().contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED) + && !components.highlightFlags().contains(HighlightFlag.WEIGHT_MATCHES)) { // skip using a memory index since it's pure term filtering return new TokenStreamOffsetStrategy(components, getIndexAnalyzer()); } else { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java index e46ecbfb466b..964f039e5438 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java @@ -18,8 +18,6 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -187,7 +185,7 @@ protected Field[] getFields(IndexReader reader, int docId, final String fieldNam public void stringField(FieldInfo fieldInfo, String value) { Objects.requireNonNull(value, "String value should not be null"); FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setStoreTermVectors(fieldInfo.hasVectors()); + ft.setStoreTermVectors(fieldInfo.hasTermVectors()); fields.add(new Field(fieldInfo.name, value, ft)); } @@ -215,18 +213,18 @@ protected String makeFragment( buffer, index, values, s, 
fragInfo.getEndOffset(), modifiedStartOffset); int srcIndex = 0; for (SubInfo subInfo : fragInfo.getSubInfos()) { - for (Toffs to : subInfo.getTermsOffsets()) { + for (Toffs to : subInfo.termsOffsets()) { fragment .append( encoder.encodeText( src.substring(srcIndex, to.getStartOffset() - modifiedStartOffset[0]))) - .append(getPreTag(preTags, subInfo.getSeqnum())) + .append(getPreTag(preTags, subInfo.seqnum())) .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0]))) - .append(getPostTag(postTags, subInfo.getSeqnum())); + .append(getPostTag(postTags, subInfo.seqnum())); srcIndex = to.getEndOffset() - modifiedStartOffset[0]; } } @@ -298,7 +296,7 @@ protected List discreteMultiValueHighlighting( continue fragInfos; } - Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0); + Toffs firstToffs = fragInfo.getSubInfos().get(0).termsOffsets().get(0); if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) { continue; } @@ -320,7 +318,7 @@ protected List discreteMultiValueHighlighting( while (subInfoIterator.hasNext()) { SubInfo subInfo = subInfoIterator.next(); List toffsList = new ArrayList<>(); - Iterator toffsIterator = subInfo.getTermsOffsets().iterator(); + Iterator toffsIterator = subInfo.termsOffsets().iterator(); while (toffsIterator.hasNext()) { Toffs toffs = toffsIterator.next(); if (toffs.getStartOffset() >= fieldEnd) { @@ -359,12 +357,11 @@ protected List discreteMultiValueHighlighting( } } if (!toffsList.isEmpty()) { - subInfos.add( - new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum(), subInfo.getBoost())); - boost += subInfo.getBoost(); + subInfos.add(new SubInfo(subInfo.text(), toffsList, subInfo.seqnum(), subInfo.boost())); + boost += subInfo.boost(); } - if (subInfo.getTermsOffsets().isEmpty()) { + if (subInfo.termsOffsets().isEmpty()) { subInfoIterator.remove(); } } @@ -378,16 +375,7 @@ protected List discreteMultiValueHighlighting( for (List weightedFragInfos : fieldNameToFragInfos.values()) { result.addAll(weightedFragInfos); } - Collections.sort( - result, - new Comparator() { - - @Override - public int compare( - FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) { - return info1.getStartOffset() - info2.getStartOffset(); - } - }); + result.sort((info1, info2) -> info1.getStartOffset() - info2.getStartOffset()); return result; } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java index d08060b8945c..6799b39f856b 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java @@ -101,37 +101,15 @@ public String toString() { return sb.toString(); } - /** Represents the list of term offsets for some text */ - public static class SubInfo { - private final String text; // unnecessary member, just exists for debugging purpose - // usually termsOffsets.size() == 1, - // but if position-gap > 1 and slop > 0 then size() could be greater than 1 - private final List termsOffsets; - private final int seqnum; - private final float boost; // used for scoring split WeightedPhraseInfos. 
- - public SubInfo(String text, List termsOffsets, int seqnum, float boost) { - this.text = text; - this.termsOffsets = termsOffsets; - this.seqnum = seqnum; - this.boost = boost; - } - - public List getTermsOffsets() { - return termsOffsets; - } - - public int getSeqnum() { - return seqnum; - } - - public String getText() { - return text; - } - - public float getBoost() { - return boost; - } + /** + * Represents the list of term offsets for some text + * + * @param text unnecessary member, just exists for debugging purpose + * @param termsOffsets usually termsOffsets.size() == 1, but if position-gap > 1 and slop > 0 + * then size() could be greater than 1 + * @param boost used for scoring split WeightedPhraseInfos. + */ + public record SubInfo(String text, List termsOffsets, int seqnum, float boost) { @Override public String toString() { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java index 6965ee661170..1b3c7c1a88a7 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java @@ -70,13 +70,10 @@ public void add(int startOffset, int endOffset, List phraseI float totalBoost = 0; for (SubInfo tempSubInfo : tempSubInfos) { - float subInfoBoost = tempSubInfo.getBoost() * norm; + float subInfoBoost = tempSubInfo.boost() * norm; realSubInfos.add( new SubInfo( - tempSubInfo.getText(), - tempSubInfo.getTermsOffsets(), - tempSubInfo.getSeqnum(), - subInfoBoost)); + tempSubInfo.text(), tempSubInfo.termsOffsets(), tempSubInfo.seqnum(), subInfoBoost)); totalBoost += subInfoBoost; } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighter.java index 1664eb4be67d..a379cf26e6cd 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighter.java @@ -59,12 +59,14 @@ import org.apache.lucene.queries.spans.SpanOrQuery; import org.apache.lucene.queries.spans.SpanQuery; import org.apache.lucene.queries.spans.SpanTermQuery; +import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DoubleValuesSource; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiTermQuery; @@ -153,7 +155,7 @@ public void testFunctionScoreQuery() throws Exception { searcher = newSearcher(reader); TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE)); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(scorer); @@ -205,7 +207,7 @@ public void testHighlightingCommonTermsQuery() throws Exception { searcher = newSearcher(reader); TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE)); - 
assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(scorer); @@ -255,6 +257,27 @@ public void testHighlightingSynonymQuery() throws Exception { assertEquals("John Kennedy has been shot", fragment); } + public void testHighlightingIndexOrDocValuesQuery() throws Exception { + searcher = newSearcher(reader); + BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); + booleanQueryBuilder.add(new TermQuery(new Term(FIELD_NAME, "jfk")), BooleanClause.Occur.SHOULD); + booleanQueryBuilder.add( + new TermQuery(new Term(FIELD_NAME, "kennedy")), BooleanClause.Occur.SHOULD); + Query indexQuery = booleanQueryBuilder.build(); + Query dvQuery = TermRangeQuery.newStringRange(FIELD_NAME, "a", "z", true, true); + Query query = new IndexOrDocValuesQuery(indexQuery, dvQuery); + QueryScorer scorer = new QueryScorer(query, FIELD_NAME); + Highlighter highlighter = new Highlighter(scorer); + TokenStream stream = getAnyTokenStream(FIELD_NAME, 2); + String storedField = searcher.storedFields().document(2).get(FIELD_NAME); + String fragment = highlighter.getBestFragment(stream, storedField); + assertEquals("JFK has been shot", fragment); + stream = getAnyTokenStream(FIELD_NAME, 3); + storedField = searcher.storedFields().document(3).get(FIELD_NAME); + fragment = highlighter.getBestFragment(stream, storedField); + assertEquals("John Kennedy has been shot", fragment); + } + public void testHighlightUnknownQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException { Query query = @@ -290,7 +313,7 @@ public boolean equals(Object obj) { searcher = newSearcher(reader); TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE)); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(scorer); @@ -374,7 +397,7 @@ public void testSimpleSpanHighlighter() throws Exception { QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { String text = searcher.storedFields().document(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); highlighter.setTextFragmenter(new SimpleFragmenter(40)); @@ -398,8 +421,8 @@ public void testSimpleSpanHighlighterWithStopWordsStraddlingFragmentBoundaries() QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(scorer); - assertEquals("Must have one hit", 1, hits.totalHits.value); - for (int i = 0; i < hits.totalHits.value; i++) { + assertEquals("Must have one hit", 1, hits.totalHits.value()); + for (int i = 0; i < hits.totalHits.value(); i++) { String text = searcher.storedFields().document(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); @@ -465,7 +488,7 @@ public void testSimpleQueryScorerPhraseHighlighting() throws Exception { QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ 
-498,7 +521,7 @@ public void testSimpleQueryScorerPhraseHighlighting() throws Exception { scorer = new QueryScorer(query, FIELD_NAME); highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -531,7 +554,7 @@ public void testSimpleQueryScorerPhraseHighlighting() throws Exception { scorer = new QueryScorer(query, FIELD_NAME); highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -560,7 +583,7 @@ public void testSpanRegexQuery() throws Exception { QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -587,7 +610,7 @@ public void testRegexQuery() throws Exception { QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -614,7 +637,7 @@ public void testExternalReader() throws Exception { QueryScorer scorer = new QueryScorer(query, reader, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -643,7 +666,7 @@ public void testDimensionalRangeQuery() throws Exception { QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { String text = searcher .storedFields() @@ -671,7 +694,7 @@ public void testSimpleQueryScorerPhraseHighlighting2() throws Exception { Highlighter highlighter = new Highlighter(this, scorer); highlighter.setTextFragmenter(new SimpleFragmenter(40)); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -693,7 +716,7 @@ public void testSimpleQueryScorerPhraseHighlighting3() throws Exception { int maxNumFragmentsRequired = 2; - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -727,7 +750,7 @@ public void testSimpleSpanFragmenter() throws Exception { QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < 
hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -749,7 +772,7 @@ public void testSimpleSpanFragmenter() throws Exception { scorer = new QueryScorer(query, FIELD_NAME); highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { String text = searcher.storedFields().document(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); @@ -776,7 +799,7 @@ public void testPosTermStdTerm() throws Exception { QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -857,7 +880,7 @@ public void testSimpleQueryTermScorerHighlighter() throws Exception { Highlighter highlighter = new Highlighter(new QueryTermScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); int maxNumFragmentsRequired = 2; - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -1102,7 +1125,7 @@ public void testConstantScoreMultiTermQuery() throws Exception { if (VERBOSE) System.out.println("Searching for: " + query.toString(FIELD_NAME)); hits = searcher.search(query, 1000); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -1130,7 +1153,7 @@ public void testConstantScoreMultiTermQuery() throws Exception { numHighlights = 0; - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -1158,7 +1181,7 @@ public void testConstantScoreMultiTermQuery() throws Exception { numHighlights = 0; - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -1356,7 +1379,7 @@ public void testGetBestSingleFragment() throws Exception { public void run() throws Exception { doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); numHighlights = 0; - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -1372,7 +1395,7 @@ public void run() throws Exception { numHighlights == 4); numHighlights = 0; - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -1385,7 +1408,7 @@ public void run() throws Exception { numHighlights == 4); numHighlights = 0; - for (int i = 
0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -1532,7 +1555,7 @@ public void run() throws Exception { doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); // new Highlighter(TestHighlighter.this, new QueryTermScorer(query)); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { String text = searcher.storedFields().document(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); Highlighter highlighter = getHighlighter(query, FIELD_NAME, TestHighlighter.this); @@ -1556,7 +1579,7 @@ public void run() throws Exception { doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -1770,7 +1793,7 @@ public void run() throws Exception { int maxNumFragmentsRequired = 3; - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(FIELD_NAME); @@ -2161,7 +2184,7 @@ private void searchIndex() throws IOException, InvalidTokenOffsetsException { Highlighter h = new Highlighter(scorer); TopDocs hits = searcher.search(query, 10); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { Document doc = searcher.storedFields().document(hits.scoreDocs[i].doc); String result = h.getBestFragment(a, "t_text1", doc.get("t_text1")); if (VERBOSE) System.out.println("result:" + result); @@ -2230,7 +2253,7 @@ public void doSearching(Query unReWrittenQuery) throws Exception { public void assertExpectedHighlightCount( final int maxNumFragmentsRequired, final int expectedHighlights) throws Exception { - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { String text = searcher.storedFields().document(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); QueryScorer scorer = new QueryScorer(query, FIELD_NAME); @@ -2497,7 +2520,7 @@ void doStandardHighlights( throws Exception { TermVectors termVectors = searcher.getIndexReader().termVectors(); - for (int i = 0; i < hits.totalHits.value; i++) { + for (int i = 0; i < hits.totalHits.value(); i++) { final int docId = hits.scoreDocs[i].doc; final Document doc = searcher.storedFields().document(docId); String text = doc.get(TestHighlighter.FIELD_NAME); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighterPhrase.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighterPhrase.java index f52f277cb37f..3803b71461b1 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighterPhrase.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighterPhrase.java @@ -73,7 +73,7 @@ public void testConcurrentPhrase() throws IOException, InvalidTokenOffsetsExcept final IndexSearcher indexSearcher = newSearcher(indexReader); final PhraseQuery phraseQuery = new PhraseQuery(FIELD, "fox", "jumped"); TopDocs hits = 
indexSearcher.search(phraseQuery, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery)); @@ -170,7 +170,7 @@ public void testSparsePhrase() throws IOException, InvalidTokenOffsetsException final IndexSearcher indexSearcher = newSearcher(indexReader); final PhraseQuery phraseQuery = new PhraseQuery(FIELD, "did", "jump"); TopDocs hits = indexSearcher.search(phraseQuery, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery)); @@ -209,7 +209,7 @@ public void testSparsePhraseWithNoPositions() throws IOException, InvalidTokenOf final IndexSearcher indexSearcher = newSearcher(indexReader); final PhraseQuery phraseQuery = new PhraseQuery(1, FIELD, "did", "jump"); TopDocs hits = indexSearcher.search(phraseQuery, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery)); @@ -255,7 +255,7 @@ public void testSparseSpan() throws IOException, InvalidTokenOffsetsException { true); TopDocs hits = indexSearcher.search(phraseQuery, 1); - assertEquals(0, hits.totalHits.value); + assertEquals(0, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery)); @@ -293,7 +293,7 @@ public void testStopWords() throws IOException, InvalidTokenOffsetsException { .build(); TopDocs hits = indexSearcher.search(phraseQuery, 100); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery)); @@ -326,7 +326,7 @@ public void testInOrderWithStopWords() throws IOException, InvalidTokenOffsetsEx .build(); TopDocs hits = indexSearcher.search(phraseQuery, 100); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestTokenSources.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestTokenSources.java index 087b460c1da0..6a466822b40a 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestTokenSources.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestTokenSources.java @@ -128,7 +128,7 @@ public void testOverlapWithOffset() throws IOException, InvalidTokenOffsetsExcep // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true); TopDocs hits = indexSearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(query)); @@ -171,7 +171,7 @@ public void testOverlapWithPositionsAndOffset() throws IOException, InvalidToken // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true); TopDocs hits = indexSearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new 
QueryScorer(query)); @@ -215,7 +215,7 @@ public void testOverlapWithOffsetExactPhrase() throws IOException, InvalidTokenO true); TopDocs hits = indexSearcher.search(phraseQuery, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery)); @@ -260,7 +260,7 @@ public void testOverlapWithPositionsAndOffsetExactPhrase() true); TopDocs hits = indexSearcher.search(phraseQuery, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery)); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java index b59fea47453e..617077c987c4 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java @@ -75,4 +75,30 @@ public void testOverlappingPassages() throws Exception { "Yin yang loooooooooong, yin gap yang yong", formatter.format(passages, content)); } + + public void testReversedStartOffsetOrder() { + String content = + "When indexing data in Solr, each document is composed of various fields. " + + "A document essentially represents a single record, and each document typically contains a unique ID field."; + + Passage[] passages = new Passage[2]; + passages[0] = new Passage(); + passages[0].setStartOffset(73); + passages[0].setEndOffset(179); + passages[0].setScore(1.8846991f); + passages[0].addMatch(75, 83, new BytesRef("document"), 1); + passages[0].addMatch(133, 141, new BytesRef("document"), 1); + + passages[1] = new Passage(); + passages[1].setStartOffset(0); + passages[1].setEndOffset(73); + passages[1].setScore(1.5923802f); + passages[1].addMatch(33, 41, new BytesRef("document"), 1); + + DefaultPassageFormatter formatter = new DefaultPassageFormatter("", "", "\n", false); + assertEquals( + "A document essentially represents a single record, and each document typically contains a unique ID field.\n" + + "When indexing data in Solr, each document is composed of various fields. ", + formatter.format(passages, content)); + } } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 18720a3d0f4e..98b1ba86f586 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -182,7 +182,7 @@ public void testBasics() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new TermQuery(new Term("body", "highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("Just a test highlighting from postings. 
", snippets[0]); @@ -243,7 +243,7 @@ private String[] formatWithMatchExceedingContentLength(String bodyText) throws I Query query = new TermQuery(new Term("body", "test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer).withMaxLength(maxLength); @@ -272,7 +272,7 @@ public void testHighlightLastWord() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new TermQuery(new Term("body", "test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); assertEquals("This is a test", snippets[0]); @@ -300,7 +300,7 @@ public void testOneSentence() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new TermQuery(new Term("body", "test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -332,7 +332,7 @@ public void testMaxLengthWithMultivalue() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); Query query = new TermQuery(new Term("body", "field")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 10); assertEquals(1, snippets.length); String highlightedValue = "This is a multivalued field. Sentencetwo field."; @@ -369,7 +369,7 @@ public void testMultipleFields() throws Exception { .add(new TermQuery(new Term("title", "best")), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); Map snippets = highlighter.highlightFields(new String[] {"body", "title"}, query, topDocs); assertEquals(2, snippets.size()); @@ -405,7 +405,7 @@ public void testMultipleTerms() throws Exception { .add(new TermQuery(new Term("body", "first")), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("Just a test highlighting from postings. 
", snippets[0]); @@ -434,7 +434,7 @@ public void testMultiplePassages() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new TermQuery(new Term("body", "test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(2, snippets.length); assertEquals( @@ -472,7 +472,7 @@ public void testBuddhism() throws Exception { .add(new Term("body", "origins")) .build(); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer).withHighlightPhrasesStrictly(false); UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); @@ -528,7 +528,7 @@ public void testCuriousGeorge() throws Exception { .add(new Term("body", "george")) .build(); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer).withHighlightPhrasesStrictly(false); UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); @@ -560,7 +560,7 @@ public void testCambridgeMA() throws Exception { .add(new TermQuery(new Term("body", "massachusetts")), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer) .withMaxLength(Integer.MAX_VALUE - 1); @@ -590,7 +590,7 @@ public void testPassageRanking() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new TermQuery(new Term("body", "test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(1, snippets.length); assertEquals( @@ -625,7 +625,7 @@ public void testBooleanMustNot() throws Exception { .build(); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer) .withMaxLength(Integer.MAX_VALUE - 1); @@ -658,7 +658,7 @@ public void testHighlightAllText() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); Query query = new TermQuery(new Term("body", "test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(1, snippets.length); assertEquals( @@ -688,7 +688,7 @@ public void testSpecificDocIDs() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new TermQuery(new Term("body", "highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); ScoreDoc[] hits = topDocs.scoreDocs; int[] 
docIDs = new int[2]; docIDs[0] = hits[0].doc; @@ -745,7 +745,7 @@ protected BreakIterator getBreakIterator(String field) { }.build(); Query query = new TermQuery(new Term("body", "test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(1, snippets.length); assertEquals( @@ -1014,7 +1014,7 @@ public void testMultipleDocs() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); Query query = new TermQuery(new Term("body", "answer")); TopDocs hits = searcher.search(query, numDocs); - assertEquals(numDocs, hits.totalHits.value); + assertEquals(numDocs, hits.totalHits.value()); String[] snippets = highlighter.highlight("body", query, hits); assertEquals(numDocs, snippets.length); @@ -1088,7 +1088,7 @@ public void testEncode() throws Exception { .build(); Query query = new TermQuery(new Term("body", "highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); assertEquals( @@ -1131,7 +1131,7 @@ public String[] format(Passage[] passages, String content) { UnifiedHighlighter.builder(searcher, indexAnalyzer).withFormatter(passageFormatter).build(); Query query = new TermQuery(new Term("body", "highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); int[] docIDs = new int[1]; docIDs[0] = topDocs.scoreDocs[0].doc; Map snippets = @@ -1196,7 +1196,7 @@ public void testFieldMatcherTermQuery() throws Exception { // title { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); assertEquals("This is the title field.", snippets[0]); @@ -1216,7 +1216,7 @@ public void testFieldMatcherTermQuery() throws Exception { // text { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10); assertEquals(1, snippets.length); assertEquals( @@ -1240,7 +1240,7 @@ public void testFieldMatcherTermQuery() throws Exception { // category { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10); assertEquals(1, snippets.length); assertEquals("This is the category field.", snippets[0]); @@ -1286,7 +1286,7 @@ public void testFieldMatcherMultiTermQuery() throws Exception { // title { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); assertEquals("This is the title field.", snippets[0]); @@ -1306,7 +1306,7 @@ public void testFieldMatcherMultiTermQuery() throws Exception { // text { TopDocs topDocs = 
searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10); assertEquals(1, snippets.length); assertEquals( @@ -1330,7 +1330,7 @@ public void testFieldMatcherMultiTermQuery() throws Exception { // category { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10); assertEquals(1, snippets.length); assertEquals("This is the category field.", snippets[0]); @@ -1446,7 +1446,7 @@ private static void maskedFieldsTestCase( } Query query = boolQueryBuilder.build(); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); Function> maskedFieldsFunc = fieldName -> fieldName.equals(field) ? maskedFields : Collections.emptySet(); @@ -1470,7 +1470,7 @@ public void testMatchesSlopBug() throws IOException { UnifiedHighlighter highlighter = UnifiedHighlighter.builder(searcher, indexAnalyzer).build(); Query query = new PhraseQuery(2, "title", "this", "is", "the", "field"); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); if (highlighter.getFlags("title").contains(HighlightFlag.WEIGHT_MATCHES)) { @@ -1513,7 +1513,7 @@ public void testFieldMatcherPhraseQuery() throws Exception { // title { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); if (highlighterNoFieldMatch.getFlags("title").contains(HighlightFlag.WEIGHT_MATCHES)) { @@ -1545,7 +1545,7 @@ public void testFieldMatcherPhraseQuery() throws Exception { // text { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10); assertEquals(1, snippets.length); if (highlighterNoFieldMatch.getFlags("text").contains(HighlightFlag.WEIGHT_MATCHES)) { @@ -1584,7 +1584,7 @@ public void testFieldMatcherPhraseQuery() throws Exception { // category { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10); assertEquals(1, snippets.length); if (highlighterNoFieldMatch.getFlags("category").contains(HighlightFlag.WEIGHT_MATCHES)) { @@ -1682,7 +1682,7 @@ protected TokenStreamComponents createComponents(String fieldName) { }); Query query = new TermQuery(new Term("body", "highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); assertEquals("Just a test highlighting from postings. 
", snippets[0]); @@ -1755,7 +1755,7 @@ public Weight createWeight( BooleanClause.Occur.MUST) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); assertEquals("Test a one sentence document.", snippets[0]); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java index 76193a8e62a0..b5652fb0f2bb 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java @@ -96,7 +96,7 @@ public void testWildcards() throws Exception { UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer); Query query = new WildcardQuery(new Term("body", "te*")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = uhBuilder.build().highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -115,7 +115,7 @@ public void testWildcards() throws Exception { .add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.SHOULD) .build(); topDocs = searcher.search(bq, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = uhBuilder.withHandleMultiTermQuery(true).build().highlight("body", bq, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -156,7 +156,7 @@ public void testOnePrefix() throws Exception { // wrap in a BoostQuery to also show we see inside it Query query = new BoostQuery(new PrefixQuery(new Term("body", "te")), 2.0f); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -169,7 +169,7 @@ public void testOnePrefix() throws Exception { .add(new PrefixQuery(new Term("bogus", "te")), BooleanClause.Occur.SHOULD) .build(); topDocs = searcher.search(bq, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = uhBuilder.withFieldMatcher(null).build().highlight("body", bq, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -198,7 +198,7 @@ public void testOneRegexp() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); Query query = new RegexpQuery(new Term("body", "te.*")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -211,7 +211,7 @@ public void testOneRegexp() throws Exception { .add(new RegexpQuery(new Term("bogus", "te.*")), BooleanClause.Occur.SHOULD) .build(); topDocs = searcher.search(bq, 10, Sort.INDEXORDER); - assertEquals(2, 
topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = uhBuilder.withFieldMatcher(null).build().highlight("body", bq, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -240,7 +240,7 @@ public void testFuzzy() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); Query query = new FuzzyQuery(new Term("body", "tets"), 1); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -249,7 +249,7 @@ public void testFuzzy() throws Exception { // with prefix query = new FuzzyQuery(new Term("body", "tets"), 1, 2); topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -258,7 +258,7 @@ public void testFuzzy() throws Exception { // with zero max edits query = new FuzzyQuery(new Term("body", "test"), 0, 2); topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -271,7 +271,7 @@ public void testFuzzy() throws Exception { .add(new FuzzyQuery(new Term("bogus", "tets"), 1), BooleanClause.Occur.SHOULD) .build(); topDocs = searcher.search(bq, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = uhBuilder.withFieldMatcher(null).build().highlight("body", bq, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -300,7 +300,7 @@ public void testRanges() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); Query query = TermRangeQuery.newStringRange("body", "ta", "tf", true, true); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -309,7 +309,7 @@ public void testRanges() throws Exception { // null start query = TermRangeQuery.newStringRange("body", null, "tf", true, true); topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -318,7 +318,7 @@ public void testRanges() throws Exception { // null end query = TermRangeQuery.newStringRange("body", "ta", null, true, true); topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -327,7 +327,7 @@ public void testRanges() throws Exception { // exact start inclusive query = TermRangeQuery.newStringRange("body", "test", "tf", 
true, true); topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -336,7 +336,7 @@ public void testRanges() throws Exception { // exact end inclusive query = TermRangeQuery.newStringRange("body", "ta", "test", true, true); topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -351,7 +351,7 @@ public void testRanges() throws Exception { BooleanClause.Occur.SHOULD) .build(); topDocs = searcher.search(bq, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", bq, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -366,7 +366,7 @@ public void testRanges() throws Exception { BooleanClause.Occur.SHOULD) .build(); topDocs = searcher.search(bq, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", bq, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -381,7 +381,7 @@ public void testRanges() throws Exception { BooleanClause.Occur.SHOULD) .build(); topDocs = searcher.search(bq, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = uhBuilder.withFieldMatcher(null).build().highlight("body", bq, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -412,7 +412,7 @@ public void testWildcardInBoolean() throws Exception { .add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -425,7 +425,7 @@ public void testWildcardInBoolean() throws Exception { .add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.MUST_NOT) .build(); topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -457,7 +457,7 @@ public void testWildcardInFiltered() throws Exception { .add(new TermQuery(new Term("body", "test")), BooleanClause.Occur.FILTER) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -485,7 +485,7 @@ public void testWildcardInConstantScore() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); ConstantScoreQuery query = new ConstantScoreQuery(new WildcardQuery(new Term("body", "te*"))); TopDocs topDocs = 
searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -515,7 +515,7 @@ public void testWildcardInDisjunctionMax() throws Exception { new DisjunctionMaxQuery( Collections.singleton(new WildcardQuery(new Term("body", "te*"))), 0); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -543,7 +543,7 @@ public void testSpanWildcard() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*"))); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -573,7 +573,7 @@ public void testSpanOr() throws Exception { new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*"))); Query query = new SpanOrQuery(new SpanQuery[] {childQuery}); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -603,7 +603,7 @@ public void testSpanNear() throws Exception { new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*"))); Query query = new SpanNearQuery(new SpanQuery[] {childQuery, childQuery}, 0, false); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -633,7 +633,7 @@ public void testSpanNot() throws Exception { SpanQuery exclude = new SpanTermQuery(new Term("body", "bogus")); Query query = new SpanNotQuery(include, exclude); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -663,7 +663,7 @@ public void testSpanPositionCheck() throws Exception { new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*"))); Query query = new SpanFirstQuery(childQuery, 1000000); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -698,7 +698,7 @@ public void testWhichMTQMatched() throws Exception { .add(new FuzzyQuery(new Term("body", "zentence~")), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - 
assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); @@ -755,7 +755,7 @@ public Object format(Passage[] passages, String content) { } }; - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); @@ -1196,7 +1196,7 @@ private void highlightAndAssertMatch( String fieldVal) throws IOException { TopDocs topDocs = searcher.search(query, 1); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight(field, query, topDocs); assertEquals("[" + fieldVal + "]", Arrays.toString(snippets)); } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java index 83ce335e5a91..43fb12fdb8ec 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java @@ -203,23 +203,7 @@ public String format(Passage[] passages, String content) { } } - static class Pair { - final int start; - final int end; - - Pair(int start, int end) { - this.start = start; - this.end = end; - } - - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + end; - result = prime * result + start; - return result; - } + record Pair(int start, int end) { @Override public boolean equals(Object obj) { @@ -289,7 +273,7 @@ protected PassageScorer getScorer(String field) { }; Query query = new TermQuery(new Term("body", "test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 1); assertEquals(1, snippets.length); assertTrue(snippets[0].startsWith("This test is a better test")); @@ -347,7 +331,7 @@ protected PassageScorer getScorer(String field) { .add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 1); assertEquals(1, snippets.length); assertTrue(snippets[0].startsWith("On the other hand")); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java index 2b553d9af48e..e2b559b4a20c 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java @@ -476,7 +476,7 @@ public void testMultiValued() throws IOException { .add(phraseQuery, BooleanClause.Occur.MUST) // must match and it will .build(); topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); snippets = highlighter.highlight("body", query, topDocs, 2); if 
(highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { assertEquals("one bravo three... four bravo six", snippets[0]); @@ -594,7 +594,7 @@ protected Collection preSpanQueryRewrite(Query query) { .add(proximityBoostingQuery, BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(totalQuery, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", totalQuery, topDocs); assertArrayEquals( new String[] { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java index 4ae9eba826f5..1cd57f394180 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java @@ -96,7 +96,7 @@ public void testBasics() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new IntervalQuery("body", Intervals.term("highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("Just a test highlighting from postings. ", snippets[0]); @@ -156,7 +156,7 @@ private String[] formatWithMatchExceedingContentLength(String bodyText) throws I Query query = new IntervalQuery("body", Intervals.term("test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer).withMaxLength(maxLength); @@ -185,7 +185,7 @@ public void testHighlightLastWord() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new IntervalQuery("body", Intervals.term("test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); assertEquals("This is a test", snippets[0]); @@ -213,7 +213,7 @@ public void testOneSentence() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new IntervalQuery("body", Intervals.term("test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("This is a test.", snippets[0]); @@ -244,7 +244,7 @@ public void testMaxLengthWithMultivalue() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); Query query = new IntervalQuery("body", Intervals.term("field")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 10); assertEquals(1, snippets.length); String highlightedValue 
= "This is a multivalued field. Sentencetwo field."; @@ -276,7 +276,7 @@ public void testMultipleTerms() throws Exception { Intervals.or( Intervals.term("highlighting"), Intervals.term("just"), Intervals.term("first"))); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(2, snippets.length); assertEquals("Just a test highlighting from postings. ", snippets[0]); @@ -304,7 +304,7 @@ public void testMultiplePassages() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new IntervalQuery("body", Intervals.term("test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(2, snippets.length); assertEquals( @@ -337,7 +337,7 @@ public void testBuddhism() throws Exception { IndexSearcher searcher = newSearcher(ir); Query query = new IntervalQuery("body", Intervals.phrase("buddhist", "origins")); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer).withHighlightPhrasesStrictly(false); UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); @@ -367,7 +367,7 @@ public void testCuriousGeorge() throws Exception { IndexSearcher searcher = newSearcher(ir); Query query = new IntervalQuery("body", Intervals.phrase("curious", "george")); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer).withHighlightPhrasesStrictly(false); UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); @@ -407,7 +407,7 @@ public void testCambridgeMA() throws Exception { Intervals.term("square"), Intervals.term("massachusetts"))); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer) .withMaxLength(Integer.MAX_VALUE - 1); @@ -439,7 +439,7 @@ public void testPassageRanking() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new IntervalQuery("body", Intervals.term("test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(1, snippets.length); assertEquals( @@ -465,7 +465,7 @@ public void testBooleanMustNot() throws Exception { new IntervalQuery( "body", Intervals.notContaining(Intervals.term("terms"), Intervals.term("both"))); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, indexAnalyzer) .withMaxLength(Integer.MAX_VALUE - 1); @@ -499,7 +499,7 @@ public void testHighlightAllText() throws Exception { UnifiedHighlighter highlighter = 
randomUnifiedHighlighter(uhBuilder); Query query = new IntervalQuery("body", Intervals.term("test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(1, snippets.length); assertEquals( @@ -528,7 +528,7 @@ public void testSpecificDocIDs() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer); Query query = new IntervalQuery("body", Intervals.term("highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); ScoreDoc[] hits = topDocs.scoreDocs; int[] docIDs = new int[2]; docIDs[0] = hits[0].doc; @@ -579,7 +579,7 @@ protected BreakIterator getBreakIterator(String field) { Query query = new IntervalQuery("body", Intervals.term("test")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(1, snippets.length); assertEquals( @@ -839,7 +839,7 @@ public void testMultipleDocs() throws Exception { UnifiedHighlighter highlighter = randomUnifiedHighlighter(uhBuilder); Query query = new IntervalQuery("body", Intervals.term("answer")); TopDocs hits = searcher.search(query, numDocs); - assertEquals(numDocs, hits.totalHits.value); + assertEquals(numDocs, hits.totalHits.value()); String[] snippets = highlighter.highlight("body", query, hits); assertEquals(numDocs, snippets.length); @@ -877,7 +877,7 @@ public void testEncode() throws Exception { Query query = new IntervalQuery("body", Intervals.term("highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); assertEquals( @@ -920,7 +920,7 @@ public String[] format(Passage[] passages, String content) { Query query = new IntervalQuery("body", Intervals.term("highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); int[] docIDs = new int[1]; docIDs[0] = topDocs.scoreDocs[0].doc; Map snippets = @@ -975,7 +975,7 @@ public void testMatchesSlopBug() throws IOException { Intervals.term("the"), Intervals.term("field")))); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); // All flags are enabled. @@ -1020,7 +1020,7 @@ protected TokenStreamComponents createComponents(String fieldName) { }); Query query = new IntervalQuery("body", Intervals.term("highlighting")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); String[] snippets = highlighter.highlight("body", query, topDocs); assertEquals(1, snippets.length); assertEquals("Just a test highlighting from postings. 
", snippets[0]); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java index 84ea233affe9..edb96e5c6bab 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java @@ -96,7 +96,7 @@ public void testTermVecButNoPositions(String aaa, String bbb, String indexed, St .add(new TermQuery(new Term("body", bbb)), BooleanClause.Occur.MUST) .build(); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); UnifiedHighlighter highlighter = UnifiedHighlighter.builder(searcher, indexAnalyzer).build(); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals(1, snippets.length); @@ -142,7 +142,7 @@ public void testFetchTermVecsOncePerDoc() throws IOException { } BooleanQuery query = queryBuilder.build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(numDocs, topDocs.totalHits.value); + assertEquals(numDocs, topDocs.totalHits.value()); Map fieldToSnippets = highlighter.highlightFields(fields.toArray(new String[numTvFields]), query, topDocs); String[] expectedSnippetsByDoc = new String[numDocs]; diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java index d12a21091015..55146300a49e 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java @@ -175,14 +175,14 @@ protected FieldHighlighter getFieldHighlighter( OffsetSource offsetSource = getOptimizedOffsetSource(components); // test all is accessible - components.getField(); - components.getFieldMatcher(); - components.getQuery(); - components.getTerms(); - components.getPhraseHelper(); - components.getAutomata(); + components.field(); + components.fieldMatcher(); + components.query(); + components.terms(); + components.phraseHelper(); + components.automata(); components.hasUnrecognizedQueryPart(); - components.getHighlightFlags(); + components.highlightFlags(); return new CustomFieldHighlighter( field, diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestFastVectorHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestFastVectorHighlighter.java index 81e8cef9208a..74779697eee9 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestFastVectorHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestFastVectorHighlighter.java @@ -389,7 +389,7 @@ public void testCommonTermsQueryHighlight() throws IOException { IndexReader reader = DirectoryReader.open(writer); IndexSearcher searcher = newSearcher(reader); TopDocs hits = searcher.search(query, 10); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, 1, "field", 1000, 1); assertEquals( 
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestSimpleFragmentsBuilder.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestSimpleFragmentsBuilder.java
index e4a949cc7299..b6ff90c47acb 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestSimpleFragmentsBuilder.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestSimpleFragmentsBuilder.java
@@ -334,12 +334,5 @@ private String getRandomValue(
     return value;
   }
 
-  private static class Doc {
-
-    final String[][] fieldValues;
-
-    private Doc(String[][] fieldValues) {
-      this.fieldValues = fieldValues;
-    }
-  }
+  private record Doc(String[][] fieldValues) {}
 }
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestWeightedFragListBuilder.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestWeightedFragListBuilder.java
index 9fcf4d672981..ad60e995308a 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestWeightedFragListBuilder.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestWeightedFragListBuilder.java
@@ -56,7 +56,7 @@ private void testCase(
     float totalSubInfoBoost = 0;
     for (WeightedFragInfo info : ffl.getFragInfos()) {
       for (SubInfo subInfo : info.getSubInfos()) {
-        totalSubInfoBoost += subInfo.getBoost();
+        totalSubInfoBoost += subInfo.boost();
       }
     }
     assertEquals(expectedTotalSubInfoBoost, totalSubInfoBoost, .0000001);
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenByteKnnVectorQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenByteKnnVectorQuery.java
index 456a885b49a0..45cb8b9c88fa 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenByteKnnVectorQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenByteKnnVectorQuery.java
@@ -154,7 +154,14 @@ protected TopDocs approximateSearch(
 
   @Override
   public String toString(String field) {
-    return getClass().getSimpleName() + ":" + this.field + "[" + query[0] + ",...][" + k + "]";
+    StringBuilder buffer = new StringBuilder();
+    buffer.append(getClass().getSimpleName() + ":");
+    buffer.append(this.field + "[" + query[0] + ",...]");
+    buffer.append("[" + k + "]");
+    if (this.filter != null) {
+      buffer.append("[" + this.filter + "]");
+    }
+    return buffer.toString();
   }
 
   @Override
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java
index 7b5a656d1414..9c44a2f78566 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java
@@ -153,7 +153,14 @@ protected TopDocs approximateSearch(
 
   @Override
   public String toString(String field) {
-    return getClass().getSimpleName() + ":" + this.field + "[" + query[0] + ",...][" + k + "]";
+    StringBuilder buffer = new StringBuilder();
+    buffer.append(getClass().getSimpleName() + ":");
+    buffer.append(this.field + "[" + query[0] + ",...]");
+    buffer.append("[" + k + "]");
+    if (this.filter != null) {
+      buffer.append("[" + this.filter + "]");
+    }
+    return buffer.toString();
   }
 
   @Override
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java
index afab8ac57d90..15227a503429 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollector.java
@@ -278,15 +278,8 @@ private int downHeapWithoutCacheUpdate(int i) {
   }
 
   /** Keeps track of child node, parent node, and the stored score. */
-  private static class ParentChildScore implements Comparable<ParentChildScore> {
-    private final int parent, child;
-    private final float score;
-
-    ParentChildScore(int child, int parent, float score) {
-      this.child = child;
-      this.parent = parent;
-      this.score = score;
-    }
+  private record ParentChildScore(int child, int parent, float score)
+      implements Comparable<ParentChildScore> {
 
     @Override
     public int compareTo(ParentChildScore o) {
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
index 729feb19b9c4..f781fc6a29eb 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
@@ -95,7 +95,7 @@ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOExc
   }
 
   @Override
-  public long getTermsCount() throws IOException {
+  public long getTermsCount() {
     return terms.size();
   }
 
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
index db21e17ba089..0d92b324e4bd 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
@@ -16,6 +16,7 @@
  */
 package org.apache.lucene.search.join;
 
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 import static org.apache.lucene.search.ScoreMode.COMPLETE;
 
 import java.io.IOException;
@@ -24,20 +25,25 @@
 import java.util.Locale;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.BulkScorer;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.FilterLeafCollector;
 import org.apache.lucene.search.FilterWeight;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.LeafCollector;
 import org.apache.lucene.search.Matches;
 import org.apache.lucene.search.MatchesUtils;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
+import org.apache.lucene.search.Scorable;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.ScorerSupplier;
 import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
 
 /**
  * This query requires that you index children and parent docs as a single block, using the {@link
@@ -101,12 +107,17 @@ public Weight createWeight(
               .rewrite(new ConstantScoreQuery(childQuery))
               .createWeight(searcher, weightScoreMode, 0f);
     } else {
-      // if the score is needed we force the collection mode to COMPLETE because the child query
-      // cannot skip
-      // non-competitive documents.
+      // if the score is needed and the score mode is not max, we force the collection mode to
+      // COMPLETE because the child query cannot skip non-competitive documents.
+      // weightScoreMode.needsScores() will always be true here, but keep the check to make the
+      // logic clearer.
       childWeight =
           childQuery.createWeight(
-              searcher, weightScoreMode.needsScores() ? COMPLETE : weightScoreMode, boost);
+              searcher,
+              weightScoreMode.needsScores() && childScoreMode != ScoreMode.Max
+                  ? COMPLETE
+                  : weightScoreMode,
+              boost);
     }
     return new BlockJoinWeight(this, childWeight, parentsFilter, childScoreMode);
   }
@@ -151,10 +162,29 @@ public Scorer get(long leadCost) throws IOException {
         return new BlockJoinScorer(childScorerSupplier.get(leadCost), parents, scoreMode);
       }
 
+      @Override
+      public BulkScorer bulkScorer() throws IOException {
+        if (scoreMode == ScoreMode.None) {
+          // BlockJoinBulkScorer evaluates all child hits exhaustively, but when scoreMode is None
+          // we only need to evaluate a single child doc per parent. In this case, use the default
+          // bulk scorer instead, which uses BlockJoinScorer to iterate over child hits.
+          // BlockJoinScorer is optimized to skip child hit evaluation when scoreMode is None.
+          return super.bulkScorer();
+        }
+        return new BlockJoinBulkScorer(childScorerSupplier.bulkScorer(), parents, scoreMode);
+      }
+
       @Override
       public long cost() {
         return childScorerSupplier.cost();
       }
+
+      @Override
+      public void setTopLevelScoringClause() throws IOException {
+        if (scoreMode == ScoreMode.Max) {
+          childScorerSupplier.setTopLevelScoringClause();
+        }
+      }
     };
   }
 
@@ -263,6 +293,54 @@ public float matchCost() {
     }
   }
 
+  private static class Score extends Scorable {
+    private final ScoreMode scoreMode;
+    private double score;
+    private int freq;
+
+    public Score(ScoreMode scoreMode) {
+      this.scoreMode = scoreMode;
+      this.score = 0;
+      this.freq = 0;
+    }
+
+    public void reset(Scorable firstChildScorer) throws IOException {
+      score = scoreMode == ScoreMode.None ? 0 : firstChildScorer.score();
+      freq = 1;
+    }
+
+    public void addChildScore(Scorable childScorer) throws IOException {
+      final float childScore = scoreMode == ScoreMode.None ?
0 : childScorer.score(); + freq++; + switch (scoreMode) { + case Total: + case Avg: + score += childScore; + break; + case Min: + score = Math.min(score, childScore); + break; + case Max: + score = Math.max(score, childScore); + break; + case None: + break; + default: + throw new AssertionError(); + } + } + + @Override + public float score() { + assert freq > 0; + double score = this.score; + if (scoreMode == ScoreMode.Avg) { + score /= freq; + } + return (float) score; + } + } + static class BlockJoinScorer extends Scorer { private final Scorer childScorer; private final BitSet parentBits; @@ -271,13 +349,14 @@ static class BlockJoinScorer extends Scorer { private final TwoPhaseIterator childTwoPhase; private final ParentApproximation parentApproximation; private final ParentTwoPhase parentTwoPhase; - private float score; + private final Score parentScore; public BlockJoinScorer(Scorer childScorer, BitSet parentBits, ScoreMode scoreMode) { // System.out.println("Q.init firstChildDoc=" + firstChildDoc); this.parentBits = parentBits; this.childScorer = childScorer; this.scoreMode = scoreMode; + this.parentScore = new Score(scoreMode); childTwoPhase = childScorer.twoPhaseIterator(); if (childTwoPhase == null) { childApproximation = childScorer.iterator(); @@ -317,8 +396,7 @@ public int docID() { @Override public float score() throws IOException { - setScoreAndFreq(); - return score; + return scoreChildDocs(); } @Override @@ -331,39 +409,31 @@ public float getMaxScore(int upTo) throws IOException { @Override public void setMinCompetitiveScore(float minScore) throws IOException { - if (scoreMode == ScoreMode.None) { + if (scoreMode == ScoreMode.None || scoreMode == ScoreMode.Max) { childScorer.setMinCompetitiveScore(minScore); } } - private void setScoreAndFreq() throws IOException { + private float scoreChildDocs() throws IOException { if (childApproximation.docID() >= parentApproximation.docID()) { - return; + return parentScore.score(); } - double score = scoreMode == ScoreMode.None ? 0 : childScorer.score(); - int freq = 1; - while (childApproximation.nextDoc() < parentApproximation.docID()) { - if (childTwoPhase == null || childTwoPhase.matches()) { - final float childScore = scoreMode == ScoreMode.None ? 0 : childScorer.score(); - freq += 1; - switch (scoreMode) { - case Total: - case Avg: - score += childScore; - break; - case Min: - score = Math.min(score, childScore); - break; - case Max: - score = Math.max(score, childScore); - break; - case None: - break; - default: - throw new AssertionError(); + + float score = 0; + if (scoreMode != ScoreMode.None) { + parentScore.reset(childScorer); + while (childApproximation.nextDoc() < parentApproximation.docID()) { + if (childTwoPhase == null || childTwoPhase.matches()) { + parentScore.addChildScore(childScorer); } } + + score = parentScore.score(); } + + // TODO: When score mode is None, this check is broken because the child approximation is not + // advanced and will therefore never match the parent approximation at this point in + // execution. Fix this error check when score mode is None. 
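The extracted `Score` helper above centralizes how a parent's score is derived from its matching children, and the same rules are reused by the new bulk scorer further down. As a rough, self-contained sketch of those aggregation semantics (not part of the patch; the class and method names are invented), children scoring {2, 1, 3} resolve to Total=6, Avg=2, Min=1, Max=3, and None always yields 0:

```java
import java.util.List;
import org.apache.lucene.search.join.ScoreMode;

final class JoinScoreModeDemo {
  // Total/Avg accumulate a sum (Avg divides by the child count at the end),
  // Min/Max keep a running extreme, and None ignores child scores entirely.
  static double aggregate(List<Float> childScores, ScoreMode mode) {
    if (childScores.isEmpty() || mode == ScoreMode.None) {
      return 0;
    }
    double score = childScores.get(0);
    for (int i = 1; i < childScores.size(); i++) {
      float s = childScores.get(i);
      switch (mode) {
        case Total, Avg -> score += s;
        case Min -> score = Math.min(score, s);
        case Max -> score = Math.max(score, s);
        default -> throw new AssertionError();
      }
    }
    return mode == ScoreMode.Avg ? score / childScores.size() : score;
  }
}
```

Because the Max-mode parent score is bounded by the single best child, a top-hits threshold set on the parent can be pushed down safely, which is why the patch also forwards `setMinCompetitiveScore` and `setTopLevelScoringClause` to the child scorer when the join score mode is Max.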
if (childApproximation.docID() == parentApproximation.docID() && (childTwoPhase == null || childTwoPhase.matches())) { throw new IllegalStateException( @@ -374,10 +444,8 @@ private void setScoreAndFreq() throws IOException { + ", " + childScorer.getClass()); } - if (scoreMode == ScoreMode.Avg) { - score /= freq; - } - this.score = (float) score; + + return score; } /* @@ -428,6 +496,120 @@ private String formatScoreExplanation(int matches, int start, int end, ScoreMode } } + private abstract static class BatchAwareLeafCollector extends FilterLeafCollector { + public BatchAwareLeafCollector(LeafCollector in) { + super(in); + } + + public void endBatch() throws IOException {} + } + + private static class BlockJoinBulkScorer extends BulkScorer { + private final BulkScorer childBulkScorer; + private final ScoreMode scoreMode; + private final BitSet parents; + private final int parentsLength; + + public BlockJoinBulkScorer(BulkScorer childBulkScorer, BitSet parents, ScoreMode scoreMode) { + this.childBulkScorer = childBulkScorer; + this.scoreMode = scoreMode; + this.parents = parents; + this.parentsLength = parents.length(); + } + + @Override + public int score(LeafCollector collector, Bits acceptDocs, int min, int max) + throws IOException { + if (min == max) { + return scoringCompleteCheck(max, max); + } + + // Subtract one because max is exclusive w.r.t. score but inclusive w.r.t prevSetBit + int lastParent = parents.prevSetBit(Math.min(parentsLength, max) - 1); + int prevParent = min == 0 ? -1 : parents.prevSetBit(min - 1); + if (lastParent == prevParent) { + // No parent docs in this range. + return scoringCompleteCheck(max, max); + } + + BatchAwareLeafCollector wrappedCollector = wrapCollector(collector); + childBulkScorer.score(wrappedCollector, acceptDocs, prevParent + 1, lastParent + 1); + wrappedCollector.endBatch(); + + return scoringCompleteCheck(lastParent + 1, max); + } + + private int scoringCompleteCheck(int innerMax, int returnedMax) { + // If we've scored the last parent in the bit set, return NO_MORE_DOCS to indicate we are done + // scoring + return innerMax >= parentsLength ? NO_MORE_DOCS : returnedMax; + } + + @Override + public long cost() { + return childBulkScorer.cost(); + } + + private BatchAwareLeafCollector wrapCollector(LeafCollector collector) { + return new BatchAwareLeafCollector(collector) { + private final Score currentParentScore = new Score(scoreMode); + private int currentParent = -1; + private Scorable scorer = null; + + @Override + public void setScorer(Scorable scorer) throws IOException { + assert scorer != null; + this.scorer = scorer; + + super.setScorer( + new Scorable() { + @Override + public float score() { + return currentParentScore.score(); + } + + @Override + public void setMinCompetitiveScore(float minScore) throws IOException { + if (scoreMode == ScoreMode.None || scoreMode == ScoreMode.Max) { + scorer.setMinCompetitiveScore(minScore); + } + } + }); + } + + @Override + public void collect(int doc) throws IOException { + if (doc > currentParent) { + // Emit the current parent and setup scoring for the next parent + if (currentParent >= 0) { + in.collect(currentParent); + } + + currentParent = parents.nextSetBit(doc); + currentParentScore.reset(scorer); + } else if (doc == currentParent) { + throw new IllegalStateException( + "Child query must not match same docs with parent filter. " + + "Combine them as must clauses (+) to find a problem doc. 
" + + "docId=" + + doc + + ", " + + childBulkScorer.getClass()); + } else { + currentParentScore.addChildScore(scorer); + } + } + + @Override + public void endBatch() throws IOException { + if (currentParent >= 0) { + in.collect(currentParent); + } + } + }; + } + } + @Override public Query rewrite(IndexSearcher indexSearcher) throws IOException { final Query childRewrite = childQuery.rewrite(indexSearcher); diff --git a/lucene/join/src/test/org/apache/lucene/search/join/ParentBlockJoinKnnVectorQueryTestCase.java b/lucene/join/src/test/org/apache/lucene/search/join/ParentBlockJoinKnnVectorQueryTestCase.java index 7cec3b715920..03fad9b02a71 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/ParentBlockJoinKnnVectorQueryTestCase.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/ParentBlockJoinKnnVectorQueryTestCase.java @@ -113,7 +113,7 @@ d, newIndexWriterConfig().setMergePolicy(newMergePolicy(random(), false)))) { new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); Query query = getParentJoinKnnQuery("field", new float[] {2, 2}, null, 3, parentFilter); TopDocs topDocs = searcher.search(query, 3); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); assertEquals(0, topDocs.scoreDocs.length); // Test with match_all filter and large k to test exact search @@ -121,7 +121,7 @@ d, newIndexWriterConfig().setMergePolicy(newMergePolicy(random(), false)))) { getParentJoinKnnQuery( "field", new float[] {2, 2}, new MatchAllDocsQuery(), 10, parentFilter); topDocs = searcher.search(query, 3); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); assertEquals(0, topDocs.scoreDocs.length); } } @@ -154,7 +154,7 @@ d, newIndexWriterConfig().setMergePolicy(newMergePolicy(random(), false)))) { new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); Query query = getParentJoinKnnQuery("field", new float[] {2, 2}, null, 3, parentFilter); TopDocs topDocs = searcher.search(query, 3); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); assertEquals(0, topDocs.scoreDocs.length); // Test with match_all filter and large k to test exact search @@ -162,7 +162,7 @@ d, newIndexWriterConfig().setMergePolicy(newMergePolicy(random(), false)))) { getParentJoinKnnQuery( "field", new float[] {2, 2}, new MatchAllDocsQuery(), 10, parentFilter); topDocs = searcher.search(query, 3); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); assertEquals(0, topDocs.scoreDocs.length); } } @@ -177,7 +177,7 @@ public void testFilterWithNoVectorMatches() throws IOException { BitSetProducer parentFilter = parentFilter(reader); Query kvq = getParentJoinKnnQuery("field", new float[] {1, 2}, filter, 2, parentFilter); TopDocs topDocs = searcher.search(kvq, 3); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); } } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java index 77f00818dec0..6e333c0cbd70 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java @@ -148,7 +148,7 @@ public void testEmptyChildFilter() throws Exception { fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST)); fullQuery.add(new BooleanClause(new MatchAllDocsQuery(), Occur.MUST)); TopDocs topDocs = s.search(fullQuery.build(), 
2); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); assertEquals( asSet("Lisa", "Frank"), asSet( @@ -159,14 +159,14 @@ public void testEmptyChildFilter() throws Exception { new ParentChildrenBlockJoinQuery( parentsFilter, childQuery.build(), topDocs.scoreDocs[0].doc); TopDocs matchingChildren = s.search(childrenQuery, 1); - assertEquals(1, matchingChildren.totalHits.value); + assertEquals(1, matchingChildren.totalHits.value()); assertEquals("java", s.storedFields().document(matchingChildren.scoreDocs[0].doc).get("skill")); childrenQuery = new ParentChildrenBlockJoinQuery( parentsFilter, childQuery.build(), topDocs.scoreDocs[1].doc); matchingChildren = s.search(childrenQuery, 1); - assertEquals(1, matchingChildren.totalHits.value); + assertEquals(1, matchingChildren.totalHits.value()); assertEquals("java", s.storedFields().document(matchingChildren.scoreDocs[0].doc).get("skill")); r.close(); @@ -222,7 +222,7 @@ public void testBQShouldJoinedChild() throws Exception { fullQuery.add(new BooleanClause(childJoinQuery, Occur.SHOULD)); final TopDocs topDocs = s.search(fullQuery.build(), 2); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); assertEquals( asSet("Lisa", "Frank"), asSet( @@ -233,14 +233,14 @@ public void testBQShouldJoinedChild() throws Exception { new ParentChildrenBlockJoinQuery( parentsFilter, childQuery.build(), topDocs.scoreDocs[0].doc); TopDocs matchingChildren = s.search(childrenQuery, 1); - assertEquals(1, matchingChildren.totalHits.value); + assertEquals(1, matchingChildren.totalHits.value()); assertEquals("java", s.storedFields().document(matchingChildren.scoreDocs[0].doc).get("skill")); childrenQuery = new ParentChildrenBlockJoinQuery( parentsFilter, childQuery.build(), topDocs.scoreDocs[1].doc); matchingChildren = s.search(childrenQuery, 1); - assertEquals(1, matchingChildren.totalHits.value); + assertEquals(1, matchingChildren.totalHits.value()); assertEquals("java", s.storedFields().document(matchingChildren.scoreDocs[0].doc).get("skill")); r.close(); @@ -281,7 +281,7 @@ public void testSimpleKnn() throws Exception { "vector", new float[] {4f, 4f, 4f}, null, 3, parentsFilter); TopDocs topDocs = s.search(childKnnJoin, 5); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); Document childDoc = s.storedFields().document(topDocs.scoreDocs[0].doc); assertEquals("parent1", childDoc.get("my_parent_id")); assertEquals( @@ -346,7 +346,7 @@ public void testSimple() throws Exception { TopDocs topDocs = s.search(fullQuery.build(), 1); // assertEquals(1, results.totalHitCount); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); Document parentDoc = s.storedFields().document(topDocs.scoreDocs[0].doc); assertEquals("Lisa", parentDoc.get("name")); @@ -354,7 +354,7 @@ public void testSimple() throws Exception { new ParentChildrenBlockJoinQuery( parentsFilter, childQuery.build(), topDocs.scoreDocs[0].doc); TopDocs matchingChildren = s.search(childrenQuery, 1); - assertEquals(1, matchingChildren.totalHits.value); + assertEquals(1, matchingChildren.totalHits.value()); assertEquals("java", s.storedFields().document(matchingChildren.scoreDocs[0].doc).get("skill")); // System.out.println("TEST: now test up"); @@ -367,7 +367,7 @@ public void testSimple() throws Exception { // System.out.println("FULL: " + fullChildQuery); TopDocs hits = s.search(fullChildQuery.build(), 10); - assertEquals(1, hits.totalHits.value); + assertEquals(1, 
hits.totalHits.value()); Document childDoc = s.storedFields().document(hits.scoreDocs[0].doc); // System.out.println("CHILD = " + childDoc + " docID=" + hits.scoreDocs[0].doc); assertEquals("java", childDoc.get("skill")); @@ -467,7 +467,7 @@ public void testSimpleFilter() throws Exception { .add(parentQuery, Occur.FILTER) .build(); TopDocs ukOnly = s.search(query, 1); - assertEquals("has filter - single passed", 1, ukOnly.totalHits.value); + assertEquals("has filter - single passed", 1, ukOnly.totalHits.value()); assertEquals("Lisa", r.storedFields().document(ukOnly.scoreDocs[0].doc).get("name")); query = @@ -477,7 +477,7 @@ public void testSimpleFilter() throws Exception { .build(); // looking for US candidates TopDocs usThen = s.search(query, 1); - assertEquals("has filter - single passed", 1, usThen.totalHits.value); + assertEquals("has filter - single passed", 1, usThen.totalHits.value()); assertEquals("Frank", r.storedFields().document(usThen.scoreDocs[0].doc).get("name")); TermQuery us = new TermQuery(new Term("country", "United States")); @@ -889,7 +889,7 @@ public void testRandom() throws Exception { if (VERBOSE) { System.out.println( "\nTEST: normal index gets " - + results.totalHits.value + + results.totalHits.value() + " hits; sort=" + parentAndChildSort); final ScoreDoc[] hits = results.scoreDocs; @@ -958,7 +958,7 @@ public void testRandom() throws Exception { } } - if (results.totalHits.value == 0) { + if (results.totalHits.value() == 0) { assertEquals(0, joinResults.size()); } else { compareHits(r, joinR, results, joinResults); @@ -1113,7 +1113,7 @@ public void testRandom() throws Exception { } final TopDocs results2 = s.search(childQuery2, r.numDocs(), childSort2); if (VERBOSE) { - System.out.println(" " + results2.totalHits.value + " totalHits:"); + System.out.println(" " + results2.totalHits.value() + " totalHits:"); for (ScoreDoc sd : results2.scoreDocs) { final Document doc = s.storedFields().document(sd.doc); System.out.println( @@ -1133,7 +1133,7 @@ public void testRandom() throws Exception { } TopDocs joinResults2 = joinS.search(childJoinQuery2, joinR.numDocs(), childSort2); if (VERBOSE) { - System.out.println(" " + joinResults2.totalHits.value + " totalHits:"); + System.out.println(" " + joinResults2.totalHits.value() + " totalHits:"); for (ScoreDoc sd : joinResults2.scoreDocs) { final Document doc = joinS.storedFields().document(sd.doc); final Document parentDoc = getParentDoc(joinR, parentsFilter, sd.doc); @@ -1158,7 +1158,7 @@ public void testRandom() throws Exception { private void compareChildHits( IndexReader r, IndexReader joinR, TopDocs results, TopDocs joinResults) throws Exception { - assertEquals(results.totalHits.value, joinResults.totalHits.value); + assertEquals(results.totalHits.value(), joinResults.totalHits.value()); assertEquals(results.scoreDocs.length, joinResults.scoreDocs.length); for (int hitCount = 0; hitCount < results.scoreDocs.length; hitCount++) { ScoreDoc hit = results.scoreDocs[hitCount]; @@ -1252,7 +1252,7 @@ public void testMultiChildTypes() throws Exception { fullQuery.add(new BooleanClause(childQualificationJoinQuery, Occur.MUST)); final TopDocs topDocs = s.search(fullQuery.build(), 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); Document parentDoc = s.storedFields().document(topDocs.scoreDocs[0].doc); assertEquals("Lisa", parentDoc.get("name")); @@ -1260,14 +1260,14 @@ public void testMultiChildTypes() throws Exception { new ParentChildrenBlockJoinQuery( parentsFilter, 
childJobQuery.build(), topDocs.scoreDocs[0].doc); TopDocs matchingChildren = s.search(childrenQuery, 1); - assertEquals(1, matchingChildren.totalHits.value); + assertEquals(1, matchingChildren.totalHits.value()); assertEquals("java", s.storedFields().document(matchingChildren.scoreDocs[0].doc).get("skill")); childrenQuery = new ParentChildrenBlockJoinQuery( parentsFilter, childQualificationQuery.build(), topDocs.scoreDocs[0].doc); matchingChildren = s.search(childrenQuery, 1); - assertEquals(1, matchingChildren.totalHits.value); + assertEquals(1, matchingChildren.totalHits.value()); assertEquals( "maths", s.storedFields().document(matchingChildren.scoreDocs[0].doc).get("qualification")); @@ -1453,7 +1453,7 @@ public void testAdvanceSingleDeletedParentNoChild() throws Exception { ToChildBlockJoinQuery parentJoinQuery = new ToChildBlockJoinQuery(parentQuery, parentsFilter); TopDocs topdocs = s.search(parentJoinQuery, 3); - assertEquals(1, topdocs.totalHits.value); + assertEquals(1, topdocs.totalHits.value()); r.close(); dir.close(); @@ -1735,7 +1735,7 @@ protected double score(BasicStats stats, double freq, double docLen) { Query query = new ToParentBlockJoinQuery(new TermQuery(new Term("foo", "bar")), parents, scoreMode); TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits.value); + assertEquals(1, topDocs.totalHits.value()); assertEquals(3, topDocs.scoreDocs[0].doc); float expectedScore; switch (scoreMode) { diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinBulkScorer.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinBulkScorer.java new file mode 100644 index 000000000000..b9580331347f --- /dev/null +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinBulkScorer.java @@ -0,0 +1,495 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.join; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.BulkScorer; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LeafCollector; +import org.apache.lucene.search.Scorable; +import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.Weight; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestBlockJoinBulkScorer extends LuceneTestCase { + private static final String TYPE_FIELD_NAME = "type"; + private static final String VALUE_FIELD_NAME = "value"; + private static final String PARENT_FILTER_VALUE = "parent"; + private static final String CHILD_FILTER_VALUE = "child"; + + private enum MatchValue { + MATCH_A("A", 1), + MATCH_B("B", 2), + MATCH_C("C", 3), + MATCH_D("D", 4); + + private static final List VALUES = List.of(values()); + + private final String text; + private final int score; + + MatchValue(String text, int score) { + this.text = text; + this.score = score; + } + + public String getText() { + return text; + } + + public int getScore() { + return score; + } + + @Override + public String toString() { + return text; + } + + public static MatchValue random() { + return RandomPicks.randomFrom(LuceneTestCase.random(), VALUES); + } + } + + private record ChildDocMatch(int docId, List matches) { + public ChildDocMatch(int docId, List matches) { + this.docId = docId; + this.matches = Collections.unmodifiableList(matches); + } + } + + private static Map> populateRandomIndex( + RandomIndexWriter writer, int maxParentDocCount, int maxChildDocCount, int maxChildDocMatches) + throws IOException { + Map> expectedMatches = new HashMap<>(); + + final int parentDocCount = random().nextInt(1, maxParentDocCount + 1); + int currentDocId = 0; + for (int i = 0; i < parentDocCount; i++) { + final int childDocCount = random().nextInt(maxChildDocCount + 1); + List docs = new ArrayList<>(childDocCount); + List childDocMatches = new ArrayList<>(childDocCount); + + for (int j = 0; j < childDocCount; j++) { + // Build a child doc + Document childDoc = new Document(); + childDoc.add(newStringField(TYPE_FIELD_NAME, CHILD_FILTER_VALUE, Field.Store.NO)); + + final int matchCount = random().nextInt(maxChildDocMatches + 1); + List matchValues = new ArrayList<>(matchCount); + for (int k = 0; k < matchCount; k++) { + // Add a match to the child doc + MatchValue matchValue = MatchValue.random(); + matchValues.add(matchValue); + childDoc.add(newStringField(VALUE_FIELD_NAME, matchValue.getText(), Field.Store.NO)); + } + + 
docs.add(childDoc); + childDocMatches.add(new ChildDocMatch(currentDocId++, matchValues)); + } + + // Build a parent doc + Document parentDoc = new Document(); + parentDoc.add(newStringField(TYPE_FIELD_NAME, PARENT_FILTER_VALUE, Field.Store.NO)); + docs.add(parentDoc); + + // Don't add parent docs with no children to expectedMatches + if (childDocCount > 0) { + expectedMatches.put(currentDocId, childDocMatches); + } + currentDocId++; + + writer.addDocuments(docs); + } + + return expectedMatches; + } + + private static void populateStaticIndex(RandomIndexWriter writer) throws IOException { + // Use these vars to improve readability when defining the docs + final String A = MatchValue.MATCH_A.getText(); + final String B = MatchValue.MATCH_B.getText(); + final String C = MatchValue.MATCH_C.getText(); + final String D = MatchValue.MATCH_D.getText(); + + for (String[][] values : + Arrays.asList( + new String[][] {{A, B}, {A, B, C}}, + new String[][] {{A}, {B}}, + new String[][] {{}}, + new String[][] {{A, B, C}, {A, B, C, D}}, + new String[][] {{B}}, + new String[][] {{B, C}, {A, B}, {A, C}})) { + + List docs = new ArrayList<>(); + for (String[] value : values) { + Document childDoc = new Document(); + childDoc.add(newStringField(TYPE_FIELD_NAME, CHILD_FILTER_VALUE, Field.Store.NO)); + for (String v : value) { + childDoc.add(newStringField(VALUE_FIELD_NAME, v, Field.Store.NO)); + } + docs.add(childDoc); + } + + Document parentDoc = new Document(); + parentDoc.add(newStringField(TYPE_FIELD_NAME, PARENT_FILTER_VALUE, Field.Store.NO)); + docs.add(parentDoc); + + writer.addDocuments(docs); + } + } + + private static Map computeExpectedScores( + Map> expectedMatches, + ScoreMode joinScoreMode, + org.apache.lucene.search.ScoreMode searchScoreMode) { + Map expectedScores = new HashMap<>(); + for (var entry : expectedMatches.entrySet()) { + // Filter out child docs with no matches since those will never contribute to the score + List childDocMatches = + entry.getValue().stream().filter(m -> !m.matches().isEmpty()).toList(); + if (childDocMatches.isEmpty()) { + continue; + } + + double expectedScore = 0; + if (searchScoreMode.needsScores()) { + boolean firstScore = true; + for (ChildDocMatch childDocMatch : childDocMatches) { + float expectedChildDocScore = computeExpectedScore(childDocMatch); + switch (joinScoreMode) { + case Total: + case Avg: + expectedScore += expectedChildDocScore; + break; + case Min: + expectedScore = + firstScore + ? 
expectedChildDocScore + : Math.min(expectedScore, expectedChildDocScore); + break; + case Max: + expectedScore = Math.max(expectedScore, expectedChildDocScore); + break; + case None: + break; + default: + throw new AssertionError(); + } + + firstScore = false; + } + + if (joinScoreMode == ScoreMode.Avg) { + expectedScore /= childDocMatches.size(); + } + } + + expectedScores.put(entry.getKey(), (float) expectedScore); + } + + return expectedScores; + } + + private static float computeExpectedScore(ChildDocMatch childDocMatch) { + float expectedScore = 0.0f; + Set matchValueSet = new HashSet<>(childDocMatch.matches()); + for (MatchValue matchValue : matchValueSet) { + expectedScore += matchValue.getScore(); + } + + return expectedScore; + } + + private static ToParentBlockJoinQuery buildQuery(ScoreMode scoreMode) { + BooleanQuery.Builder childQueryBuilder = new BooleanQuery.Builder(); + for (MatchValue matchValue : MatchValue.VALUES) { + childQueryBuilder.add( + new BoostQuery( + new ConstantScoreQuery( + new TermQuery(new Term(VALUE_FIELD_NAME, matchValue.getText()))), + matchValue.getScore()), + BooleanClause.Occur.SHOULD); + } + BitSetProducer parentsFilter = + new QueryBitSetProducer(new TermQuery(new Term(TYPE_FIELD_NAME, PARENT_FILTER_VALUE))); + return new ToParentBlockJoinQuery(childQueryBuilder.build(), parentsFilter, scoreMode); + } + + private static void assertScores( + BulkScorer bulkScorer, + org.apache.lucene.search.ScoreMode scoreMode, + Float minScore, + Map expectedScores) + throws IOException { + assertScores(bulkScorer, scoreMode, minScore, List.of(expectedScores)); + } + + private static void assertScores( + BulkScorer bulkScorer, + org.apache.lucene.search.ScoreMode scoreMode, + Float minScore, + List> expectedScoresList) + throws IOException { + Map actualScores = new HashMap<>(); + bulkScorer.score( + new LeafCollector() { + private Scorable scorer; + + @Override + public void setScorer(Scorable scorer) throws IOException { + assertNotNull(scorer); + this.scorer = scorer; + if (minScore != null) { + this.scorer.setMinCompetitiveScore(minScore); + } + } + + @Override + public void collect(int doc) throws IOException { + assertNotNull(scorer); + actualScores.put(doc, scoreMode.needsScores() ? 
scorer.score() : 0); + } + }, + null, + 0, + NO_MORE_DOCS); + + if (expectedScoresList.size() == 1) { + assertEquals(expectedScoresList.getFirst(), actualScores); + } else { + assertEqualsToOneOf(expectedScoresList, actualScores); + } + } + + private static void assertEqualsToOneOf(List expectedList, Object actual) { + boolean foundMatch = false; + for (Object expected : expectedList) { + if (Objects.equals(expected, actual)) { + foundMatch = true; + break; + } + } + + if (!foundMatch) { + throw new AssertionError("expected one of: " + expectedList + " but was: " + actual); + } + } + + public void testScoreRandomIndices() throws IOException { + for (int i = 0; i < 200 * RANDOM_MULTIPLIER; i++) { + try (Directory dir = newDirectory()) { + Map> expectedMatches; + try (RandomIndexWriter w = + new RandomIndexWriter( + random(), + dir, + newIndexWriterConfig() + .setMergePolicy( + // retain doc id order + newLogMergePolicy(random().nextBoolean())))) { + + expectedMatches = + populateRandomIndex( + w, + TestUtil.nextInt(random(), 10 * RANDOM_MULTIPLIER, 30 * RANDOM_MULTIPLIER), + 20, + 3); + w.forceMerge(1); + } + + try (IndexReader reader = DirectoryReader.open(dir)) { + final IndexSearcher searcher = newSearcher(reader); + final ScoreMode joinScoreMode = + RandomPicks.randomFrom(LuceneTestCase.random(), ScoreMode.values()); + final org.apache.lucene.search.ScoreMode searchScoreMode = + RandomPicks.randomFrom( + LuceneTestCase.random(), org.apache.lucene.search.ScoreMode.values()); + final Map expectedScores = + computeExpectedScores(expectedMatches, joinScoreMode, searchScoreMode); + + ToParentBlockJoinQuery query = buildQuery(joinScoreMode); + Weight weight = searcher.createWeight(searcher.rewrite(query), searchScoreMode, 1); + ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + if (ss == null) { + // Score supplier will be null when there are no matches + assertTrue(expectedScores.isEmpty()); + continue; + } + + assertScores(ss.bulkScorer(), searchScoreMode, null, expectedScores); + } + } + } + } + + public void testSetMinCompetitiveScoreWithScoreModeMax() throws IOException { + try (Directory dir = newDirectory()) { + try (RandomIndexWriter w = + new RandomIndexWriter( + random(), + dir, + newIndexWriterConfig() + .setMergePolicy( + // retain doc id order + newLogMergePolicy(random().nextBoolean())))) { + + populateStaticIndex(w); + w.forceMerge(1); + } + + try (IndexReader reader = DirectoryReader.open(dir)) { + final IndexSearcher searcher = newSearcher(reader); + final ToParentBlockJoinQuery query = buildQuery(ScoreMode.Max); + final org.apache.lucene.search.ScoreMode scoreMode = + org.apache.lucene.search.ScoreMode.TOP_SCORES; + final Weight weight = searcher.createWeight(searcher.rewrite(query), scoreMode, 1); + + { + Map expectedScores = + Map.of( + 2, 6.0f, + 5, 2.0f, + 10, 10.0f, + 12, 2.0f, + 16, 5.0f); + + ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + ss.setTopLevelScoringClause(); + assertScores(ss.bulkScorer(), scoreMode, null, expectedScores); + } + + { + // This test case has two potential results. + // If all docs are scored in the same batch, then doc 16 is collected because + // MaxScoreBulkScorer scores assuming A will match in doc 13, leading to a potential max + // score of 6. + // If the scoring is split across two or more batches, then doc 16 is not collected + // because MaxScoreBulkScorer does not assume A will match in doc 13, leading to a + // potential max score of 5. 
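The `assertScores` helper above drives the join query's `BulkScorer` by hand instead of going through `IndexSearcher.search`. A stripped-down sketch of that pattern follows; it assumes a single-segment index with no deletions, and the class, method name, and choice of `ScoreMode.COMPLETE` are just for the example:

```java
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.Weight;

final class BulkScorerDriver {
  // Scores every matching doc in the first segment and records doc -> score.
  static Map<Integer, Float> collectScores(IndexSearcher searcher, Query query)
      throws IOException {
    Map<Integer, Float> scores = new HashMap<>();
    Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE, 1f);
    ScorerSupplier supplier = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0));
    if (supplier == null) {
      return scores; // no matches in this segment
    }
    BulkScorer bulkScorer = supplier.bulkScorer();
    bulkScorer.score(
        new LeafCollector() {
          private Scorable scorer;

          @Override
          public void setScorer(Scorable scorer) {
            this.scorer = scorer;
          }

          @Override
          public void collect(int doc) throws IOException {
            scores.put(doc, scorer.score());
          }
        },
        null, // acceptDocs: assume no deleted docs
        0,
        DocIdSetIterator.NO_MORE_DOCS);
    return scores;
  }
}
```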
+ Map expectedScores1 = + Map.of( + 2, 6.0f, + 10, 10.0f); + Map expectedScores2 = + Map.of( + 2, 6.0f, + 10, 10.0f, + 16, 5.0f); + + ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + ss.setTopLevelScoringClause(); + assertScores(ss.bulkScorer(), scoreMode, 6.0f, List.of(expectedScores1, expectedScores2)); + } + + { + Map expectedScores = Map.of(); + + ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + ss.setTopLevelScoringClause(); + assertScores(ss.bulkScorer(), scoreMode, 11.0f, expectedScores); + } + } + } + } + + public void testSetMinCompetitiveScoreWithScoreModeNone() throws IOException { + try (Directory dir = newDirectory()) { + try (RandomIndexWriter w = + new RandomIndexWriter( + random(), + dir, + newIndexWriterConfig() + .setMergePolicy( + // retain doc id order + newLogMergePolicy(random().nextBoolean())))) { + + populateStaticIndex(w); + w.forceMerge(1); + } + + try (IndexReader reader = DirectoryReader.open(dir)) { + final IndexSearcher searcher = newSearcher(reader); + final ToParentBlockJoinQuery query = buildQuery(ScoreMode.None); + final org.apache.lucene.search.ScoreMode scoreMode = + org.apache.lucene.search.ScoreMode.TOP_SCORES; + final Weight weight = searcher.createWeight(searcher.rewrite(query), scoreMode, 1); + + { + Map expectedScores = + Map.of( + 2, 0.0f, + 5, 0.0f, + 10, 0.0f, + 12, 0.0f, + 16, 0.0f); + + ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + ss.setTopLevelScoringClause(); + assertScores(ss.bulkScorer(), scoreMode, null, expectedScores); + } + + { + Map expectedScores = + Map.of( + 2, 0.0f, + 5, 0.0f, + 10, 0.0f, + 12, 0.0f, + 16, 0.0f); + + ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + ss.setTopLevelScoringClause(); + assertScores(ss.bulkScorer(), scoreMode, 0.0f, expectedScores); + } + + { + Map expectedScores = Map.of(); + + ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + ss.setTopLevelScoringClause(); + assertScores(ss.bulkScorer(), scoreMode, Math.nextUp(0f), expectedScores); + } + } + } + } +} diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinScorer.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinScorer.java index 6acca015c0ef..4f7b13e8a6a7 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinScorer.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinScorer.java @@ -16,19 +16,26 @@ */ package org.apache.lucene.search.join; +import static org.apache.lucene.search.ScoreMode.TOP_SCORES; + import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.ScorerSupplier; 
import org.apache.lucene.search.TermQuery; @@ -76,10 +83,9 @@ public void testScoreNone() throws IOException { Query childQuery = new MatchAllDocsQuery(); ToParentBlockJoinQuery query = - new ToParentBlockJoinQuery( - childQuery, parentsFilter, org.apache.lucene.search.join.ScoreMode.None); + new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.None); - Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.TOP_SCORES, 1); + Weight weight = searcher.createWeight(searcher.rewrite(query), TOP_SCORES, 1); LeafReaderContext context = searcher.getIndexReader().leaves().get(0); Scorer scorer = weight.scorer(context); @@ -118,4 +124,119 @@ public void testScoreNone() throws IOException { reader.close(); dir.close(); } + + public void testScoreMax() throws IOException { + try (Directory dir = newDirectory()) { + try (RandomIndexWriter w = + new RandomIndexWriter( + random(), + dir, + newIndexWriterConfig() + .setMergePolicy( + // retain doc id order + newLogMergePolicy(random().nextBoolean())))) { + + for (String[][] values : + Arrays.asList( + new String[][] {{"A", "B"}, {"A", "B", "C"}}, + new String[][] {{"A"}, {"B"}}, + new String[][] {{}}, + new String[][] {{"A", "B", "C"}, {"A", "B", "C", "D"}}, + new String[][] {{"B"}}, + new String[][] {{"B", "C"}, {"A", "B"}, {"A", "C"}})) { + + List docs = new ArrayList<>(); + for (String[] value : values) { + Document childDoc = new Document(); + childDoc.add(newStringField("type", "child", Field.Store.NO)); + for (String v : value) { + childDoc.add(newStringField("value", v, Field.Store.NO)); + } + docs.add(childDoc); + } + + Document parentDoc = new Document(); + parentDoc.add(newStringField("type", "parent", Field.Store.NO)); + docs.add(parentDoc); + + w.addDocuments(docs); + } + + w.forceMerge(1); + } + + try (IndexReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = newSearcher(reader); + + BooleanQuery childQuery = + new BooleanQuery.Builder() + .add( + new BoostQuery( + new ConstantScoreQuery(new TermQuery(new Term("value", "A"))), 2), + BooleanClause.Occur.SHOULD) + .add( + new ConstantScoreQuery(new TermQuery(new Term("value", "B"))), + BooleanClause.Occur.SHOULD) + .add( + new BoostQuery( + new ConstantScoreQuery(new TermQuery(new Term("value", "C"))), 3), + BooleanClause.Occur.SHOULD) + .add( + new BoostQuery( + new ConstantScoreQuery(new TermQuery(new Term("value", "D"))), 4), + BooleanClause.Occur.SHOULD) + .build(); + BitSetProducer parentsFilter = + new QueryBitSetProducer(new TermQuery(new Term("type", "parent"))); + ToParentBlockJoinQuery parentQuery = + new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Max); + + Weight weight = searcher.createWeight(searcher.rewrite(parentQuery), TOP_SCORES, 1); + ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + ss.setTopLevelScoringClause(); + Scorer scorer = ss.get(Long.MAX_VALUE); + + assertEquals(2, scorer.iterator().nextDoc()); + assertEquals(2 + 1 + 3, scorer.score(), 0); + + assertEquals(5, scorer.iterator().nextDoc()); + assertEquals(2, scorer.score(), 0); + + assertEquals(10, scorer.iterator().nextDoc()); + assertEquals(2 + 1 + 3 + 4, scorer.score(), 0); + + assertEquals(12, scorer.iterator().nextDoc()); + assertEquals(1, scorer.score(), 0); + + assertEquals(16, scorer.iterator().nextDoc()); + assertEquals(2 + 3, scorer.score(), 0); + + assertEquals(DocIdSetIterator.NO_MORE_DOCS, scorer.iterator().nextDoc()); + + ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + 
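`testScoreMax` relies on the standard block-join indexing contract: all children of a parent are added in a single `addDocuments` call with the parent document last in the block, and the parent filter matches only those parent documents. A minimal end-to-end sketch of that usage (field names, values, and the in-memory directory are chosen only for illustration):

```java
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.join.BitSetProducer;
import org.apache.lucene.search.join.QueryBitSetProducer;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class BlockJoinUsageSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
        Document child1 = new Document();
        child1.add(new StringField("value", "A", Field.Store.NO));
        Document child2 = new Document();
        child2.add(new StringField("value", "B", Field.Store.NO));
        Document parent = new Document();
        parent.add(new StringField("type", "parent", Field.Store.NO));
        // Children first, parent last: the block is indexed atomically as one unit.
        writer.addDocuments(List.of(child1, child2, parent));
      }
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        BitSetProducer parentsFilter =
            new QueryBitSetProducer(new TermQuery(new Term("type", "parent")));
        // Max mode: the parent's score is the best matching child's score.
        ToParentBlockJoinQuery query =
            new ToParentBlockJoinQuery(
                new TermQuery(new Term("value", "A")), parentsFilter, ScoreMode.Max);
        TopDocs hits = searcher.search(query, 10);
        System.out.println(hits.totalHits.value()); // 1: the parent of the matching child
      }
    }
  }
}
```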
ss.setTopLevelScoringClause(); + scorer = ss.get(Long.MAX_VALUE); + scorer.setMinCompetitiveScore(6); + + assertEquals(2, scorer.iterator().nextDoc()); + assertEquals(2 + 1 + 3, scorer.score(), 0); + + assertEquals(10, scorer.iterator().nextDoc()); + assertEquals(2 + 1 + 3 + 4, scorer.score(), 0); + + assertEquals(DocIdSetIterator.NO_MORE_DOCS, scorer.iterator().nextDoc()); + + ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0)); + ss.setTopLevelScoringClause(); + scorer = ss.get(Long.MAX_VALUE); + + assertEquals(2, scorer.iterator().nextDoc()); + assertEquals(2 + 1 + 3, scorer.score(), 0); + + scorer.setMinCompetitiveScore(11); + + assertEquals(DocIdSetIterator.NO_MORE_DOCS, scorer.iterator().nextDoc()); + } + } + } } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java index dd5ba445a654..0c9c77ed7e0c 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java @@ -224,7 +224,7 @@ public void testNestedSorting() throws Exception { "field2", SortField.Type.STRING, false, parentFilter, childFilter); Sort sort = new Sort(sortField); TopFieldDocs topDocs = searcher.search(query, 5, sort); - assertEquals(7, topDocs.totalHits.value); + assertEquals(7, topDocs.totalHits.value()); assertEquals(5, topDocs.scoreDocs.length); assertEquals(3, topDocs.scoreDocs[0].doc); assertEquals("a", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]).utf8ToString()); @@ -247,7 +247,7 @@ public void testNestedSorting() throws Exception { sort = new Sort(sortField); topDocs = searcher.search(query, 5, sort); - assertEquals(7, topDocs.totalHits.value); + assertEquals(7, topDocs.totalHits.value()); assertEquals(5, topDocs.scoreDocs.length); assertEquals(3, topDocs.scoreDocs[0].doc); assertEquals("c", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]).utf8ToString()); @@ -269,7 +269,7 @@ public void testNestedSorting() throws Exception { "field2", SortField.Type.STRING, true, parentFilter, childFilter)); sort = new Sort(sortField); topDocs = searcher.search(query, 5, sort); - assertEquals(topDocs.totalHits.value, 7); + assertEquals(topDocs.totalHits.value(), 7); assertEquals(5, topDocs.scoreDocs.length); assertEquals(27, topDocs.scoreDocs[0].doc); assertEquals("o", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]).utf8ToString()); @@ -298,7 +298,7 @@ public void testNestedSorting() throws Exception { sort = new Sort(sortField); topDocs = searcher.search(query, 5, sort); - assertEquals(6, topDocs.totalHits.value); + assertEquals(6, topDocs.totalHits.value()); assertEquals(5, topDocs.scoreDocs.length); assertEquals(23, topDocs.scoreDocs[0].doc); assertEquals("m", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]).utf8ToString()); diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java index a1b85fe9bbb3..ffe4c3840d5f 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.search.join; +import com.carrotsearch.randomizedtesting.generators.RandomPicks; import java.util.ArrayList; import java.util.List; import org.apache.lucene.document.Document; @@ -81,9 +82,16 @@ public 
void tearDown() throws Exception { } public void testNextDocValidationForToParentBjq() throws Exception { + // TODO: This test is broken when score mode is None because BlockJoinScorer#scoreChildDocs does + // not advance the child approximation. Adjust this test once that is fixed. + final List validScoreModes = + List.of(ScoreMode.Avg, ScoreMode.Max, ScoreMode.Total, ScoreMode.Min); Query parentQueryWithRandomChild = createChildrenQueryWithOneParent(getRandomChildNumber(0)); ToParentBlockJoinQuery blockJoinQuery = - new ToParentBlockJoinQuery(parentQueryWithRandomChild, parentsFilter, ScoreMode.None); + new ToParentBlockJoinQuery( + parentQueryWithRandomChild, + parentsFilter, + RandomPicks.randomFrom(LuceneTestCase.random(), validScoreModes)); IllegalStateException expected = expectThrows( IllegalStateException.class, diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java b/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java index 972929e8118e..d959d9e26308 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java @@ -157,7 +157,7 @@ public void testSimple() throws Exception { ScoreMode.None); TopDocs result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(4, result.scoreDocs[0].doc); assertEquals(5, result.scoreDocs[1].doc); @@ -170,7 +170,7 @@ public void testSimple() throws Exception { indexSearcher, ScoreMode.None); result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(1, result.scoreDocs[0].doc); assertEquals(2, result.scoreDocs[1].doc); @@ -184,7 +184,7 @@ public void testSimple() throws Exception { indexSearcher, ScoreMode.None); result = indexSearcher.search(joinQuery, 10); - assertEquals(1, result.totalHits.value); + assertEquals(1, result.totalHits.value()); assertEquals(3, result.scoreDocs[0].doc); indexSearcher.getIndexReader().close(); @@ -278,7 +278,7 @@ public void testSimpleOrdinalsJoin() throws Exception { JoinUtil.createJoinQuery( joinField, fromQuery, toQuery, indexSearcher, ScoreMode.None, ordinalMap); TopDocs result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(4, result.scoreDocs[0].doc); assertEquals(5, result.scoreDocs[1].doc); @@ -287,7 +287,7 @@ public void testSimpleOrdinalsJoin() throws Exception { JoinUtil.createJoinQuery( joinField, fromQuery, toQuery, indexSearcher, ScoreMode.None, ordinalMap); result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(1, result.scoreDocs[0].doc); assertEquals(2, result.scoreDocs[1].doc); @@ -298,7 +298,7 @@ public void testSimpleOrdinalsJoin() throws Exception { JoinUtil.createJoinQuery( joinField, fromQuery, toQuery, indexSearcher, ScoreMode.None, ordinalMap); result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(0, result.scoreDocs[0].doc); assertEquals(3, result.scoreDocs[1].doc); @@ -391,7 +391,7 @@ public void testOrdinalsJoinExplainNoMatches() throws Exception { JoinUtil.createJoinQuery( joinField, fromQuery, toQuery, indexSearcher, scoreMode, ordinalMap); TopDocs result = indexSearcher.search(joinQuery, 10); - 
assertEquals(1, result.totalHits.value); + assertEquals(1, result.totalHits.value()); assertEquals(4, result.scoreDocs[0].doc); // doc with price: 5.0 Explanation explanation = indexSearcher.explain(joinQuery, 4); assertTrue(explanation.isMatch()); @@ -468,7 +468,7 @@ public void testRandomOrdinalsJoin() throws Exception { final BitSet actualResult = new FixedBitSet(indexSearcher.getIndexReader().maxDoc()); final TopScoreDocCollector topScoreDocCollector = - new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE, false).newCollector(); + new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE).newCollector(); indexSearcher.search( joinQuery, MultiCollector.wrap(new BitSetCollector(actualResult), topScoreDocCollector)); assertBitSet(expectedResult, actualResult, indexSearcher); @@ -534,7 +534,7 @@ public void testMinMaxScore() throws Exception { JoinUtil.createJoinQuery( "join_field", fromQuery.build(), toQuery, searcher, ScoreMode.Min, ordinalMap); TopDocs topDocs = searcher.search(joinQuery, numParents); - assertEquals(numParents, topDocs.totalHits.value); + assertEquals(numParents, topDocs.totalHits.value()); for (int i = 0; i < topDocs.scoreDocs.length; i++) { ScoreDoc scoreDoc = topDocs.scoreDocs[i]; String id = searcher.storedFields().document(scoreDoc.doc).get("id"); @@ -546,7 +546,7 @@ public void testMinMaxScore() throws Exception { JoinUtil.createJoinQuery( "join_field", fromQuery.build(), toQuery, searcher, ScoreMode.Max, ordinalMap); topDocs = searcher.search(joinQuery, numParents); - assertEquals(numParents, topDocs.totalHits.value); + assertEquals(numParents, topDocs.totalHits.value()); for (int i = 0; i < topDocs.scoreDocs.length; i++) { ScoreDoc scoreDoc = topDocs.scoreDocs[i]; String id = searcher.storedFields().document(scoreDoc.doc).get("id"); @@ -805,7 +805,7 @@ void test300spartans(boolean multipleValues, ScoreMode scoreMode) throws Excepti scoreMode); TopDocs result = indexSearcher.search(joinQuery, 10); - assertEquals(1, result.totalHits.value); + assertEquals(1, result.totalHits.value()); assertEquals(0, result.scoreDocs[0].doc); indexSearcher.getIndexReader().close(); @@ -1003,7 +1003,7 @@ public void testSimpleWithScoring() throws Exception { indexSearcher, ScoreMode.Max); TopDocs result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(0, result.scoreDocs[0].doc); assertEquals(3, result.scoreDocs[1].doc); checkBoost(joinQuery, indexSearcher); @@ -1018,7 +1018,7 @@ public void testSimpleWithScoring() throws Exception { indexSearcher, ScoreMode.Max); result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(3, result.scoreDocs[0].doc); assertEquals(0, result.scoreDocs[1].doc); checkBoost(joinQuery, indexSearcher); @@ -1033,7 +1033,7 @@ public void testSimpleWithScoring() throws Exception { indexSearcher, ScoreMode.Total); result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(0, result.scoreDocs[0].doc); assertEquals(3, result.scoreDocs[1].doc); checkBoost(joinQuery, indexSearcher); @@ -1048,7 +1048,7 @@ public void testSimpleWithScoring() throws Exception { indexSearcher, ScoreMode.Avg); result = indexSearcher.search(joinQuery, 10); - assertEquals(2, result.totalHits.value); + assertEquals(2, result.totalHits.value()); assertEquals(3, result.scoreDocs[0].doc); assertEquals(0, 
result.scoreDocs[1].doc); checkBoost(joinQuery, indexSearcher); @@ -1546,7 +1546,7 @@ private void executeRandomJoin( // be also testing TopDocsCollector... final BitSet actualResult = new FixedBitSet(indexSearcher.getIndexReader().maxDoc()); final TopScoreDocCollector topScoreDocCollector = - new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE, false).newCollector(); + new TopScoreDocCollectorManager(10, null, Integer.MAX_VALUE).newCollector(); indexSearcher.search( joinQuery, MultiCollector.wrap(new BitSetCollector(actualResult), topScoreDocCollector)); @@ -1597,7 +1597,7 @@ private void assertTopDocs( IndexSearcher indexSearcher, Query joinQuery) throws IOException { - assertEquals(expectedTopDocs.totalHits.value, actualTopDocs.totalHits.value); + assertEquals(expectedTopDocs.totalHits.value(), actualTopDocs.totalHits.value()); assertEquals(expectedTopDocs.scoreDocs.length, actualTopDocs.scoreDocs.length); if (scoreMode == ScoreMode.None) { return; diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestParentBlockJoinByteKnnVectorQuery.java b/lucene/join/src/test/org/apache/lucene/search/join/TestParentBlockJoinByteKnnVectorQuery.java index 6f773300a6dc..6c1d461d4bf8 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestParentBlockJoinByteKnnVectorQuery.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestParentBlockJoinByteKnnVectorQuery.java @@ -29,9 +29,11 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; public class TestParentBlockJoinByteKnnVectorQuery extends ParentBlockJoinKnnVectorQueryTestCase { @@ -81,6 +83,20 @@ d, new IndexWriterConfig().setMergePolicy(newMergePolicy(random(), false)))) { } } + public void testToString() { + // test without filter + Query query = getParentJoinKnnQuery("field", new float[] {0, 1}, null, 10, null); + assertEquals( + "DiversifyingChildrenByteKnnVectorQuery:field[0,...][10]", query.toString("ignored")); + + // test with filter + Query filter = new TermQuery(new Term("id", "text")); + query = getParentJoinKnnQuery("field", new float[] {0, 1}, filter, 10, null); + assertEquals( + "DiversifyingChildrenByteKnnVectorQuery:field[0,...][10][id:text]", + query.toString("ignored")); + } + private static byte[] fromFloat(float[] queryVector) { byte[] query = new byte[queryVector.length]; for (int i = 0; i < queryVector.length; i++) { diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestParentBlockJoinFloatKnnVectorQuery.java b/lucene/join/src/test/org/apache/lucene/search/join/TestParentBlockJoinFloatKnnVectorQuery.java index 616c8fdb3706..f15de3b57eea 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestParentBlockJoinFloatKnnVectorQuery.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestParentBlockJoinFloatKnnVectorQuery.java @@ -29,9 +29,11 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import 
org.apache.lucene.store.Directory; public class TestParentBlockJoinFloatKnnVectorQuery extends ParentBlockJoinKnnVectorQueryTestCase { @@ -110,6 +112,20 @@ d, new IndexWriterConfig().setMergePolicy(newMergePolicy(random(), false)))) { } } + public void testToString() { + // test without filter + Query query = getParentJoinKnnQuery("field", new float[] {0, 1}, null, 10, null); + assertEquals( + "DiversifyingChildrenFloatKnnVectorQuery:field[0.0,...][10]", query.toString("ignored")); + + // test with filter + Query filter = new TermQuery(new Term("id", "text")); + query = getParentJoinKnnQuery("field", new float[] {0.0f, 1.0f}, filter, 10, null); + assertEquals( + "DiversifyingChildrenFloatKnnVectorQuery:field[0.0,...][10][id:text]", + query.toString("ignored")); + } + @Override Field getKnnVectorField(String name, float[] vector) { return new KnnFloatVectorField(name, vector); diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestParentChildrenBlockJoinQuery.java b/lucene/join/src/test/org/apache/lucene/search/join/TestParentChildrenBlockJoinQuery.java index 54b0bf017185..6fa3df2cdf96 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestParentChildrenBlockJoinQuery.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestParentChildrenBlockJoinQuery.java @@ -91,7 +91,7 @@ public void testParentChildrenBlockJoinQuery() throws Exception { ParentChildrenBlockJoinQuery parentChildrenBlockJoinQuery = new ParentChildrenBlockJoinQuery(parentFilter, childQuery, parentScoreDoc.doc); TopDocs topDocs = searcher.search(parentChildrenBlockJoinQuery, maxChildDocsPerParent); - assertEquals(expectedChildDocs, topDocs.totalHits.value); + assertEquals(expectedChildDocs, topDocs.totalHits.value()); if (expectedChildDocs > 0) { for (int i = 0; i < topDocs.scoreDocs.length; i++) { ScoreDoc childScoreDoc = topDocs.scoreDocs[i]; diff --git a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/SearchPanelProvider.java b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/SearchPanelProvider.java index d00baa55981b..8399d347d888 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/SearchPanelProvider.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/SearchPanelProvider.java @@ -613,14 +613,14 @@ private Query parse(boolean rewrite) { private void populateResults(SearchResults res) { totalHitsLbl.setText(String.valueOf(res.getTotalHits())); - if (res.getTotalHits().value > 0) { + if (res.getTotalHits().value() > 0) { startLbl.setText(String.valueOf(res.getOffset() + 1)); endLbl.setText(String.valueOf(res.getOffset() + res.size())); prevBtn.setEnabled(res.getOffset() > 0); nextBtn.setEnabled( - res.getTotalHits().relation == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO - || res.getTotalHits().value > res.getOffset() + res.size()); + res.getTotalHits().relation() == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO + || res.getTotalHits().value() > res.getOffset() + res.size()); if (!indexHandler.getState().readOnly() && indexHandler.getState().hasDirectoryReader()) { delBtn.setEnabled(true); diff --git a/lucene/luke/src/java/org/apache/lucene/luke/models/analysis/AnalysisImpl.java b/lucene/luke/src/java/org/apache/lucene/luke/models/analysis/AnalysisImpl.java index 7f5136a2dd80..9a70acc7c361 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/models/analysis/AnalysisImpl.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/models/analysis/AnalysisImpl.java @@ -167,16 +167,16 @@ public Analyzer 
buildCustomAnalyzer(CustomAnalyzerConfig config) { // set tokenizer builder.withTokenizer( - config.getTokenizerConfig().getName(), config.getTokenizerConfig().getParams()); + config.getTokenizerConfig().name(), config.getTokenizerConfig().params()); // add char filters for (CustomAnalyzerConfig.ComponentConfig cfConf : config.getCharFilterConfigs()) { - builder.addCharFilter(cfConf.getName(), cfConf.getParams()); + builder.addCharFilter(cfConf.name(), cfConf.params()); } // add token filters for (CustomAnalyzerConfig.ComponentConfig tfConf : config.getTokenFilterConfigs()) { - builder.addTokenFilter(tfConf.getName(), tfConf.getParams()); + builder.addTokenFilter(tfConf.name(), tfConf.params()); } // build analyzer diff --git a/lucene/luke/src/java/org/apache/lucene/luke/models/analysis/CustomAnalyzerConfig.java b/lucene/luke/src/java/org/apache/lucene/luke/models/analysis/CustomAnalyzerConfig.java index aaa29c479376..465d6a37f1e1 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/models/analysis/CustomAnalyzerConfig.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/models/analysis/CustomAnalyzerConfig.java @@ -99,24 +99,15 @@ List getTokenFilterConfigs() { return List.copyOf(tokenFilterConfigs); } - static class ComponentConfig { - - /* SPI name */ - private final String name; - /* parameter map */ - private final Map params; + /** + * @param name SPI name + * @param params parameter map + */ + record ComponentConfig(String name, Map params) { ComponentConfig(String name, Map params) { this.name = Objects.requireNonNull(name); this.params = Objects.requireNonNull(params); } - - String getName() { - return this.name; - } - - Map getParams() { - return this.params; - } } } diff --git a/lucene/luke/src/java/org/apache/lucene/luke/models/documents/DocumentField.java b/lucene/luke/src/java/org/apache/lucene/luke/models/documents/DocumentField.java index 460d14226495..14d3a76b8f3e 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/models/documents/DocumentField.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/models/documents/DocumentField.java @@ -73,7 +73,7 @@ static DocumentField of(FieldInfo finfo, IndexableField field, IndexReader reade dfield.name = finfo.name; dfield.idxOptions = finfo.getIndexOptions(); - dfield.hasTermVectors = finfo.hasVectors(); + dfield.hasTermVectors = finfo.hasTermVectors(); dfield.hasPayloads = finfo.hasPayloads(); dfield.hasNorms = finfo.hasNorms(); diff --git a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java index 56a942ba6aad..e95ca09060d8 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java @@ -315,8 +315,7 @@ private SearchResults search() throws IOException { } else { int hitsThreshold = exactHitsCount ? 
Integer.MAX_VALUE : DEFAULT_TOTAL_HITS_THRESHOLD; TopScoreDocCollectorManager collectorManager = - new TopScoreDocCollectorManager( - pageSize, after, hitsThreshold, searcher.getSlices().length > 1); + new TopScoreDocCollectorManager(pageSize, after, hitsThreshold); topDocs = searcher.search(query, collectorManager); } @@ -342,9 +341,9 @@ public Optional nextPage() { // proceed to next page currentPage += 1; - if (totalHits.value == 0 - || (totalHits.relation == TotalHits.Relation.EQUAL_TO - && currentPage * (long) pageSize >= totalHits.value)) { + if (totalHits.value() == 0 + || (totalHits.relation() == TotalHits.Relation.EQUAL_TO + && currentPage * (long) pageSize >= totalHits.value())) { log.warning("No more next search results are available."); return Optional.empty(); } @@ -395,8 +394,7 @@ private Similarity createSimilarity(SimilarityConfig config) { Similarity similarity; if (config.isUseClassicSimilarity()) { - ClassicSimilarity tfidf = new ClassicSimilarity(); - tfidf.setDiscountOverlaps(config.isDiscountOverlaps()); + ClassicSimilarity tfidf = new ClassicSimilarity(config.isDiscountOverlaps()); similarity = tfidf; } else { BM25Similarity bm25 = diff --git a/lucene/luke/src/test/org/apache/lucene/luke/models/search/TestSearchImpl.java b/lucene/luke/src/test/org/apache/lucene/luke/models/search/TestSearchImpl.java index 7d5a13d9cea1..6a3e4dc2c220 100644 --- a/lucene/luke/src/test/org/apache/lucene/luke/models/search/TestSearchImpl.java +++ b/lucene/luke/src/test/org/apache/lucene/luke/models/search/TestSearchImpl.java @@ -309,7 +309,7 @@ public void testSearch() throws Exception { SearchResults res = search.search(query, new SimilarityConfig.Builder().build(), null, 10, true); - assertEquals(10, res.getTotalHits().value); + assertEquals(10, res.getTotalHits().value()); assertEquals(10, res.size()); assertEquals(0, res.getOffset()); } @@ -322,7 +322,7 @@ public void testSearchWithSort() throws Exception { SearchResults res = search.search(query, new SimilarityConfig.Builder().build(), sort, null, 10, true); - assertEquals(10, res.getTotalHits().value); + assertEquals(10, res.getTotalHits().value()); assertEquals(10, res.size()); assertEquals(0, res.getOffset()); } @@ -336,7 +336,7 @@ public void testNextPage() throws Exception { assertTrue(opt.isPresent()); SearchResults res = opt.get(); - assertEquals(20, res.getTotalHits().value); + assertEquals(20, res.getTotalHits().value()); assertEquals(10, res.size()); assertEquals(10, res.getOffset()); } @@ -366,7 +366,7 @@ public void testPrevPage() throws Exception { assertTrue(opt.isPresent()); SearchResults res = opt.get(); - assertEquals(20, res.getTotalHits().value); + assertEquals(20, res.getTotalHits().value()); assertEquals(10, res.size()); assertEquals(0, res.getOffset()); } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 08fb1cf6b5bd..04ac9285baba 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -36,15 +36,19 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.index.*; import org.apache.lucene.search.Collector; import 
org.apache.lucene.search.CollectorManager; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorable; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.SimpleCollector; +import org.apache.lucene.search.VectorScorer; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.ArrayUtil; @@ -636,6 +640,10 @@ public void addField(IndexableField field, Analyzer analyzer) { if (field.fieldType().stored()) { storeValues(info, field); } + + if (field.fieldType().vectorDimension() > 0) { + storeVectorValues(info, field); + } } /** @@ -728,7 +736,7 @@ private FieldInfo createFieldInfo(String fieldName, int ord, IndexableFieldType storePayloads, indexOptions, fieldType.docValuesType(), - false, + fieldType.docValuesSkipIndexType(), -1, Collections.emptyMap(), fieldType.pointDimensionCount(), @@ -749,6 +757,56 @@ private void storePointValues(Info info, BytesRef pointValue) { info.pointValues[info.pointValuesCount++] = BytesRef.deepCopyOf(pointValue); } + private void storeVectorValues(Info info, IndexableField vectorField) { + assert vectorField instanceof KnnFloatVectorField || vectorField instanceof KnnByteVectorField; + switch (info.fieldInfo.getVectorEncoding()) { + case BYTE -> { + if (vectorField instanceof KnnByteVectorField byteVectorField) { + if (info.byteVectorCount == 1) { + throw new IllegalArgumentException( + "Only one value per field allowed for byte vector field [" + + vectorField.name() + + "]"); + } + info.byteVectorCount++; + if (info.byteVectorValues == null) { + info.byteVectorValues = new byte[1][]; + } + info.byteVectorValues[0] = + ArrayUtil.copyOfSubArray( + byteVectorField.vectorValue(), 0, info.fieldInfo.getVectorDimension()); + return; + } + throw new IllegalArgumentException( + "Field [" + + vectorField.name() + + "] is not a byte vector field, but the field info is configured for byte vectors"); + } + case FLOAT32 -> { + if (vectorField instanceof KnnFloatVectorField floatVectorField) { + if (info.floatVectorCount == 1) { + throw new IllegalArgumentException( + "Only one value per field allowed for float vector field [" + + vectorField.name() + + "]"); + } + info.floatVectorCount++; + if (info.floatVectorValues == null) { + info.floatVectorValues = new float[1][]; + } + info.floatVectorValues[0] = + ArrayUtil.copyOfSubArray( + floatVectorField.vectorValue(), 0, info.fieldInfo.getVectorDimension()); + return; + } + throw new IllegalArgumentException( + "Field [" + + vectorField.name() + + "] is not a float vector field, but the field info is configured for float vectors"); + } + } + } + private void storeValues(Info info, IndexableField field) { if (info.storedValues == null) { info.storedValues = new ArrayList<>(); @@ -778,12 +836,12 @@ private void storeDocValues(Info info, DocValuesType docValuesType, Object docVa new FieldInfo( info.fieldInfo.name, info.fieldInfo.number, - info.fieldInfo.hasVectors(), + info.fieldInfo.hasTermVectors(), info.fieldInfo.hasPayloads(), info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, - false, + DocValuesSkipIndexType.NONE, -1, info.fieldInfo.attributes(), info.fieldInfo.getPointDimensionCount(), @@ -1148,6 +1206,18 @@ private final class Info { private BytesRef[] pointValues; + /** Number of float vectors added for this field */ + private int 
floatVectorCount; + + /** the float vectors added for this field */ + private float[][] floatVectorValues; + + /** Number of byte vectors added for this field */ + private int byteVectorCount; + + /** the byte vectors added for this field */ + private byte[][] byteVectorValues; + private byte[] minPackedValue; private byte[] maxPackedValue; @@ -1641,12 +1711,20 @@ public PointValues getPointValues(String fieldName) { @Override public FloatVectorValues getFloatVectorValues(String fieldName) { - return null; + Info info = fields.get(fieldName); + if (info == null || info.floatVectorValues == null) { + return null; + } + return new MemoryFloatVectorValues(info); } @Override public ByteVectorValues getByteVectorValues(String fieldName) { - return null; + Info info = fields.get(fieldName); + if (info == null || info.byteVectorValues == null) { + return null; + } + return new MemoryByteVectorValues(info); } @Override @@ -2204,4 +2282,132 @@ public int[] clear() { return super.clear(); } } + + private static final class MemoryFloatVectorValues extends FloatVectorValues { + private final Info info; + + MemoryFloatVectorValues(Info info) { + this.info = info; + } + + @Override + public int dimension() { + return info.fieldInfo.getVectorDimension(); + } + + @Override + public int size() { + return info.floatVectorCount; + } + + @Override + public float[] vectorValue(int ord) { + if (ord == 0) { + return info.floatVectorValues[0]; + } else { + return null; + } + } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + + @Override + public VectorScorer scorer(float[] query) { + if (query.length != info.fieldInfo.getVectorDimension()) { + throw new IllegalArgumentException( + "query vector dimension " + + query.length + + " does not match field dimension " + + info.fieldInfo.getVectorDimension()); + } + MemoryFloatVectorValues vectorValues = new MemoryFloatVectorValues(info); + DocIndexIterator iterator = vectorValues.iterator(); + return new VectorScorer() { + @Override + public float score() throws IOException { + assert iterator.docID() == 0; + return info.fieldInfo + .getVectorSimilarityFunction() + .compare(vectorValues.vectorValue(0), query); + } + + @Override + public DocIdSetIterator iterator() { + return iterator; + } + }; + } + + @Override + public MemoryFloatVectorValues copy() { + return this; + } + } + + private static final class MemoryByteVectorValues extends ByteVectorValues { + private final Info info; + + MemoryByteVectorValues(Info info) { + this.info = info; + } + + @Override + public int dimension() { + return info.fieldInfo.getVectorDimension(); + } + + @Override + public int size() { + return info.byteVectorCount; + } + + @Override + public byte[] vectorValue(int ord) { + if (ord == 0) { + return info.byteVectorValues[0]; + } else { + return null; + } + } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + + @Override + public VectorScorer scorer(byte[] query) { + if (query.length != info.fieldInfo.getVectorDimension()) { + throw new IllegalArgumentException( + "query vector dimension " + + query.length + + " does not match field dimension " + + info.fieldInfo.getVectorDimension()); + } + MemoryByteVectorValues vectorValues = new MemoryByteVectorValues(info); + DocIndexIterator iterator = vectorValues.iterator(); + return new VectorScorer() { + @Override + public float score() { + assert iterator.docID() == 0; + return info.fieldInfo + .getVectorSimilarityFunction() + .compare(vectorValues.vectorValue(0), 
query); + } + + @Override + public DocIdSetIterator iterator() { + return iterator; + } + }; + } + + @Override + public MemoryByteVectorValues copy() { + return this; + } + } } diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java index b40409f1f3bf..7c5928689127 100644 --- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java +++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java @@ -43,6 +43,8 @@ import org.apache.lucene.document.IntField; import org.apache.lucene.document.IntPoint; import org.apache.lucene.document.InvertableType; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; @@ -53,12 +55,15 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableFieldType; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; @@ -68,6 +73,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; @@ -76,6 +82,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermStatistics; +import org.apache.lucene.search.VectorScorer; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.Similarity; @@ -741,6 +748,172 @@ public void testStoredFields() throws IOException { d, "multibinary", new BytesRef[] {new BytesRef("bbar"), new BytesRef("bbaz")}); } + public void testKnnFloatVectorOnlyOneVectorAllowed() throws IOException { + Document doc = new Document(); + doc.add(new KnnFloatVectorField("knnFloatA", new float[] {1.0f, 2.0f})); + doc.add(new KnnFloatVectorField("knnFloatA", new float[] {3.0f, 4.0f})); + expectThrows( + IllegalArgumentException.class, + () -> MemoryIndex.fromDocument(doc, new StandardAnalyzer())); + } + + public void testKnnFloatVectors() throws IOException { + List fields = new ArrayList<>(); + fields.add(new KnnFloatVectorField("knnFloatA", new float[] {1.0f, 2.0f})); + fields.add(new KnnFloatVectorField("knnFloatB", new float[] {3.0f, 4.0f, 5.0f, 6.0f})); + fields.add( + new KnnFloatVectorField( + "knnFloatC", new float[] {7.0f, 8.0f, 9.0f}, VectorSimilarityFunction.DOT_PRODUCT)); + Collections.shuffle(fields, random()); + Document doc = new Document(); + for (IndexableField f : fields) { + doc.add(f); + } + + MemoryIndex mi = MemoryIndex.fromDocument(doc, new StandardAnalyzer()); + 
assertFloatVectorValue(mi, "knnFloatA", new float[] {1.0f, 2.0f}); + assertFloatVectorValue(mi, "knnFloatB", new float[] {3.0f, 4.0f, 5.0f, 6.0f}); + assertFloatVectorValue(mi, "knnFloatC", new float[] {7.0f, 8.0f, 9.0f}); + + assertFloatVectorScore(mi, "knnFloatA", new float[] {1.0f, 1.0f}, 0.5f); + assertFloatVectorScore(mi, "knnFloatB", new float[] {3.0f, 3.0f, 3.0f, 3.0f}, 0.06666667f); + assertFloatVectorScore(mi, "knnFloatC", new float[] {7.0f, 7.0f, 7.0f}, 84.5f); + + assertNull( + mi.createSearcher() + .getIndexReader() + .leaves() + .get(0) + .reader() + .getFloatVectorValues("knnFloatMissing")); + assertNull( + mi.createSearcher() + .getIndexReader() + .leaves() + .get(0) + .reader() + .getByteVectorValues("knnByteVectorValue")); + } + + public void testKnnByteVectorOnlyOneVectorAllowed() throws IOException { + Document doc = new Document(); + doc.add(new KnnByteVectorField("knnByteA", new byte[] {1, 2})); + doc.add(new KnnByteVectorField("knnByteA", new byte[] {3, 4})); + expectThrows( + IllegalArgumentException.class, + () -> MemoryIndex.fromDocument(doc, new StandardAnalyzer())); + } + + public void testKnnByteVectors() throws IOException { + List fields = new ArrayList<>(); + fields.add(new KnnByteVectorField("knnByteA", new byte[] {1, 2})); + fields.add(new KnnByteVectorField("knnByteB", new byte[] {3, 4, 5, 6})); + fields.add( + new KnnByteVectorField( + "knnByteC", new byte[] {7, 8, 9}, VectorSimilarityFunction.DOT_PRODUCT)); + Collections.shuffle(fields, random()); + Document doc = new Document(); + for (IndexableField f : fields) { + doc.add(f); + } + + MemoryIndex mi = MemoryIndex.fromDocument(doc, new StandardAnalyzer()); + assertByteVectorValue(mi, "knnByteA", new byte[] {1, 2}); + assertByteVectorValue(mi, "knnByteB", new byte[] {3, 4, 5, 6}); + assertByteVectorValue(mi, "knnByteC", new byte[] {7, 8, 9}); + + assertByteVectorScore(mi, "knnByteA", new byte[] {1, 1}, 0.5f); + assertByteVectorScore(mi, "knnByteB", new byte[] {3, 3, 3, 3}, 0.06666667f); + assertByteVectorScore(mi, "knnByteC", new byte[] {7, 7, 7}, 0.501709f); + + assertNull( + mi.createSearcher() + .getIndexReader() + .leaves() + .get(0) + .reader() + .getByteVectorValues("knnByteMissing")); + assertNull( + mi.createSearcher() + .getIndexReader() + .leaves() + .get(0) + .reader() + .getFloatVectorValues("knnFloatVectorValue")); + } + + private static void assertFloatVectorValue(MemoryIndex mi, String fieldName, float[] expected) + throws IOException { + FloatVectorValues fvv = + mi.createSearcher() + .getIndexReader() + .leaves() + .get(0) + .reader() + .getFloatVectorValues(fieldName); + assertNotNull(fvv); + KnnVectorValues.DocIndexIterator iterator = fvv.iterator(); + assertEquals(0, iterator.nextDoc()); + assertArrayEquals(expected, fvv.vectorValue(0), 1e-6f); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc()); + } + + private static void assertFloatVectorScore( + MemoryIndex mi, String fieldName, float[] queryVector, float expectedScore) + throws IOException { + FloatVectorValues fvv = + mi.createSearcher() + .getIndexReader() + .leaves() + .get(0) + .reader() + .getFloatVectorValues(fieldName); + assertNotNull(fvv); + if (random().nextBoolean()) { + fvv.iterator().nextDoc(); + } + VectorScorer scorer = fvv.scorer(queryVector); + assertEquals(0, scorer.iterator().nextDoc()); + assertEquals(expectedScore, scorer.score(), 0.0f); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, scorer.iterator().nextDoc()); + } + + private static void assertByteVectorValue(MemoryIndex mi, String fieldName, 
byte[] expected) + throws IOException { + ByteVectorValues bvv = + mi.createSearcher() + .getIndexReader() + .leaves() + .get(0) + .reader() + .getByteVectorValues(fieldName); + assertNotNull(bvv); + KnnVectorValues.DocIndexIterator iterator = bvv.iterator(); + assertEquals(0, iterator.nextDoc()); + assertArrayEquals(expected, bvv.vectorValue(0)); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc()); + } + + private static void assertByteVectorScore( + MemoryIndex mi, String fieldName, byte[] queryVector, float expectedScore) + throws IOException { + ByteVectorValues bvv = + mi.createSearcher() + .getIndexReader() + .leaves() + .get(0) + .reader() + .getByteVectorValues(fieldName); + assertNotNull(bvv); + if (random().nextBoolean()) { + bvv.iterator().nextDoc(); + } + VectorScorer scorer = bvv.scorer(queryVector); + assertEquals(0, scorer.iterator().nextDoc()); + assertEquals(expectedScore, scorer.score(), 0.0f); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, scorer.iterator().nextDoc()); + } + private static void assertContains( Document d, String field, Object expected, Function value) { assertNotNull(d.getField(field)); @@ -840,28 +1013,14 @@ public void testIntegerNumericDocValue() throws IOException { assertEquals(50, sndv.nextValue()); } - private static class MockIndexableField implements IndexableField { - - private final String field; - private final BytesRef value; - private final IndexableFieldType fieldType; - - MockIndexableField(String field, BytesRef value, IndexableFieldType fieldType) { - this.field = field; - this.value = value; - this.fieldType = fieldType; - } + private record MockIndexableField(String field, BytesRef value, IndexableFieldType fieldType) + implements IndexableField { @Override public String name() { return field; } - @Override - public IndexableFieldType fieldType() { - return fieldType; - } - @Override public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { return null; diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstDirectory.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstDirectory.java index 74e1d52aab88..0de8147a487c 100644 --- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstDirectory.java +++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstDirectory.java @@ -235,7 +235,7 @@ public void assertAllQueries(MemoryIndex memory, Directory directory, Analyzer a for (String query : queries) { TopDocs ramDocs = ram.search(qp.parse(query), 1); TopDocs memDocs = mem.search(qp.parse(query), 1); - assertEquals(query, ramDocs.totalHits.value, memDocs.totalHits.value); + assertEquals(query, ramDocs.totalHits.value(), memDocs.totalHits.value()); } reader.close(); } @@ -665,7 +665,7 @@ public void testEmptyString() throws IOException { memory.addField("foo", new CannedTokenStream(new Token("", 0, 5))); IndexSearcher searcher = memory.createSearcher(); TopDocs docs = searcher.search(new TermQuery(new Term("foo", "")), 10); - assertEquals(1, docs.totalHits.value); + assertEquals(1, docs.totalHits.value()); TestUtil.checkReader(searcher.getIndexReader()); } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java index 3b667f3af167..44e32202fa88 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java @@ -45,10 
+45,16 @@ public class SweetSpotSimilarity extends ClassicSimilarity { private double tf_hyper_base = 1.3d; private float tf_hyper_xoffset = 10.0f; + /** Default constructor: parameter-free */ public SweetSpotSimilarity() { super(); } + /** Primary constructor. */ + public SweetSpotSimilarity(boolean discountOverlaps) { + super(discountOverlaps); + } + /** * Sets the baseline and minimum function variables for baselineTf * @@ -82,11 +88,10 @@ public void setHyperbolicTfFactors(float min, float max, double base, float xoff * * @see #lengthNorm */ - public void setLengthNormFactors(int min, int max, float steepness, boolean discountOverlaps) { + public void setLengthNormFactors(int min, int max, float steepness) { this.ln_min = min; this.ln_max = max; this.ln_steep = steepness; - this.discountOverlaps = discountOverlaps; } /** diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java index 43b6c7b9c04a..f51321fe8424 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java @@ -22,8 +22,8 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.RecursiveAction; +import java.util.concurrent.Callable; +import java.util.concurrent.Executor; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DocValues; @@ -37,6 +37,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -123,7 +124,6 @@ private NotEnoughRAMException(String message) { private float maxDocFreq; private int minPartitionSize; private int maxIters; - private ForkJoinPool forkJoinPool; private double ramBudgetMB; private Set fields; @@ -133,7 +133,6 @@ public BPIndexReorderer() { setMaxDocFreq(1f); setMinPartitionSize(DEFAULT_MIN_PARTITION_SIZE); setMaxIters(DEFAULT_MAX_ITERS); - setForkJoinPool(null); // 10% of the available heap size by default setRAMBudgetMB(Runtime.getRuntime().totalMemory() / 1024d / 1024d / 10d); setFields(null); @@ -181,20 +180,6 @@ public void setMaxIters(int maxIters) { this.maxIters = maxIters; } - /** - * Set the {@link ForkJoinPool} to run graph partitioning concurrently. - * - *
<p>
    NOTE: A value of {@code null} can be used to run in the current thread, which is the - * default. - */ - public void setForkJoinPool(ForkJoinPool forkJoinPool) { - this.forkJoinPool = forkJoinPool; - } - - private int getParallelism() { - return forkJoinPool == null ? 1 : forkJoinPool.getParallelism(); - } - /** * Set the amount of RAM that graph partitioning is allowed to use. More RAM allows running * faster. If not enough RAM is provided, a {@link NotEnoughRAMException} will be thrown. This is @@ -225,21 +210,18 @@ private static class PerThreadState { } } - private abstract class BaseRecursiveAction extends RecursiveAction { + private abstract class BaseRecursiveAction implements Callable { + protected final TaskExecutor executor; protected final int depth; - BaseRecursiveAction(int depth) { + BaseRecursiveAction(TaskExecutor executor, int depth) { + this.executor = executor; this.depth = depth; } protected final boolean shouldFork(int problemSize, int totalProblemSize) { - if (forkJoinPool == null) { - return false; - } - if (getSurplusQueuedTaskCount() > 3) { - // Fork tasks if this worker doesn't have more queued work than other workers - // See javadocs of #getSurplusQueuedTaskCount for more details + if (executor == null) { return false; } if (problemSize == totalProblemSize) { @@ -249,6 +231,18 @@ protected final boolean shouldFork(int problemSize, int totalProblemSize) { } return problemSize > FORK_THRESHOLD; } + + @Override + public abstract Void call(); + + protected final void invokeAll(BaseRecursiveAction... actions) { + assert executor != null : "Only call invokeAll if shouldFork returned true"; + try { + executor.invokeAll(Arrays.asList(actions)); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } } private class IndexReorderingTask extends BaseRecursiveAction { @@ -263,8 +257,9 @@ private class IndexReorderingTask extends BaseRecursiveAction { float[] biases, CloseableThreadLocal threadLocal, BitSet parents, + TaskExecutor executor, int depth) { - super(depth); + super(executor, depth); this.docIDs = docIDs; this.biases = biases; this.threadLocal = threadLocal; @@ -292,7 +287,7 @@ private static void computeDocFreqs(IntsRef docs, ForwardIndex forwardIndex, int } @Override - protected void compute() { + public Void call() { if (depth > 0) { Arrays.sort(docIDs.ints, docIDs.offset, docIDs.offset + docIDs.length); } else { @@ -302,7 +297,7 @@ protected void compute() { int halfLength = docIDs.length / 2; if (halfLength < minPartitionSize) { - return; + return null; } IntsRef left = new IntsRef(docIDs.ints, docIDs.offset, halfLength); @@ -349,7 +344,7 @@ protected void compute() { if (split == docIDs.offset) { // No good split on the left side either: this slice has a single parent document, no // reordering is possible. Stop recursing. - return; + return null; } } @@ -362,16 +357,17 @@ protected void compute() { // It is fine for all tasks to share the same docs / biases array since they all work on // different slices of the array at a given point in time. 
IndexReorderingTask leftTask = - new IndexReorderingTask(left, biases, threadLocal, parents, depth + 1); + new IndexReorderingTask(left, biases, threadLocal, parents, executor, depth + 1); IndexReorderingTask rightTask = - new IndexReorderingTask(right, biases, threadLocal, parents, depth + 1); + new IndexReorderingTask(right, biases, threadLocal, parents, executor, depth + 1); if (shouldFork(docIDs.length, docIDs.ints.length)) { invokeAll(leftTask, rightTask); } else { - leftTask.compute(); - rightTask.compute(); + leftTask.call(); + rightTask.call(); } + return null; } // used for asserts @@ -422,8 +418,9 @@ private boolean shuffle( leftDocFreqs, rightDocFreqs, threadLocal, + executor, depth) - .compute(); + .call(); if (parents != null) { for (int i = docIDs.offset, end = docIDs.offset + docIDs.length; i < end; ) { @@ -592,8 +589,9 @@ private class ComputeBiasTask extends BaseRecursiveAction { int[] fromDocFreqs, int[] toDocFreqs, CloseableThreadLocal threadLocal, + TaskExecutor executor, int depth) { - super(depth); + super(executor, depth); this.docs = docs; this.biases = biases; this.from = from; @@ -604,15 +602,15 @@ private class ComputeBiasTask extends BaseRecursiveAction { } @Override - protected void compute() { + public Void call() { final int problemSize = to - from; if (problemSize > 1 && shouldFork(problemSize, docs.length)) { final int mid = (from + to) >>> 1; invokeAll( new ComputeBiasTask( - docs, biases, from, mid, fromDocFreqs, toDocFreqs, threadLocal, depth), + docs, biases, from, mid, fromDocFreqs, toDocFreqs, threadLocal, executor, depth), new ComputeBiasTask( - docs, biases, mid, to, fromDocFreqs, toDocFreqs, threadLocal, depth)); + docs, biases, mid, to, fromDocFreqs, toDocFreqs, threadLocal, executor, depth)); } else { ForwardIndex forwardIndex = threadLocal.get().forwardIndex; try { @@ -623,6 +621,7 @@ protected void compute() { throw new UncheckedIOException(e); } } + return null; } /** @@ -707,12 +706,16 @@ public void close() throws IOException { } private int writePostings( - CodecReader reader, Set fields, Directory tempDir, DataOutput postingsOut) + CodecReader reader, + Set fields, + Directory tempDir, + DataOutput postingsOut, + int parallelism) throws IOException { final int maxNumTerms = (int) ((ramBudgetMB * 1024 * 1024 - docRAMRequirements(reader.maxDoc())) - / getParallelism() + / parallelism / termRAMRequirementsPerThreadPerTerm()); final int maxDocFreq = (int) ((double) this.maxDocFreq * reader.maxDoc()); @@ -825,9 +828,10 @@ public void onFinish() throws IOException { /** * Expert: Compute the {@link DocMap} that holds the new doc ID numbering. This is exposed to * enable integration into {@link BPReorderingMergePolicy}, {@link #reorder(CodecReader, - * Directory)} should be preferred in general. + * Directory, Executor)} should be preferred in general. */ - public Sorter.DocMap computeDocMap(CodecReader reader, Directory tempDir) throws IOException { + public Sorter.DocMap computeDocMap(CodecReader reader, Directory tempDir, Executor executor) + throws IOException { if (docRAMRequirements(reader.maxDoc()) >= ramBudgetMB * 1024 * 1024) { throw new NotEnoughRAMException( "At least " @@ -847,7 +851,8 @@ public Sorter.DocMap computeDocMap(CodecReader reader, Directory tempDir) throws } } - int[] newToOld = computePermutation(reader, fields, tempDir); + TaskExecutor taskExecutor = executor == null ? 
null : new TaskExecutor(executor); + int[] newToOld = computePermutation(reader, fields, tempDir, taskExecutor); int[] oldToNew = new int[newToOld.length]; for (int i = 0; i < newToOld.length; ++i) { oldToNew[newToOld[i]] = i; @@ -877,27 +882,42 @@ public int newToOld(int docID) { * evaluation efficiency. Note that the returned {@link CodecReader} is slow and should typically * be used in a call to {@link IndexWriter#addIndexes(CodecReader...)}. * + *
<p>
    The provided {@link Executor} can be used to perform reordering concurrently. A value of + * {@code null} indicates that reordering should be performed in the current thread. + * + *
<p>
    NOTE: The provided {@link Executor} must not reject tasks. + * * @throws NotEnoughRAMException if not enough RAM is provided */ - public CodecReader reorder(CodecReader reader, Directory tempDir) throws IOException { - Sorter.DocMap docMap = computeDocMap(reader, tempDir); + public CodecReader reorder(CodecReader reader, Directory tempDir, Executor executor) + throws IOException { + Sorter.DocMap docMap = computeDocMap(reader, tempDir, executor); return SortingCodecReader.wrap(reader, docMap, null); } /** * Compute a permutation of the doc ID space that reduces log gaps between consecutive postings. */ - private int[] computePermutation(CodecReader reader, Set fields, Directory dir) + private int[] computePermutation( + CodecReader reader, Set fields, Directory dir, TaskExecutor executor) throws IOException { TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(dir); + final int parallelism; + if (executor == null) { + parallelism = 1; + } else { + // Assume as many threads as processors + parallelism = Runtime.getRuntime().availableProcessors(); + } + final int maxDoc = reader.maxDoc(); ForwardIndex forwardIndex = null; IndexOutput postingsOutput = null; boolean success = false; try { postingsOutput = trackingDir.createTempOutput("postings", "", IOContext.DEFAULT); - int numTerms = writePostings(reader, fields, trackingDir, postingsOutput); + int numTerms = writePostings(reader, fields, trackingDir, postingsOutput, parallelism); CodecUtil.writeFooter(postingsOutput); postingsOutput.close(); final ForwardIndex finalForwardIndex = @@ -924,14 +944,7 @@ protected PerThreadState initialValue() { } }) { IntsRef docs = new IntsRef(sortedDocs, 0, sortedDocs.length); - IndexReorderingTask task = - new IndexReorderingTask(docs, new float[maxDoc], threadLocal, parents, 0); - if (forkJoinPool != null) { - forkJoinPool.execute(task); - task.join(); - } else { - task.compute(); - } + new IndexReorderingTask(docs, new float[maxDoc], threadLocal, parents, executor, 0).call(); } success = true; diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BPReorderingMergePolicy.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BPReorderingMergePolicy.java index 077b3891556b..5cd363192fac 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BPReorderingMergePolicy.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BPReorderingMergePolicy.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.Collections; import java.util.Map; +import java.util.concurrent.Executor; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.FilterMergePolicy; import org.apache.lucene.index.MergePolicy; @@ -129,11 +130,12 @@ public CodecReader wrapForMerge(CodecReader reader) throws IOException { } @Override - public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException { + public Sorter.DocMap reorder(CodecReader reader, Directory dir, Executor executor) + throws IOException { Sorter.DocMap docMap = null; if (reader.numDocs() >= minNumDocs) { try { - docMap = reorderer.computeDocMap(reader, dir); + docMap = reorderer.computeDocMap(reader, dir, executor); } catch ( @SuppressWarnings("unused") NotEnoughRAMException e) { diff --git a/lucene/misc/src/java/org/apache/lucene/misc/util/fst/UpToTwoPositiveIntOutputs.java b/lucene/misc/src/java/org/apache/lucene/misc/util/fst/UpToTwoPositiveIntOutputs.java index c35a03dfc9fe..c7e2da715520 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/util/fst/UpToTwoPositiveIntOutputs.java 
+++ b/lucene/misc/src/java/org/apache/lucene/misc/util/fst/UpToTwoPositiveIntOutputs.java @@ -42,36 +42,11 @@ public final class UpToTwoPositiveIntOutputs extends Outputs { /** Holds two long outputs. */ - public static final class TwoLongs { - public final long first; - public final long second; - - public TwoLongs(long first, long second) { - this.first = first; - this.second = second; + public record TwoLongs(long first, long second) { + public TwoLongs { assert first >= 0; assert second >= 0; } - - @Override - public String toString() { - return "TwoLongs:" + first + "," + second; - } - - @Override - public boolean equals(Object _other) { - if (_other instanceof TwoLongs) { - final TwoLongs other = (TwoLongs) _other; - return first == other.first && second == other.second; - } else { - return false; - } - } - - @Override - public int hashCode() { - return (int) ((first ^ (first >>> 32)) ^ (second ^ (second >> 32))); - } } private static final Long NO_OUTPUT = 0L; diff --git a/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java b/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java index 17094692c7c3..feb66476005d 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java @@ -74,7 +74,7 @@ private static Explanation findExplanation(Explanation expl, String text) { public void testSweetSpotComputeNorm() throws IOException { final SweetSpotSimilarity ss = new SweetSpotSimilarity(); - ss.setLengthNormFactors(1, 1, 0.5f, true); + ss.setLengthNormFactors(1, 1, 0.5f); Similarity d = new ClassicSimilarity(); Similarity s = ss; @@ -87,7 +87,7 @@ public void testSweetSpotComputeNorm() throws IOException { // make a sweet spot - ss.setLengthNormFactors(3, 10, 0.5f, true); + ss.setLengthNormFactors(3, 10, 0.5f); for (int i = 3; i <= 10; i++) { assertEquals("3,10: spot i=" + i, 1.0f, computeNorm(ss, "bogus", i), 0.0f); @@ -101,14 +101,14 @@ public void testSweetSpotComputeNorm() throws IOException { // separate sweet spot for certain fields - final SweetSpotSimilarity ssBar = new SweetSpotSimilarity(); - ssBar.setLengthNormFactors(8, 13, 0.5f, false); - final SweetSpotSimilarity ssYak = new SweetSpotSimilarity(); - ssYak.setLengthNormFactors(6, 9, 0.5f, false); - final SweetSpotSimilarity ssA = new SweetSpotSimilarity(); - ssA.setLengthNormFactors(5, 8, 0.5f, false); - final SweetSpotSimilarity ssB = new SweetSpotSimilarity(); - ssB.setLengthNormFactors(5, 8, 0.1f, false); + final SweetSpotSimilarity ssBar = new SweetSpotSimilarity(false); + ssBar.setLengthNormFactors(8, 13, 0.5f); + final SweetSpotSimilarity ssYak = new SweetSpotSimilarity(false); + ssYak.setLengthNormFactors(6, 9, 0.5f); + final SweetSpotSimilarity ssA = new SweetSpotSimilarity(false); + ssA.setLengthNormFactors(5, 8, 0.5f); + final SweetSpotSimilarity ssB = new SweetSpotSimilarity(false); + ssB.setLengthNormFactors(5, 8, 0.1f); Similarity sp = new PerFieldSimilarityWrapper() { diff --git a/lucene/misc/src/test/org/apache/lucene/misc/document/TestLazyDocument.java b/lucene/misc/src/test/org/apache/lucene/misc/document/TestLazyDocument.java index ceeea076a3ad..56e4fa2d0dda 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/document/TestLazyDocument.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/document/TestLazyDocument.java @@ -210,7 +210,7 @@ public Status needsField(FieldInfo fieldInfo) { @Override public void stringField(FieldInfo fieldInfo, String value) throws 
IOException { final FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setStoreTermVectors(fieldInfo.hasVectors()); + ft.setStoreTermVectors(fieldInfo.hasTermVectors()); ft.setOmitNorms(fieldInfo.omitsNorms()); ft.setIndexOptions(fieldInfo.getIndexOptions()); Objects.requireNonNull(value, "String value should not be null"); diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java index f4322ff600a5..b7da3088df2c 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java @@ -116,11 +116,10 @@ public void doTestSingleTerm(ForkJoinPool pool) throws IOException { CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafRealer); BPIndexReorderer reorderer = new BPIndexReorderer(); - reorderer.setForkJoinPool(pool); reorderer.setMinDocFreq(2); reorderer.setMinPartitionSize(1); reorderer.setMaxIters(10); - CodecReader reordered = reorderer.reorder(codecReader, dir); + CodecReader reordered = reorderer.reorder(codecReader, dir, pool); String[] ids = new String[codecReader.maxDoc()]; StoredFields storedFields = reordered.storedFields(); for (int i = 0; i < codecReader.maxDoc(); ++i) { @@ -180,11 +179,10 @@ private void doTestSingleTermWithBlocks(ForkJoinPool pool) throws IOException { CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafRealer); BPIndexReorderer reorderer = new BPIndexReorderer(); - reorderer.setForkJoinPool(pool); reorderer.setMinDocFreq(2); reorderer.setMinPartitionSize(1); reorderer.setMaxIters(10); - CodecReader reordered = reorderer.reorder(codecReader, dir); + CodecReader reordered = reorderer.reorder(codecReader, dir, pool); StoredFields storedFields = reordered.storedFields(); assertEquals("2", storedFields.document(0).get("id")); @@ -307,7 +305,7 @@ public void testMultiTerm() throws IOException { reorderer.setMinDocFreq(2); reorderer.setMinPartitionSize(1); reorderer.setMaxIters(10); - CodecReader reordered = reorderer.reorder(codecReader, dir); + CodecReader reordered = reorderer.reorder(codecReader, dir, null); String[] ids = new String[codecReader.maxDoc()]; StoredFields storedFields = reordered.storedFields(); for (int i = 0; i < codecReader.maxDoc(); ++i) { diff --git a/lucene/misc/src/test/org/apache/lucene/misc/util/fst/TestFSTsMisc.java b/lucene/misc/src/test/org/apache/lucene/misc/util/fst/TestFSTsMisc.java index 44c342b29ab7..5d10c6a4911a 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/util/fst/TestFSTsMisc.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/util/fst/TestFSTsMisc.java @@ -114,10 +114,12 @@ private void doTest(int inputMode, IntsRef[] terms) throws IOException { protected boolean outputsEqual(Object output1, Object output2) { if (output1 instanceof TwoLongs && output2 instanceof List) { TwoLongs twoLongs1 = (TwoLongs) output1; - return Arrays.asList(new Long[] {twoLongs1.first, twoLongs1.second}).equals(output2); + return Arrays.asList(new Long[] {twoLongs1.first(), twoLongs1.second()}) + .equals(output2); } else if (output2 instanceof TwoLongs && output1 instanceof List) { TwoLongs twoLongs2 = (TwoLongs) output2; - return Arrays.asList(new Long[] {twoLongs2.first, twoLongs2.second}).equals(output1); + return Arrays.asList(new Long[] {twoLongs2.first(), twoLongs2.second()}) + .equals(output1); } return output1.equals(output2); } diff --git 
a/lucene/monitor/src/java/org/apache/lucene/monitor/CandidateMatcher.java b/lucene/monitor/src/java/org/apache/lucene/monitor/CandidateMatcher.java index 6132e195edba..055f1b07c87c 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/CandidateMatcher.java +++ b/lucene/monitor/src/java/org/apache/lucene/monitor/CandidateMatcher.java @@ -64,7 +64,7 @@ public CandidateMatcher(IndexSearcher searcher) { * @param metadata the query metadata * @throws IOException on IO errors */ - protected abstract void matchQuery(String queryId, Query matchQuery, Map metadata) + public abstract void matchQuery(String queryId, Query matchQuery, Map metadata) throws IOException; /** @@ -95,14 +95,14 @@ protected final void addMatch(T match, int doc) { public abstract T resolve(T match1, T match2); /** Called by the Monitor if running a query throws an Exception */ - void reportError(String queryId, Exception e) { + public void reportError(String queryId, Exception e) { this.errors.put(queryId, e); } /** * @return the matches from this matcher */ - final MultiMatchingQueries finish(long buildTime, int queryCount) { + public final MultiMatchingQueries finish(long buildTime, int queryCount) { doFinish(); this.searchTime = TimeUnit.MILLISECONDS.convert(System.nanoTime() - searchTime, TimeUnit.NANOSECONDS); diff --git a/lucene/monitor/src/java/org/apache/lucene/monitor/CollectingMatcher.java b/lucene/monitor/src/java/org/apache/lucene/monitor/CollectingMatcher.java index 02d8e3b6ba20..c434672fd68d 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/CollectingMatcher.java +++ b/lucene/monitor/src/java/org/apache/lucene/monitor/CollectingMatcher.java @@ -18,7 +18,9 @@ package org.apache.lucene.monitor; import java.io.IOException; +import java.util.Collection; import java.util.Map; +import org.apache.lucene.search.CollectorManager; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorable; @@ -35,9 +37,31 @@ abstract class CollectingMatcher extends CandidateMatcher< } @Override - protected void matchQuery(final String queryId, Query matchQuery, Map metadata) + public void matchQuery(final String queryId, Query matchQuery, Map metadata) throws IOException { - searcher.search(matchQuery, new MatchCollector(queryId, scoreMode)); + MatchCollector matchCollector = new MatchCollector(queryId, scoreMode); + searcher.search( + matchQuery, + new CollectorManager() { + boolean newCollectorInvoked = false; + + @Override + public MatchCollector newCollector() { + if (newCollectorInvoked) { + throw new IllegalStateException( + "newCollector should be invoked at most once. 
Ensure your IndexSearcher has been created without an Executor."); + } + newCollectorInvoked = true; + return matchCollector; + } + + @Override + public Void reduce(Collection collectors) { + assert collectors.size() == 1 + : "collectors should contain exactly one collector instance"; + return null; + } + }); } /** diff --git a/lucene/monitor/src/java/org/apache/lucene/monitor/ExplainingMatch.java b/lucene/monitor/src/java/org/apache/lucene/monitor/ExplainingMatch.java index 1b4aa4044ef0..19e124f1a3f9 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/ExplainingMatch.java +++ b/lucene/monitor/src/java/org/apache/lucene/monitor/ExplainingMatch.java @@ -31,8 +31,8 @@ public class ExplainingMatch extends QueryMatch { searcher -> new CandidateMatcher(searcher) { @Override - protected void matchQuery( - String queryId, Query matchQuery, Map metadata) throws IOException { + public void matchQuery(String queryId, Query matchQuery, Map metadata) + throws IOException { int maxDocs = searcher.getIndexReader().maxDoc(); for (int i = 0; i < maxDocs; i++) { Explanation explanation = searcher.explain(matchQuery, i); diff --git a/lucene/monitor/src/java/org/apache/lucene/monitor/HighlightsMatch.java b/lucene/monitor/src/java/org/apache/lucene/monitor/HighlightsMatch.java index aadcc75fb6ed..013c1bf62824 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/HighlightsMatch.java +++ b/lucene/monitor/src/java/org/apache/lucene/monitor/HighlightsMatch.java @@ -46,8 +46,8 @@ public class HighlightsMatch extends QueryMatch { new CandidateMatcher(searcher) { @Override - protected void matchQuery( - String queryId, Query matchQuery, Map metadata) throws IOException { + public void matchQuery(String queryId, Query matchQuery, Map metadata) + throws IOException { Weight w = searcher.createWeight( searcher.rewrite(matchQuery), ScoreMode.COMPLETE_NO_SCORES, 1); @@ -180,47 +180,16 @@ void addHit(String field, int startPos, int endPos, int startOffset, int endOffs hitSet.add(new Hit(startPos, startOffset, endPos, endOffset)); } - /** Represents an individual hit */ - public static class Hit implements Comparable { - - /** The start position */ - public final int startPosition; - - /** The start offset */ - public final int startOffset; - - /** The end positions */ - public final int endPosition; - - /** The end offset */ - public final int endOffset; - - public Hit(int startPosition, int startOffset, int endPosition, int endOffset) { - this.startPosition = startPosition; - this.startOffset = startOffset; - this.endPosition = endPosition; - this.endOffset = endOffset; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (!(obj instanceof Hit)) return false; - Hit other = (Hit) obj; - return this.startOffset == other.startOffset - && this.endOffset == other.endOffset - && this.startPosition == other.startPosition - && this.endPosition == other.endPosition; - } - - @Override - public int hashCode() { - int result = startPosition; - result = 31 * result + startOffset; - result = 31 * result + endPosition; - result = 31 * result + endOffset; - return result; - } + /** + * Represents an individual hit + * + * @param startPosition The start position + * @param startOffset The start offset + * @param endPosition The end position + * @param endOffset The end offset + */ + public record Hit(int startPosition, int startOffset, int endPosition, int endOffset) + implements Comparable { @Override public String toString() { diff --git 
a/lucene/monitor/src/java/org/apache/lucene/monitor/Monitor.java b/lucene/monitor/src/java/org/apache/lucene/monitor/Monitor.java index affb851ee512..e450f687c1a1 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/Monitor.java +++ b/lucene/monitor/src/java/org/apache/lucene/monitor/Monitor.java @@ -116,24 +116,14 @@ public QueryCacheStats getQueryCacheStats() throws IOException { queryIndex.numDocs(), queryIndex.cacheSize(), queryIndex.getLastPurged()); } - /** Statistics for the query cache and query index */ - public static class QueryCacheStats { - - /** Total number of queries in the query index */ - public final int queries; - - /** Total number of queries int the query cache */ - public final int cachedQueries; - - /** Time the query cache was last purged */ - public final long lastPurged; - - public QueryCacheStats(int queries, int cachedQueries, long lastPurged) { - this.queries = queries; - this.cachedQueries = cachedQueries; - this.lastPurged = lastPurged; - } - } + /** + * Statistics for the query cache and query index + * + * @param queries Total number of queries in the query index + * @param cachedQueries Total number of queries int the query cache + * @param lastPurged Time the query cache was last purged + */ + public record QueryCacheStats(int queries, int cachedQueries, long lastPurged) {} /** * Remove unused queries from the query cache. diff --git a/lucene/monitor/src/java/org/apache/lucene/monitor/ParallelMatcher.java b/lucene/monitor/src/java/org/apache/lucene/monitor/ParallelMatcher.java index fd6d7efceb35..e6974b234f87 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/ParallelMatcher.java +++ b/lucene/monitor/src/java/org/apache/lucene/monitor/ParallelMatcher.java @@ -74,7 +74,7 @@ private ParallelMatcher( } @Override - protected void matchQuery(String queryId, Query matchQuery, Map metadata) + public void matchQuery(String queryId, Query matchQuery, Map metadata) throws IOException { try { queue.put(new MatcherTask(queryId, matchQuery, metadata)); @@ -138,35 +138,15 @@ public CandidateMatcher call() { } } - private static class MatcherTask { - - final String id; - final Query matchQuery; - final Map metadata; - - private MatcherTask(String id, Query matchQuery, Map metadata) { - this.id = id; - this.matchQuery = matchQuery; - this.metadata = metadata; - } - } + private record MatcherTask(String id, Query matchQuery, Map metadata) {} /* Marker object placed on the queue after all matches are done, to indicate to the worker threads that they should finish */ private static final MatcherTask END = new MatcherTask("", null, Collections.emptyMap()); - private static class ParallelMatcherFactory implements MatcherFactory { - - private final ExecutorService executor; - private final MatcherFactory matcherFactory; - private final int threads; - - ParallelMatcherFactory( - ExecutorService executor, MatcherFactory matcherFactory, int threads) { - this.executor = executor; - this.matcherFactory = matcherFactory; - this.threads = threads; - } + private record ParallelMatcherFactory( + ExecutorService executor, MatcherFactory matcherFactory, int threads) + implements MatcherFactory { @Override public ParallelMatcher createMatcher(IndexSearcher searcher) { diff --git a/lucene/monitor/src/java/org/apache/lucene/monitor/PartitionMatcher.java b/lucene/monitor/src/java/org/apache/lucene/monitor/PartitionMatcher.java index aaf1f576ecb7..d432f8437c83 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/PartitionMatcher.java +++ 
b/lucene/monitor/src/java/org/apache/lucene/monitor/PartitionMatcher.java @@ -51,18 +51,7 @@ public class PartitionMatcher extends CandidateMatcher private final CandidateMatcher resolvingMatcher; - private static class MatchTask { - - final String queryId; - final Query matchQuery; - final Map metadata; - - private MatchTask(String queryId, Query matchQuery, Map metadata) { - this.queryId = queryId; - this.matchQuery = matchQuery; - this.metadata = metadata; - } - } + private record MatchTask(String queryId, Query matchQuery, Map metadata) {} private final List tasks = new ArrayList<>(); @@ -79,7 +68,7 @@ private PartitionMatcher( } @Override - protected void matchQuery(String queryId, Query matchQuery, Map metadata) { + public void matchQuery(String queryId, Query matchQuery, Map metadata) { tasks.add(new MatchTask(queryId, matchQuery, metadata)); } @@ -135,18 +124,9 @@ public MultiMatchingQueries call() { } } - private static class PartitionMatcherFactory implements MatcherFactory { - - private final ExecutorService executor; - private final MatcherFactory matcherFactory; - private final int threads; - - PartitionMatcherFactory( - ExecutorService executor, MatcherFactory matcherFactory, int threads) { - this.executor = executor; - this.matcherFactory = matcherFactory; - this.threads = threads; - } + private record PartitionMatcherFactory( + ExecutorService executor, MatcherFactory matcherFactory, int threads) + implements MatcherFactory { @Override public PartitionMatcher createMatcher(IndexSearcher searcher) { diff --git a/lucene/monitor/src/java/org/apache/lucene/monitor/QueryTimeListener.java b/lucene/monitor/src/java/org/apache/lucene/monitor/QueryTimeListener.java index a787724d1815..b9e2b80f0d92 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/QueryTimeListener.java +++ b/lucene/monitor/src/java/org/apache/lucene/monitor/QueryTimeListener.java @@ -39,7 +39,7 @@ static MatcherFactory timingMatcher( CandidateMatcher matcher = factory.createMatcher(searcher); return new CandidateMatcher(searcher) { @Override - protected void matchQuery(String queryId, Query matchQuery, Map metadata) + public void matchQuery(String queryId, Query matchQuery, Map metadata) throws IOException { long t = System.nanoTime(); matcher.matchQuery(queryId, matchQuery, metadata); diff --git a/lucene/monitor/src/java/org/apache/lucene/monitor/WritableQueryIndex.java b/lucene/monitor/src/java/org/apache/lucene/monitor/WritableQueryIndex.java index b51ad40d9c4b..69ab393b9bfb 100644 --- a/lucene/monitor/src/java/org/apache/lucene/monitor/WritableQueryIndex.java +++ b/lucene/monitor/src/java/org/apache/lucene/monitor/WritableQueryIndex.java @@ -130,15 +130,7 @@ private void commitWithoutNotify(List updates) throws IOException } } - private static class Indexable { - final QueryCacheEntry queryCacheEntry; - final Document document; - - private Indexable(QueryCacheEntry queryCacheEntry, Document document) { - this.queryCacheEntry = queryCacheEntry; - this.document = document; - } - } + private record Indexable(QueryCacheEntry queryCacheEntry, Document document) {} private void populateQueryCache(MonitorQuerySerializer serializer, QueryDecomposer decomposer) throws IOException { diff --git a/lucene/monitor/src/test/org/apache/lucene/monitor/TestCachePurging.java b/lucene/monitor/src/test/org/apache/lucene/monitor/TestCachePurging.java index a020c23d3364..e681d173b07f 100644 --- a/lucene/monitor/src/test/org/apache/lucene/monitor/TestCachePurging.java +++ 
b/lucene/monitor/src/test/org/apache/lucene/monitor/TestCachePurging.java @@ -51,7 +51,7 @@ public void onPurge() { monitor.register(queries); assertEquals(3, monitor.getQueryCount()); assertEquals(4, monitor.getDisjunctCount()); - assertEquals(4, monitor.getQueryCacheStats().cachedQueries); + assertEquals(4, monitor.getQueryCacheStats().cachedQueries()); Document doc = new Document(); doc.add(newTextField("field", "test1 test2 test3", Field.Store.NO)); @@ -59,11 +59,11 @@ public void onPurge() { monitor.deleteById("1"); assertEquals(2, monitor.getQueryCount()); - assertEquals(4, monitor.getQueryCacheStats().cachedQueries); + assertEquals(4, monitor.getQueryCacheStats().cachedQueries()); assertEquals(2, monitor.match(doc, QueryMatch.SIMPLE_MATCHER).getMatchCount()); monitor.purgeCache(); - assertEquals(2, monitor.getQueryCacheStats().cachedQueries); + assertEquals(2, monitor.getQueryCacheStats().cachedQueries()); MatchingQueries result = monitor.match(doc, QueryMatch.SIMPLE_MATCHER); assertEquals(2, result.getMatchCount()); @@ -109,13 +109,13 @@ private static void doConcurrentPurgesAndUpdatesTest() throws Exception { monitor.deleteById(Integer.toString(i)); } - assertEquals(200, monitor.getQueryCacheStats().cachedQueries); + assertEquals(200, monitor.getQueryCacheStats().cachedQueries()); startUpdating.countDown(); monitor.purgeCache(); finishUpdating.await(); - assertEquals(340, monitor.getQueryCacheStats().cachedQueries); + assertEquals(340, monitor.getQueryCacheStats().cachedQueries()); Document doc = new Document(); doc.add(newTextField("field", "test", Field.Store.NO)); MatchingQueries matcher = monitor.match(doc, QueryMatch.SIMPLE_MATCHER); @@ -137,15 +137,15 @@ public void testBackgroundPurges() throws IOException, InterruptedException { new MonitorConfiguration().setPurgeFrequency(50, TimeUnit.MILLISECONDS); try (Monitor monitor = new Monitor(ANALYZER, Presearcher.NO_FILTERING, config)) { - assertEquals(-1, monitor.getQueryCacheStats().lastPurged); + assertEquals(-1, monitor.getQueryCacheStats().lastPurged()); for (int i = 0; i < 100; i++) { monitor.register(newMonitorQuery(i)); } - assertEquals(100, monitor.getQueryCacheStats().cachedQueries); + assertEquals(100, monitor.getQueryCacheStats().cachedQueries()); monitor.deleteById("5"); - assertEquals(99, monitor.getQueryCacheStats().queries); + assertEquals(99, monitor.getQueryCacheStats().queries()); CountDownLatch latch = new CountDownLatch(1); monitor.addQueryIndexUpdateListener( @@ -154,7 +154,7 @@ public void testBackgroundPurges() throws IOException, InterruptedException { public void onPurge() { // It can sometimes take a couple of purge runs to get everything in sync try { - if (monitor.getQueryCacheStats().cachedQueries == 99) latch.countDown(); + if (monitor.getQueryCacheStats().cachedQueries() == 99) latch.countDown(); } catch (IOException e) { // Ignore throw new RuntimeException(e); @@ -163,9 +163,9 @@ public void onPurge() { }); assertTrue(latch.await(5, TimeUnit.SECONDS)); - assertEquals(99, monitor.getQueryCacheStats().queries); - assertEquals(99, monitor.getQueryCacheStats().cachedQueries); - assertTrue(monitor.getQueryCacheStats().lastPurged > 0); + assertEquals(99, monitor.getQueryCacheStats().queries()); + assertEquals(99, monitor.getQueryCacheStats().cachedQueries()); + assertTrue(monitor.getQueryCacheStats().lastPurged() > 0); } } } diff --git a/lucene/monitor/src/test/org/apache/lucene/monitor/TestMonitor.java b/lucene/monitor/src/test/org/apache/lucene/monitor/TestMonitor.java index 
aa5f3452764f..52675c3f9a7d 100644 --- a/lucene/monitor/src/test/org/apache/lucene/monitor/TestMonitor.java +++ b/lucene/monitor/src/test/org/apache/lucene/monitor/TestMonitor.java @@ -178,7 +178,7 @@ public void testMatcherMetadata() throws IOException { docs -> new CandidateMatcher(docs) { @Override - protected void matchQuery( + public void matchQuery( String queryId, Query matchQuery, Map metadata) { assertEquals("value", metadata.get("key")); } diff --git a/lucene/monitor/src/test/org/apache/lucene/monitor/outsidepackage/TestCandidateMatcherVisibility.java b/lucene/monitor/src/test/org/apache/lucene/monitor/outsidepackage/TestCandidateMatcherVisibility.java new file mode 100644 index 000000000000..73e84112e387 --- /dev/null +++ b/lucene/monitor/src/test/org/apache/lucene/monitor/outsidepackage/TestCandidateMatcherVisibility.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.monitor.outsidepackage; + +import java.io.IOException; +import java.util.Collections; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.monitor.CandidateMatcher; +import org.apache.lucene.monitor.QueryMatch; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.junit.Test; + +public class TestCandidateMatcherVisibility { + + private CandidateMatcher newCandidateMatcher() { + // Index and searcher for use in creating a matcher + MemoryIndex index = new MemoryIndex(); + final IndexSearcher searcher = index.createSearcher(); + return QueryMatch.SIMPLE_MATCHER.createMatcher(searcher); + } + + @Test + public void testMatchQueryVisibleOutsidePackage() throws IOException { + CandidateMatcher matcher = newCandidateMatcher(); + // This should compile from outside org.apache.lucene.monitor package + // (subpackage org.apache.lucene.monitor.outsidepackage cannot access package-private content + // from org.apache.lucene.monitor) + matcher.matchQuery("test", new TermQuery(new Term("test_field")), Collections.emptyMap()); + } + + @Test + public void testReportErrorVisibleOutsidePackage() { + CandidateMatcher matcher = newCandidateMatcher(); + // This should compile from outside org.apache.lucene.monitor package + // (subpackage org.apache.lucene.monitor.outsidepackage cannot access package-private content + // from org.apache.lucene.monitor) + matcher.reportError("test", new RuntimeException("test exception")); + } + + @Test + public void testFinishVisibleOutsidePackage() { + CandidateMatcher matcher = newCandidateMatcher(); + // This should compile from outside org.apache.lucene.monitor package + // (subpackage org.apache.lucene.monitor.outsidepackage cannot access package-private content + // from 
org.apache.lucene.monitor) + matcher.finish(0, 0); + } +} diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java index 32517496d542..c95bf632a73a 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java @@ -20,6 +20,7 @@ import java.util.Map; import java.util.Objects; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.VectorEncoding; @@ -63,11 +64,12 @@ protected DocIdSetIterator getVectorIterator() { } return new VectorFieldFunction(this) { + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); @Override public byte[] byteVectorVal(int doc) throws IOException { if (exists(doc)) { - return vectorValues.vectorValue(); + return vectorValues.vectorValue(iterator.index()); } else { return null; } @@ -75,7 +77,7 @@ public byte[] byteVectorVal(int doc) throws IOException { @Override protected DocIdSetIterator getVectorIterator() { - return vectorValues; + return iterator; } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java index 43cc3aff880e..f026d9537bc6 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java @@ -20,6 +20,7 @@ import java.util.Map; import java.util.Objects; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.VectorEncoding; @@ -62,11 +63,12 @@ protected DocIdSetIterator getVectorIterator() { } return new VectorFieldFunction(this) { + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); @Override public float[] floatVectorVal(int doc) throws IOException { if (exists(doc)) { - return vectorValues.vectorValue(); + return vectorValues.vectorValue(iterator.index()); } else { return null; } @@ -74,7 +76,7 @@ public float[] floatVectorVal(int doc) throws IOException { @Override protected DocIdSetIterator getVectorIterator() { - return vectorValues; + return iterator; } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java index 3e87a19c20e5..4373d77d3819 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java @@ -19,12 +19,12 @@ import java.io.IOException; import java.util.Map; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.queries.function.FunctionValues; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.docvalues.FloatDocValues; import 
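A sketch outside the patch: the two value-source changes above move to the Lucene 10 access pattern where a KnnVectorValues.DocIndexIterator drives iteration and vectorValue(...) is addressed by the iterator's dense ordinal rather than implicitly by the current doc. The reader/field plumbing below is an assumption for illustration:

import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.DocIdSetIterator;

class VectorIterationSketch {
  static void readAll(LeafReader reader, String field) throws IOException {
    FloatVectorValues values = reader.getFloatVectorValues(field);
    if (values == null) {
      return; // no float vectors for this field in this segment
    }
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      // vectorValue(...) takes the iterator's dense ordinal, not the doc id.
      float[] vector = values.vectorValue(it.index());
      // ... use doc and vector ...
    }
  }
}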
org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.search.similarities.TFIDFSimilarity; @@ -76,8 +76,7 @@ public FunctionValues getValues(Map context, LeafReaderContext r 1f, new CollectionStatistics(field, 1, 1, 1, 1), new TermStatistics(new BytesRef("bogus"), 1, 1)); - final LeafSimScorer leafSimScorer = - new LeafSimScorer(simScorer, readerContext.reader(), field, true); + final NumericDocValues norms = readerContext.reader().getNormValues(field); return new FloatDocValues(this) { int lastDocID = -1; @@ -88,7 +87,11 @@ public float floatVal(int docID) throws IOException { throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " docID=" + docID); } lastDocID = docID; - return leafSimScorer.score(docID, 1f); + long norm = 1L; + if (norms != null && norms.advanceExact(docID)) { + norm = norms.longValue(); + } + return simScorer.score(1f, norm); } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/BlockIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/BlockIntervalsSource.java index 91a5e94d5f2b..efb03b154ef3 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/BlockIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/BlockIntervalsSource.java @@ -38,6 +38,7 @@ private static List flatten(List sources) { List flattened = new ArrayList<>(); for (IntervalsSource s : sources) { if (s instanceof BlockIntervalsSource) { + // Block sources can be flattened because they do not increase the gap (gap = 0) flattened.addAll(((BlockIntervalsSource) s).subSources); } else { flattened.add(s); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java index 6f683537c6a0..63d66822cabb 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java @@ -370,16 +370,9 @@ public float matchCost() { } }; - private static class DisjunctionMatchesIterator implements IntervalMatchesIterator { - - final DisjunctionIntervalIterator it; - final List subs; - - private DisjunctionMatchesIterator( - DisjunctionIntervalIterator it, List subs) { - this.it = it; - this.subs = subs; - } + private record DisjunctionMatchesIterator( + DisjunctionIntervalIterator it, List subs) + implements IntervalMatchesIterator { @Override public boolean next() throws IOException { diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java index 7b9c933c1672..e3808f406cf6 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java @@ -238,6 +238,7 @@ public static IntervalsSource regexp(BytesRef regexp) { */ public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) { Automaton automaton = new RegExp(new Term("", regexp).text()).toAutomaton(); + automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); CompiledAutomaton ca = new CompiledAutomaton(automaton, 
false, true, false); return new MultiTermIntervalsSource(ca, maxExpansions, regexp.utf8ToString()); } @@ -246,8 +247,10 @@ public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) { * Return an {@link IntervalsSource} over the disjunction of all terms that fall within the given * range * - * @param lowerTerm The term text at the lower end of the range - * @param upperTerm The term text at the upper end of the range + * @param lowerTerm The term text at the lower end of the range; can be {@code null} to indicate + * an open-ended range at this end + * @param upperTerm The term text at the upper end of the range; can be {@code null} to indicate + * an open-ended range at this end * @param includeLower If true, the lowerTerm is included in the range * @param includeUpper If true, the upperTerm is included in the range * @throws IllegalStateException if the range expands to more than {@link #DEFAULT_MAX_EXPANSIONS} @@ -265,8 +268,10 @@ public static IntervalsSource range( *

    WARNING: Setting {@code maxExpansions} to higher than the default value of {@link * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive * - * @param lowerTerm The term text at the lower end of the range - * @param upperTerm The term text at the upper end of the range + * @param lowerTerm The term text at the lower end of the range; can be {@code null} to indicate + * an open-ended range at this end + * @param upperTerm The term text at the upper end of the range; can be {@code null} to indicate + * an open-ended range at this end * @param includeLower If true, the lowerTerm is included in the range * @param includeUpper If true, the upperTerm is included in the range * @param maxExpansions the maximum number of terms to expand to @@ -285,9 +290,9 @@ public static IntervalsSource range( StringBuilder buffer = new StringBuilder(); buffer.append("{"); - buffer.append(lowerTerm.utf8ToString()); + buffer.append(lowerTerm == null ? "* " : lowerTerm.utf8ToString()); buffer.append(","); - buffer.append(upperTerm.utf8ToString()); + buffer.append(upperTerm == null ? "*" : upperTerm.utf8ToString()); buffer.append("}"); return new MultiTermIntervalsSource(ca, maxExpansions, buffer.toString()); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java index 65fa6d033957..fc1588c8bdf5 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java @@ -30,25 +30,13 @@ static IntervalsSource build(List sources) { if (sources.size() == 1) { return sources.get(0); } - List rewritten = deduplicate(flatten(sources)); + List rewritten = deduplicate(sources); if (rewritten.size() == 1) { return rewritten.get(0); } return new OrderedIntervalsSource(rewritten); } - private static List flatten(List sources) { - List flattened = new ArrayList<>(); - for (IntervalsSource s : sources) { - if (s instanceof OrderedIntervalsSource) { - flattened.addAll(((OrderedIntervalsSource) s).subSources); - } else { - flattened.add(s); - } - } - return flattened; - } - private static List deduplicate(List sources) { List deduplicated = new ArrayList<>(); List current = new ArrayList<>(); @@ -136,38 +124,58 @@ public int nextInterval() throws IOException { start = end = slop = IntervalIterator.NO_MORE_INTERVALS; int lastStart = Integer.MAX_VALUE; boolean minimizing = false; + final var subIterators = this.subIterators; + int currentIndex = i; while (true) { + int prevEnd = subIterators.get(currentIndex - 1).end(); while (true) { - if (subIterators.get(i - 1).end() >= lastStart) { + if (prevEnd >= lastStart) { + i = currentIndex; return start; } - if (i == subIterators.size() - || (minimizing && subIterators.get(i).start() > subIterators.get(i - 1).end())) { + if (currentIndex == subIterators.size()) { + break; + } + final IntervalIterator current = subIterators.get(currentIndex); + if (minimizing && (current.start() > prevEnd)) { break; } + int currentStart; do { - if (subIterators.get(i).end() >= lastStart - || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + if (current.end() >= lastStart + || (currentStart = current.nextInterval()) == IntervalIterator.NO_MORE_INTERVALS) { + i = currentIndex; return start; } - } while (subIterators.get(i).start() <= subIterators.get(i - 1).end()); - i++; + } while (currentStart <= prevEnd); + 
currentIndex++; + prevEnd = current.end(); } - start = subIterators.get(0).start(); + var first = subIterators.get(0); + final int start = first.start(); + this.start = start; if (start == NO_MORE_INTERVALS) { + i = currentIndex; return end = NO_MORE_INTERVALS; } - end = subIterators.get(subIterators.size() - 1).end(); - slop = end - start + 1; - for (IntervalIterator subIterator : subIterators) { - slop -= subIterator.width(); + var last = subIterators.getLast(); + + final int end = last.end(); + this.end = end; + int slop = end - start + 1; + // use indexed loop since this is always a random access capable list to avoid allocations + // in a hot nested loop + for (int j = 0, n = subIterators.size(); j < n; j++) { + slop -= subIterators.get(j).width(); } + this.slop = slop; onMatch.onMatch(); - lastStart = subIterators.get(subIterators.size() - 1).start(); - i = 1; - if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + currentIndex = 1; + if (first.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + i = currentIndex; return start; } + lastStart = last.start(); minimizing = true; } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java index e5ca29b9fae5..c0f2f61d7cc1 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java @@ -21,8 +21,8 @@ import java.util.Collection; import java.util.Collections; import java.util.Objects; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; @@ -261,10 +261,10 @@ public void visit(String field, QueryVisitor visitor) { /** * A guess of the average number of simple operations for the initial seek and buffer refill per * document for the positions of a term. See also {@link - * Lucene912PostingsReader.EverythingEnum#nextPosition()}. + * Lucene101PostingsReader.EverythingEnum#nextPosition()}. * *

    Aside: Instead of being constant this could depend among others on {@link - * Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link + * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs), * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block * size of the device storing the index. @@ -272,7 +272,7 @@ public void visit(String field, QueryVisitor visitor) { private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; /** - * Number of simple operations in {@link Lucene912PostingsReader.EverythingEnum#nextPosition()} + * Number of simple operations in {@link Lucene101PostingsReader.EverythingEnum#nextPosition()} * when no seek or buffer refill is done. */ private static final int TERM_OPS_PER_POS = 7; diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java index 132ab4b3976e..d2c708b4feed 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java @@ -33,7 +33,7 @@ static IntervalsSource build(List sources) { if (sources.size() == 1) { return sources.get(0); } - List rewritten = deduplicate(flatten(sources)); + List rewritten = deduplicate(sources); if (rewritten.size() == 1) { return rewritten.get(0); } @@ -55,18 +55,6 @@ private static List deduplicate(List sources) return deduplicated; } - private static List flatten(List sources) { - List flattened = new ArrayList<>(); - for (IntervalsSource s : sources) { - if (s instanceof UnorderedIntervalsSource) { - flattened.addAll(((UnorderedIntervalsSource) s).subSources); - } else { - flattened.add(s); - } - } - return flattened; - } - private UnorderedIntervalsSource(List sources) { super(sources); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java index 7c4541027791..59e35aa7604d 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java @@ -20,6 +20,7 @@ import java.util.Map; import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermStates; @@ -32,14 +33,18 @@ import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.BytesRef; -/** A Query class that uses a {@link PayloadFunction} to modify the score of a wrapped SpanQuery */ +/** + * A Query class that uses a {@link PayloadFunction} to modify the score of a wrapped {@link + * SpanQuery}. A wrapped span query is used due to the way that payload values are indexed, see + * {@link PostingsEnum#PAYLOADS}. 
+ */ public class PayloadScoreQuery extends SpanQuery { private final SpanQuery wrappedQuery; @@ -186,9 +191,9 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti if (spans == null) { return null; } - LeafSimScorer docScorer = innerWeight.getSimScorer(context); + NumericDocValues norms = context.reader().getNormValues(field); PayloadSpans payloadSpans = new PayloadSpans(spans, decoder); - final var scorer = new PayloadSpanScorer(payloadSpans, docScorer); + final var scorer = new PayloadSpanScorer(payloadSpans, innerWeight.getSimScorer(), norms); return new DefaultScorerSupplier(scorer); } } @@ -244,8 +249,9 @@ private class PayloadSpanScorer extends SpanScorer { private final PayloadSpans spans; - private PayloadSpanScorer(PayloadSpans spans, LeafSimScorer docScorer) throws IOException { - super(spans, docScorer); + private PayloadSpanScorer(PayloadSpans spans, SimScorer scorer, NumericDocValues norms) + throws IOException { + super(spans, scorer, norms); this.spans = spans; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/payloads/SpanPayloadCheckQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/payloads/SpanPayloadCheckQuery.java index 6cb2dbd2b0f5..5792e1065168 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/payloads/SpanPayloadCheckQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/payloads/SpanPayloadCheckQuery.java @@ -21,6 +21,7 @@ import java.util.Map; import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermStates; @@ -34,7 +35,6 @@ import org.apache.lucene.queries.spans.Spans; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; @@ -191,8 +191,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti if (spans == null) { return null; } - final LeafSimScorer docScorer = getSimScorer(context); - final var scorer = new SpanScorer(spans, docScorer); + final NumericDocValues norms = context.reader().getNormValues(field); + final var scorer = new SpanScorer(spans, getSimScorer(), norms); return new DefaultScorerSupplier(scorer); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/FilterSpans.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/FilterSpans.java index d68e227f43a6..c29fe79ca1f4 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/spans/FilterSpans.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/FilterSpans.java @@ -197,7 +197,7 @@ private final boolean twoPhaseCurrentDocMatches() throws IOException { if (startPos != NO_MORE_POSITIONS) { break; } - // else fallthrough + // else fallthrough case NO_MORE_IN_CURRENT_DOC: startPos = -1; return false; diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanContainingQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanContainingQuery.java index 4752d6bee38d..aa7137cba4fa 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanContainingQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanContainingQuery.java @@ -20,10 +20,10 @@ import java.util.ArrayList; import java.util.Map; 
import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermStates; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScorerSupplier; @@ -144,8 +144,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti if (spans == null) { return null; } - final LeafSimScorer docScorer = getSimScorer(context); - final var scorer = new SpanScorer(spans, docScorer); + final NumericDocValues norms = context.reader().getNormValues(field); + final var scorer = new SpanScorer(spans, getSimScorer(), norms); return new DefaultScorerSupplier(scorer); } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanNearQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanNearQuery.java index 69e1a74c80bb..9e7f4a049edf 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanNearQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanNearQuery.java @@ -29,7 +29,6 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; @@ -247,8 +246,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti if (spans == null) { return null; } - final LeafSimScorer docScorer = getSimScorer(context); - final var scorer = new SpanScorer(spans, docScorer); + final var scorer = + new SpanScorer(spans, getSimScorer(), context.reader().getNormValues(field)); return new DefaultScorerSupplier(scorer); } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanScorer.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanScorer.java index 9737ae5308e1..686ce7c91b16 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanScorer.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanScorer.java @@ -18,10 +18,11 @@ import java.io.IOException; import java.util.Objects; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.similarities.Similarity.SimScorer; /** * A basic {@link Scorer} over {@link Spans}. @@ -31,7 +32,8 @@ public class SpanScorer extends Scorer { protected final Spans spans; - protected final LeafSimScorer docScorer; + protected final SimScorer scorer; + protected final NumericDocValues norms; /** accumulated sloppy freq (computed in setFreqCurrentDoc) */ private float freq; @@ -39,9 +41,10 @@ public class SpanScorer extends Scorer { private int lastScoredDoc = -1; // last doc we called setFreqCurrentDoc() for /** Sole constructor. */ - public SpanScorer(Spans spans, LeafSimScorer docScorer) { + public SpanScorer(Spans spans, SimScorer scorer, NumericDocValues norms) { this.spans = Objects.requireNonNull(spans); - this.docScorer = docScorer; + this.scorer = scorer; + this.norms = norms; } /** return the Spans for this Scorer * */ @@ -69,8 +72,12 @@ public TwoPhaseIterator twoPhaseIterator() { * slop-adjusted {@link #freq}. 
*/ protected float scoreCurrentDoc() throws IOException { - assert docScorer != null : getClass() + " has a null docScorer!"; - return docScorer.score(docID(), freq); + assert scorer != null : getClass() + " has a null docScorer!"; + long norm = 1L; + if (norms != null && norms.advanceExact(docID())) { + norm = norms.longValue(); + } + return scorer.score(freq, norm); } /** @@ -98,7 +105,7 @@ protected final void setFreqCurrentDoc() throws IOException { // assert (startPos != prevStartPos) || (endPos > prevEndPos) : "non increased // endPos="+endPos; assert (startPos != prevStartPos) || (endPos >= prevEndPos) : "decreased endPos=" + endPos; - if (docScorer == null) { // scores not required, break out here + if (scorer == null) { // scores not required, break out here freq = 1; return; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanTermQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanTermQuery.java index 359f6ae7e959..82c0f9b4aee2 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanTermQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanTermQuery.java @@ -160,7 +160,7 @@ public Spans getSpans(final LeafReaderContext context, Postings requiredPostings final PostingsEnum postings = termsEnum.postings(null, requiredPostings.getRequiredPostings()); float positionsCost = termPositionsCost(termsEnum) * PHRASE_TO_SPAN_TERM_POSITIONS_COST; - return new TermSpans(getSimScorer(context), postings, term, positionsCost); + return new TermSpans(postings, term, positionsCost); } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanWeight.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanWeight.java index 29e1c13e773f..8c7d8dbdb3a6 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanWeight.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanWeight.java @@ -22,13 +22,13 @@ import java.util.Locale; import java.util.Map; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermStates; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Matches; import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.MatchesUtils; @@ -38,6 +38,7 @@ import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.Weight; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.ArrayUtil; /** Expert-only. 
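For orientation only, not in the diff: the LeafSimScorer removal repeats one pattern throughout this patch (NormValueSource, PayloadScoreQuery, SpanScorer, and the SpanWeight hunks nearby) — fetch the field's NumericDocValues norms from the leaf reader, default the norm to 1 when the field or document has no norms, and pass freq plus the raw norm to SimScorer.score. A condensed sketch; the enclosing helper is hypothetical, and real code resolves the norms once per leaf rather than per document:

import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;

class NormScoringSketch {
  static float scoreDoc(
      SimScorer simScorer, LeafReaderContext context, String field, int doc, float freq)
      throws IOException {
    NumericDocValues norms = context.reader().getNormValues(field);
    long norm = 1L; // default when the field is indexed without norms
    if (norms != null && norms.advanceExact(doc)) {
      norm = norms.longValue();
    }
    return simScorer.score(freq, norm);
  }
}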
Public for use by other weight implementations */ @@ -142,8 +143,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti if (spans == null) { return null; } - final LeafSimScorer docScorer = getSimScorer(context); - final var scorer = new SpanScorer(spans, docScorer); + final NumericDocValues norms = context.reader().getNormValues(field); + final var scorer = new SpanScorer(spans, simScorer, norms); return new ScorerSupplier() { @Override public SpanScorer get(long leadCost) throws IOException { @@ -157,15 +158,9 @@ public long cost() { }; } - /** - * Return a LeafSimScorer for this context - * - * @param context the LeafReaderContext - * @return a SimWeight - * @throws IOException on error - */ - public LeafSimScorer getSimScorer(LeafReaderContext context) throws IOException { - return simScorer == null ? null : new LeafSimScorer(simScorer, context.reader(), field, true); + /** Return the SimScorer */ + public SimScorer getSimScorer() { + return simScorer; } @Override @@ -176,9 +171,13 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio if (newDoc == doc) { if (simScorer != null) { float freq = scorer.sloppyFreq(); - LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), field, true); Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq); - Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); + NumericDocValues norms = context.reader().getNormValues(field); + long norm = 1L; + if (norms != null && norms.advanceExact(doc)) { + norm = norms.longValue(); + } + Explanation scoreExplanation = simScorer.explain(freqExplanation, norm); return Explanation.match( scoreExplanation.getValue(), "weight(" diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/TermSpans.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/TermSpans.java index b81b1846a097..995b242d90bd 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/spans/TermSpans.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/TermSpans.java @@ -21,7 +21,6 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.LeafSimScorer; /** * Expert: Public for extension only. 
This does not work correctly for terms that indexed at @@ -37,7 +36,7 @@ public class TermSpans extends Spans { protected boolean readPayload; private final float positionsCost; - public TermSpans(LeafSimScorer scorer, PostingsEnum postings, Term term, float positionsCost) { + public TermSpans(PostingsEnum postings, Term term, float positionsCost) { this.postings = Objects.requireNonNull(postings); this.term = Objects.requireNonNull(term); this.doc = -1; diff --git a/lucene/queries/src/test/org/apache/lucene/queries/TestCommonTermsQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/TestCommonTermsQuery.java index cca0c162113d..9282fdb79461 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/TestCommonTermsQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/TestCommonTermsQuery.java @@ -87,7 +87,7 @@ public void testBasics() throws IOException { query.add(new Term("field", "universe")); query.add(new Term("field", "right")); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 3); + assertEquals(search.totalHits.value(), 3); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); assertEquals("2", r.storedFields().document(search.scoreDocs[1].doc).get("id")); assertEquals("3", r.storedFields().document(search.scoreDocs[2].doc).get("id")); @@ -100,7 +100,7 @@ public void testBasics() throws IOException { query.add(new Term("field", "this")); query.add(new Term("field", "end")); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 2); + assertEquals(search.totalHits.value(), 2); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); assertEquals("2", r.storedFields().document(search.scoreDocs[1].doc).get("id")); } @@ -114,7 +114,7 @@ public void testBasics() throws IOException { query.add(new Term("field", "world")); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 1); + assertEquals(search.totalHits.value(), 1); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); } @@ -125,7 +125,7 @@ public void testBasics() throws IOException { query.add(new Term("field", "universe")); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 1); + assertEquals(search.totalHits.value(), 1); assertEquals("3", r.storedFields().document(search.scoreDocs[0].doc).get("id")); } IOUtils.close(r, w, dir, analyzer); @@ -194,7 +194,12 @@ public void testNullTerm() { public void testMinShouldMatch() throws IOException { Directory dir = newDirectory(); MockAnalyzer analyzer = new MockAnalyzer(random()); - RandomIndexWriter w = new RandomIndexWriter(random(), dir, analyzer); + RandomIndexWriter w = + new RandomIndexWriter( + random(), + dir, + LuceneTestCase.newIndexWriterConfig(analyzer) + .setMergePolicy(LuceneTestCase.newMergePolicy(random(), false))); String[] docs = new String[] { "this is the end of the world right", @@ -222,7 +227,7 @@ public void testMinShouldMatch() throws IOException { query.add(new Term("field", "right")); query.setLowFreqMinimumNumberShouldMatch(0.5f); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 1); + assertEquals(search.totalHits.value(), 1); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); } { @@ -236,7 +241,7 @@ public void testMinShouldMatch() throws IOException { query.add(new Term("field", "right")); query.setLowFreqMinimumNumberShouldMatch(2.0f); TopDocs search = s.search(query, 10); - 
assertEquals(search.totalHits.value, 1); + assertEquals(search.totalHits.value(), 1); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); } @@ -251,7 +256,7 @@ public void testMinShouldMatch() throws IOException { query.add(new Term("field", "right")); query.setLowFreqMinimumNumberShouldMatch(0.49f); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 3); + assertEquals(search.totalHits.value(), 3); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); assertEquals("2", r.storedFields().document(search.scoreDocs[1].doc).get("id")); assertEquals("3", r.storedFields().document(search.scoreDocs[2].doc).get("id")); @@ -268,7 +273,7 @@ public void testMinShouldMatch() throws IOException { query.add(new Term("field", "right")); query.setLowFreqMinimumNumberShouldMatch(1.0f); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 3); + assertEquals(search.totalHits.value(), 3); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); assertEquals("2", r.storedFields().document(search.scoreDocs[1].doc).get("id")); assertEquals("3", r.storedFields().document(search.scoreDocs[2].doc).get("id")); @@ -287,7 +292,7 @@ public void testMinShouldMatch() throws IOException { query.setLowFreqMinimumNumberShouldMatch(1.0f); query.setHighFreqMinimumNumberShouldMatch(4.0f); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 3); + assertEquals(search.totalHits.value(), 3); assertEquals(search.scoreDocs[1].score, search.scoreDocs[2].score, 0.0f); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); // doc 2 and 3 only get a score from low freq terms @@ -309,7 +314,7 @@ public void testMinShouldMatch() throws IOException { query.setLowFreqMinimumNumberShouldMatch(1.0f); query.setHighFreqMinimumNumberShouldMatch(2.0f); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 4); + assertEquals(search.totalHits.value(), 4); } { @@ -322,7 +327,7 @@ public void testMinShouldMatch() throws IOException { query.setLowFreqMinimumNumberShouldMatch(1.0f); query.setHighFreqMinimumNumberShouldMatch(2.0f); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 2); + assertEquals(search.totalHits.value(), 2); assertEquals( new HashSet<>(Arrays.asList("0", "2")), new HashSet<>( @@ -384,7 +389,7 @@ public void testExtend() throws IOException { query.add(new Term("field", "universe")); query.add(new Term("field", "right")); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 3); + assertEquals(search.totalHits.value(), 3); assertEquals("0", r.storedFields().document(search.scoreDocs[0].doc).get("id")); assertEquals("2", r.storedFields().document(search.scoreDocs[1].doc).get("id")); assertEquals("3", r.storedFields().document(search.scoreDocs[2].doc).get("id")); @@ -402,7 +407,7 @@ public void testExtend() throws IOException { query.add(new Term("field", "universe")); query.add(new Term("field", "right")); TopDocs search = s.search(query, 10); - assertEquals(search.totalHits.value, 3); + assertEquals(search.totalHits.value(), 3); assertEquals("2", r.storedFields().document(search.scoreDocs[0].doc).get("id")); assertEquals("3", r.storedFields().document(search.scoreDocs[1].doc).get("id")); assertEquals("0", r.storedFields().document(search.scoreDocs[2].doc).get("id")); @@ -481,7 +486,7 @@ protected boolean lessThan(TermAndFreq a, TermAndFreq b) { TopDocs cqSearch = searcher.search(cq, 
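An aside, not part of the patch: the test churn above and below is mechanical — TopDocs.totalHits now exposes its count through a value() accessor instead of a public field, so assertions read it as a method call. A tiny sketch (searcher and query are assumed):

import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

class TotalHitsSketch {
  static long hitCount(IndexSearcher searcher, Query query) throws IOException {
    TopDocs td = searcher.search(query, 10);
    return td.totalHits.value(); // formerly the public field td.totalHits.value
  }
}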
reader.maxDoc()); TopDocs verifySearch = searcher.search(verifyQuery.build(), reader.maxDoc()); - assertEquals(verifySearch.totalHits.value, cqSearch.totalHits.value); + assertEquals(verifySearch.totalHits.value(), cqSearch.totalHits.value()); Set hits = new HashSet<>(); for (ScoreDoc doc : verifySearch.scoreDocs) { hits.add(doc.doc); diff --git a/lucene/queries/src/test/org/apache/lucene/queries/function/TestDocValuesFieldSources.java b/lucene/queries/src/test/org/apache/lucene/queries/function/TestDocValuesFieldSources.java index cb82deeb5eb7..63d1e6c684b8 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/function/TestDocValuesFieldSources.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestDocValuesFieldSources.java @@ -146,7 +146,7 @@ public void test(DocValuesType type) throws IOException { case SORTED: values.ordVal(i); // no exception assertTrue(values.numOrd() >= 1); - // fall-through + // fall-through case BINARY: assertEquals(expected, values.objectVal(i)); assertEquals(expected, values.strVal(i)); diff --git a/lucene/queries/src/test/org/apache/lucene/queries/function/TestFieldScoreQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/function/TestFieldScoreQuery.java index 664e759a9eeb..44ef9ed03514 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/function/TestFieldScoreQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestFieldScoreQuery.java @@ -122,7 +122,7 @@ private void doTestExactScore(ValueSource valueSource) throws Exception { IndexReader r = DirectoryReader.open(dir); IndexSearcher s = newSearcher(r); TopDocs td = s.search(functionQuery, 1000); - assertEquals("All docs should be matched!", N_DOCS, td.totalHits.value); + assertEquals("All docs should be matched!", N_DOCS, td.totalHits.value()); ScoreDoc[] sd = td.scoreDocs; for (ScoreDoc aSd : sd) { float score = aSd.score; diff --git a/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionMatchQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionMatchQuery.java index 793bcc256fe8..d05c6e2d41de 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionMatchQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionMatchQuery.java @@ -54,7 +54,7 @@ public void testRangeMatching() throws IOException { FunctionMatchQuery fmq = new FunctionMatchQuery(in, d -> d >= 2 && d < 4); TopDocs docs = searcher.search(fmq, 10); - assertEquals(2, docs.totalHits.value); + assertEquals(2, docs.totalHits.value()); assertEquals(9, docs.scoreDocs[0].doc); assertEquals(13, docs.scoreDocs[1].doc); diff --git a/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionScoreQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionScoreQuery.java index d74c3823e9d7..85bcc05cbf78 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionScoreQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionScoreQuery.java @@ -153,7 +153,7 @@ public void testSimpleSourceScore() throws Exception { int[] expectedDocs = new int[] {4, 7, 9}; TopDocs docs = searcher.search(q, 4); - assertEquals(expectedDocs.length, docs.totalHits.value); + assertEquals(expectedDocs.length, docs.totalHits.value()); for (int i = 0; i < expectedDocs.length; i++) { assertEquals(docs.scoreDocs[i].doc, expectedDocs[i]); } @@ -176,7 +176,7 @@ public void testScoreModifyingSource() throws Exception { int[] 
expectedDocs = new int[] {4, 7, 9, 8, 12}; TopDocs docs = searcher.search(fq, 5); - assertEquals(plain.totalHits.value, docs.totalHits.value); + assertEquals(plain.totalHits.value(), docs.totalHits.value()); for (int i = 0; i < expectedDocs.length; i++) { assertEquals(expectedDocs[i], docs.scoreDocs[i].doc); } @@ -199,7 +199,7 @@ public void testCombiningMultipleQueryScores() throws Exception { int[] expectedDocs = new int[] {6, 1, 0, 2, 8}; TopDocs docs = searcher.search(fq, 20); - assertEquals(plain.totalHits.value, docs.totalHits.value); + assertEquals(plain.totalHits.value(), docs.totalHits.value()); for (int i = 0; i < expectedDocs.length; i++) { assertEquals(expectedDocs[i], docs.scoreDocs[i].doc); } @@ -223,7 +223,7 @@ public void testBoostsAreAppliedLast() throws Exception { Query boosted = new BoostQuery(q1, 2); TopDocs afterboost = searcher.search(boosted, 5); - assertEquals(plain.totalHits.value, afterboost.totalHits.value); + assertEquals(plain.totalHits.value(), afterboost.totalHits.value()); for (int i = 0; i < 5; i++) { assertEquals(plain.scoreDocs[i].doc, afterboost.scoreDocs[i].doc); assertEquals(plain.scoreDocs[i].score, afterboost.scoreDocs[i].score / 2, 0.0001); @@ -358,7 +358,7 @@ public void testScoreCalledTwice() throws Exception { q, new PhraseQuery(1, "ExampleText", "function", "plot"), 2); q = FunctionScoreQuery.boostByValue(q, DoubleValuesSource.SCORES); - assertEquals(1, new IndexSearcher(reader).search(q, 10).totalHits.value); + assertEquals(1, new IndexSearcher(reader).search(q, 10).totalHits.value()); } } } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java index d9141882c947..a0539a0f8520 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java @@ -338,6 +338,18 @@ public void testNestedOrInUnorderedMaxGaps() throws IOException { checkHits(q, new int[] {6, 7}); } + public void testUnorderedWithNoGap() throws IOException { + Query q = + new IntervalQuery( + field, + Intervals.maxgaps( + 0, + Intervals.unordered( + Intervals.term("w3"), + Intervals.unordered(Intervals.term("w1"), Intervals.term("w5"))))); + checkHits(q, new int[] {0}); + } + public void testOrderedWithGaps() throws IOException { Query q = new IntervalQuery( @@ -360,6 +372,18 @@ public void testOrderedWithGaps2() throws IOException { checkHits(q, new int[] {12}); } + public void testOrderedWithNoGap() throws IOException { + Query q = + new IntervalQuery( + field, + Intervals.maxgaps( + 0, + Intervals.ordered( + Intervals.ordered(Intervals.term("w1"), Intervals.term("w4")), + Intervals.term("w5")))); + checkHits(q, new int[] {0}); + } + public void testNestedOrInContainedBy() throws IOException { Query q = new IntervalQuery( @@ -388,7 +412,7 @@ public void testScoring() throws IOException { Query q = new IntervalQuery(field, source); TopDocs td = searcher.search(q, 10); - assertEquals(5, td.totalHits.value); + assertEquals(5, td.totalHits.value()); assertEquals(1, td.scoreDocs[0].doc); assertEquals(3, td.scoreDocs[1].doc); assertEquals(0, td.scoreDocs[2].doc); @@ -397,7 +421,7 @@ public void testScoring() throws IOException { Query boostQ = new BoostQuery(q, 2); TopDocs boostTD = searcher.search(boostQ, 10); - assertEquals(5, boostTD.totalHits.value); + assertEquals(5, boostTD.totalHits.value()); for (int i = 0; i < 5; i++) { 
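A sketch outside the patch: the two new no-gap tests above pin down why flatten() was removed from Ordered/UnorderedIntervalsSource. A nested source now contributes a single interval to its parent, so its internal gaps no longer count against an enclosing maxgaps filter; flattening the operands changes that accounting. Two sources that are therefore no longer interchangeable (term and field contents mirror the test fixture and are assumptions):

import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;

class IntervalFlattenSketch {
  // The gap is measured between the (w1 .. w4) block as a whole and w5,
  // so "w1 w2 w3 w4 w5" can match with maxgaps(0).
  static final IntervalsSource NESTED =
      Intervals.maxgaps(
          0,
          Intervals.ordered(
              Intervals.ordered(Intervals.term("w1"), Intervals.term("w4")),
              Intervals.term("w5")));

  // Gaps between all three terms count, so the same document would not match with maxgaps(0).
  static final IntervalsSource FLATTENED =
      Intervals.maxgaps(
          0,
          Intervals.ordered(
              Intervals.term("w1"), Intervals.term("w4"), Intervals.term("w5")));
}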
assertEquals(td.scoreDocs[i].score * 2, boostTD.scoreDocs[i].score, 0); } @@ -405,7 +429,7 @@ public void testScoring() throws IOException { // change the pivot - order should remain the same Query q1 = new IntervalQuery(field, source, 2); TopDocs td1 = searcher.search(q1, 10); - assertEquals(5, td1.totalHits.value); + assertEquals(5, td1.totalHits.value()); assertEquals(0.5f, td1.scoreDocs[0].score, 0); // freq=pivot for (int i = 0; i < 5; i++) { assertEquals(td.scoreDocs[i].doc, td1.scoreDocs[i].doc); @@ -414,7 +438,7 @@ public void testScoring() throws IOException { // increase the exp, docs higher than pivot should get a higher score, and vice versa Query q2 = new IntervalQuery(field, source, 1.2f, 2f); TopDocs td2 = searcher.search(q2, 10); - assertEquals(5, td2.totalHits.value); + assertEquals(5, td2.totalHits.value()); for (int i = 0; i < 5; i++) { assertEquals(td.scoreDocs[i].doc, td2.scoreDocs[i].doc); if (i < 2) { @@ -447,4 +471,24 @@ public void testExtendDisjunctions() throws IOException { field, or(term("XXX"), containing(extend(term("message"), 0, 10), term("intend")))); checkHits(q, new int[] {}); } + + public void testEquality() { + assertEquals( + new IntervalQuery("f", Intervals.regexp(new BytesRef(".*foo"))), + new IntervalQuery("f", Intervals.regexp(new BytesRef(".*foo")))); + assertEquals( + new IntervalQuery("f", Intervals.prefix(new BytesRef("p"), 1)), + new IntervalQuery("f", Intervals.prefix(new BytesRef("p"), 1))); + assertEquals( + new IntervalQuery("f", Intervals.fuzzyTerm("kot", 1)), + new IntervalQuery("f", Intervals.fuzzyTerm("kot", 1))); + assertEquals( + new IntervalQuery("f", Intervals.wildcard(new BytesRef("*.txt"))), + new IntervalQuery("f", Intervals.wildcard(new BytesRef("*.txt")))); + assertEquals( + new IntervalQuery( + "f", Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true)), + new IntervalQuery( + "f", Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true))); + } } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index 944530937ceb..3d9560858ac1 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -1138,6 +1138,46 @@ public void testRange() throws IOException { checkVisits(source, 1); } + public void testOpenEndedRange() throws IOException { + { + IntervalsSource source = Intervals.range(new BytesRef("porridge"), null, false, false); + checkIntervals( + source, + "field1", + 5, + new int[][] { + {3, 3}, + {9, 9, 10, 10, 14, 14, 18, 18, 22, 22, 26, 26, 27, 27}, + {9, 9, 10, 10, 11, 11, 14, 14, 18, 18, 22, 22, 26, 26}, + {8, 8}, + {9, 9, 10, 10, 12, 12, 14, 14, 18, 18, 21, 21}, + {} + }); + MatchesIterator mi = getMatches(source, 3, "field1"); + assertNotNull(mi); + assertMatch(mi, 8, 8, 37, 41); + } + + { + IntervalsSource source = Intervals.range(null, new BytesRef("anyone"), false, true); + checkIntervals( + source, + "field1", + 1, + new int[][] { + {4, 4}, + {}, + {}, + {}, + {}, + {} + }); + MatchesIterator mi = getMatches(source, 0, "field1"); + assertNotNull(mi); + assertMatch(mi, 4, 4, 23, 29); + } + } + public void testWrappedFilters() throws IOException { IntervalsSource source = Intervals.or( @@ -1187,4 +1227,27 @@ public void testMultiTerm() throws IOException { checkVisits(source, 1); } + + // basic test for equality and inequality of instances created by 
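Not part of the patch: testOpenEndedRange above exercises the null handling documented earlier in Intervals.range — either bound may now be null to leave that end of the range open. A brief sketch (the bound terms are assumptions):

import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.util.BytesRef;

class OpenEndedRangeSketch {
  // All terms above "porridge": exclusive lower bound, open upper end.
  static final IntervalsSource FROM_PORRIDGE =
      Intervals.range(new BytesRef("porridge"), null, false, false);

  // All terms up to and including "anyone": open lower end, inclusive upper bound.
  static final IntervalsSource UP_TO_ANYONE =
      Intervals.range(null, new BytesRef("anyone"), false, true);
}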
the factories + public void testEquality() { + assertEquals(Intervals.term("wibble"), Intervals.term("wibble")); + assertEquals(Intervals.prefix(new BytesRef("p"), 1), Intervals.prefix(new BytesRef("p"), 1)); + assertEquals(Intervals.fuzzyTerm("kot", 1), Intervals.fuzzyTerm("kot", 1)); + assertEquals(Intervals.regexp(new BytesRef(".*ot")), Intervals.regexp(new BytesRef(".*ot"))); + assertEquals( + Intervals.wildcard(new BytesRef("*.txt")), Intervals.wildcard(new BytesRef("*.txt"))); + assertEquals( + Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true), + Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true)); + + assertNotEquals(Intervals.term("wibble"), Intervals.term("wobble")); + assertNotEquals(Intervals.prefix(new BytesRef("p"), 1), Intervals.prefix(new BytesRef("b"), 1)); + assertNotEquals(Intervals.fuzzyTerm("kot", 1), Intervals.fuzzyTerm("kof", 1)); + assertNotEquals(Intervals.regexp(new BytesRef(".*ot")), Intervals.regexp(new BytesRef(".*at"))); + assertNotEquals( + Intervals.wildcard(new BytesRef("*.txt")), Intervals.wildcard(new BytesRef("*.tat"))); + assertNotEquals( + Intervals.range(new BytesRef("warm"), new BytesRef("hot"), true, true), + Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true)); + } } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/payloads/TestPayloadTermQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/payloads/TestPayloadTermQuery.java index de2d6b5d9f9a..74694956462b 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/payloads/TestPayloadTermQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/payloads/TestPayloadTermQuery.java @@ -162,7 +162,7 @@ public void test() throws IOException { TopDocs hits = searcher.search(query, 100); assertTrue("hits is null and it shouldn't be", hits != null); assertTrue( - "hits Size: " + hits.totalHits.value + " is not: " + 100, hits.totalHits.value == 100); + "hits Size: " + hits.totalHits.value() + " is not: " + 100, hits.totalHits.value() == 100); // they should all have the exact same score, because they all contain seventy once, and we set // all the other similarity factors to be 1 @@ -216,7 +216,7 @@ public void testMultipleMatchesPerDoc() throws Exception { TopDocs hits = searcher.search(query, 100); assertTrue("hits is null and it shouldn't be", hits != null); assertTrue( - "hits Size: " + hits.totalHits.value + " is not: " + 100, hits.totalHits.value == 100); + "hits Size: " + hits.totalHits.value() + " is not: " + 100, hits.totalHits.value() == 100); // they should all have the exact same score, because they all contain seventy once, and we set // all the other similarity factors to be 1 @@ -260,7 +260,8 @@ public void testNoMatch() throws Exception { PayloadDecoder.FLOAT_DECODER); TopDocs hits = searcher.search(query, 100); assertTrue("hits is null and it shouldn't be", hits != null); - assertTrue("hits Size: " + hits.totalHits.value + " is not: " + 0, hits.totalHits.value == 0); + assertTrue( + "hits Size: " + hits.totalHits.value() + " is not: " + 0, hits.totalHits.value() == 0); } public void testNoPayload() throws Exception { @@ -281,7 +282,8 @@ public void testNoPayload() throws Exception { query.add(c2); TopDocs hits = searcher.search(query.build(), 100); assertTrue("hits is null and it shouldn't be", hits != null); - assertTrue("hits Size: " + hits.totalHits.value + " is not: " + 1, hits.totalHits.value == 1); + assertTrue( + "hits Size: " + hits.totalHits.value() + " is not: " + 1, 
hits.totalHits.value() == 1); int[] results = new int[1]; results[0] = 0; // hits.scoreDocs[0].doc; CheckHits.checkHitCollector( diff --git a/lucene/queries/src/test/org/apache/lucene/queries/spans/AssertingSpanWeight.java b/lucene/queries/src/test/org/apache/lucene/queries/spans/AssertingSpanWeight.java index 4cbb0f39f722..4e1f16b9a024 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/spans/AssertingSpanWeight.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/spans/AssertingSpanWeight.java @@ -23,8 +23,8 @@ import org.apache.lucene.index.TermStates; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.similarities.Similarity.SimScorer; /** Wraps a SpanWeight with additional asserts */ public class AssertingSpanWeight extends SpanWeight { @@ -55,8 +55,8 @@ public Spans getSpans(LeafReaderContext context, Postings requiredPostings) thro } @Override - public LeafSimScorer getSimScorer(LeafReaderContext context) throws IOException { - return in.getSimScorer(context); + public SimScorer getSimScorer() { + return in.getSimScorer(); } @Override diff --git a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestQueryRescorerWithSpans.java b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestQueryRescorerWithSpans.java index f01732458b2e..b95c682d655c 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestQueryRescorerWithSpans.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestQueryRescorerWithSpans.java @@ -66,7 +66,7 @@ public void testBasic() throws Exception { IndexSearcher searcher = getSearcher(r); TopDocs hits = searcher.search(bq.build(), 10); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); assertEquals("0", searcher.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(hits.scoreDocs[1].doc).get("id")); @@ -78,7 +78,7 @@ public void testBasic() throws Exception { TopDocs hits3 = QueryRescorer.rescore(searcher, hits, snq, 2.0, 10); // Resorting changed the order: - assertEquals(2, hits3.totalHits.value); + assertEquals(2, hits3.totalHits.value()); assertEquals("1", searcher.storedFields().document(hits3.scoreDocs[0].doc).get("id")); assertEquals("0", searcher.storedFields().document(hits3.scoreDocs[1].doc).get("id")); @@ -109,7 +109,7 @@ public void testMissingSecondPassScore() throws Exception { IndexSearcher searcher = getSearcher(r); TopDocs hits = searcher.search(bq.build(), 10); - assertEquals(2, hits.totalHits.value); + assertEquals(2, hits.totalHits.value()); assertEquals("0", searcher.storedFields().document(hits.scoreDocs[0].doc).get("id")); assertEquals("1", searcher.storedFields().document(hits.scoreDocs[1].doc).get("id")); @@ -121,7 +121,7 @@ public void testMissingSecondPassScore() throws Exception { TopDocs hits3 = QueryRescorer.rescore(searcher, hits, snq, 2.0, 10); // Resorting changed the order: - assertEquals(2, hits3.totalHits.value); + assertEquals(2, hits3.totalHits.value()); assertEquals("1", searcher.storedFields().document(hits3.scoreDocs[0].doc).get("id")); assertEquals("0", searcher.storedFields().document(hits3.scoreDocs[1].doc).get("id")); diff --git a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanFirstQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanFirstQuery.java index 
20d563171dd3..64d1f66f5e13 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanFirstQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanFirstQuery.java @@ -54,12 +54,12 @@ public void testStartPositions() throws Exception { // user queries on "starts-with quick" SpanQuery sfq = spanFirstQuery(spanTermQuery("field", "quick"), 1); - assertEquals(1, searcher.search(sfq, 10).totalHits.value); + assertEquals(1, searcher.search(sfq, 10).totalHits.value()); // user queries on "starts-with the quick" SpanQuery include = spanFirstQuery(spanTermQuery("field", "quick"), 2); sfq = spanNotQuery(include, sfq); - assertEquals(1, searcher.search(sfq, 10).totalHits.value); + assertEquals(1, searcher.search(sfq, 10).totalHits.value()); writer.close(); reader.close(); diff --git a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanSimilarity.java b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanSimilarity.java index 06b8da14a268..d07c20c28cc9 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanSimilarity.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanSimilarity.java @@ -155,7 +155,7 @@ public void testCrazySpans() throws Exception { SpanTermQuery s2 = new SpanTermQuery(new Term("foo", "baz")); Query query = new SpanOrQuery(s1, s2); TopDocs td = is.search(query, 10); - assertEquals(1, td.totalHits.value); + assertEquals(1, td.totalHits.value()); float score = td.scoreDocs[0].score; assertFalse("negative score for " + sim, score < 0.0f); assertFalse("inf score for " + sim, Float.isInfinite(score)); diff --git a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpans.java b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpans.java index 359c31709dba..00d6c602f624 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpans.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpans.java @@ -349,14 +349,15 @@ public void testNPESpanQuery() throws Throwable { // This throws exception (it shouldn't) assertEquals( 1, - searcher.search( + searcher + .search( createSpan( 0, true, new SpanQuery[] {createSpan(4, false, "chased", "cat"), createSpan("ate")}), 10) .totalHits - .value); + .value()); reader.close(); dir.close(); } diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java index 72a709bd4e05..18476f50c2b4 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java @@ -294,9 +294,9 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { if (qc instanceof BooleanQuery || qc instanceof SynonymQuery) { ArrayList sc = new ArrayList<>(); - BooleanQuery booleanCaluse = + BooleanQuery booleanClause = qc instanceof BooleanQuery ? 
(BooleanQuery) qc : convert((SynonymQuery) qc); - addComplexPhraseClause(sc, booleanCaluse); + addComplexPhraseClause(sc, booleanClause); if (sc.size() > 0) { allSpanClauses[i] = sc.get(0); } else { diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/ExtendableQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/ExtendableQueryParser.java index 7a79c850b35e..b0e06d053b84 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/ExtendableQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/ExtendableQueryParser.java @@ -108,9 +108,9 @@ protected Query getFieldQuery(final String field, final String queryText, boolea throws ParseException { final Pair splitExtensionField = this.extensions.splitExtensionField(defaultField, field); - final ParserExtension extension = this.extensions.getExtension(splitExtensionField.cud); + final ParserExtension extension = this.extensions.getExtension(splitExtensionField.cud()); if (extension != null) { - return extension.parse(new ExtensionQuery(this, splitExtensionField.cur, queryText)); + return extension.parse(new ExtensionQuery(this, splitExtensionField.cur(), queryText)); } return super.getFieldQuery(field, queryText, quoted); } diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/ExtensionQuery.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/ExtensionQuery.java index 8e4acd17531d..b86b1831cc92 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/ExtensionQuery.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/ExtensionQuery.java @@ -22,52 +22,11 @@ * {@link ExtensionQuery} holds all query components extracted from the original query string like * the query field and the extension query string. 
* + * @param topLevelParser the top level parser + * @param field the query field + * @param rawQueryString the raw extension query string * @see Extensions * @see ExtendableQueryParser * @see ParserExtension */ -public class ExtensionQuery { - - private final String field; - private final String rawQueryString; - private final QueryParser topLevelParser; - - /** - * Creates a new {@link ExtensionQuery} - * - * @param field the query field - * @param rawQueryString the raw extension query string - */ - public ExtensionQuery(QueryParser topLevelParser, String field, String rawQueryString) { - this.field = field; - this.rawQueryString = rawQueryString; - this.topLevelParser = topLevelParser; - } - - /** - * Returns the query field - * - * @return the query field - */ - public String getField() { - return field; - } - - /** - * Returns the raw extension query string - * - * @return the raw extension query string - */ - public String getRawQueryString() { - return rawQueryString; - } - - /** - * Returns the top level parser which created this {@link ExtensionQuery} - * - * @return the top level parser which created this {@link ExtensionQuery} - */ - public QueryParser getTopLevelParser() { - return topLevelParser; - } -} +public record ExtensionQuery(QueryParser topLevelParser, String field, String rawQueryString) {} diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/Extensions.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/Extensions.java index c2dda823598f..0b016487d39d 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/Extensions.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/ext/Extensions.java @@ -167,20 +167,5 @@ public String buildExtensionField(String extensionKey, String field) { * @param the pairs first element * @param the pairs last element of the pair. 
*/ - public static class Pair { - - public final Cur cur; - public final Cud cud; - - /** - * Creates a new Pair - * - * @param cur the pairs first element - * @param cud the pairs last element - */ - public Pair(Cur cur, Cud cud) { - this.cur = cur; - this.cud = cud; - } - } + public record Pair(Cur cur, Cud cud) {} } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiPhraseQueryParsing.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiPhraseQueryParsing.java index fb028b858d31..2109d686763f 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiPhraseQueryParsing.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiPhraseQueryParsing.java @@ -28,15 +28,7 @@ public class TestMultiPhraseQueryParsing extends LuceneTestCase { - private static class TokenAndPos { - public final String token; - public final int pos; - - public TokenAndPos(String token, int pos) { - this.token = token; - this.pos = pos; - } - } + private record TokenAndPos(String token, int pos) {} private static class CannedAnalyzer extends Analyzer { private final TokenAndPos[] tokens; diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/ExtensionStub.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/ExtensionStub.java index 3a0b80a6b37d..4cc9502d0bf1 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/ExtensionStub.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/ExtensionStub.java @@ -25,6 +25,6 @@ class ExtensionStub extends ParserExtension { @Override public Query parse(ExtensionQuery components) throws ParseException { - return new TermQuery(new Term(components.getField(), components.getRawQueryString())); + return new TermQuery(new Term(components.field(), components.rawQueryString())); } } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java index ceba4137ef01..2c8308ee440d 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java @@ -1329,7 +1329,7 @@ public void testMultiPhraseQuery() throws Exception { Query q = new StandardQueryParser(new CannedAnalyzer()).parse("\"a\"", "field"); assertTrue(q instanceof MultiPhraseQuery); - assertEquals(1, s.search(q, 10).totalHits.value); + assertEquals(1, s.search(q, 10).totalHits.value()); r.close(); w.close(); dir.close(); diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index b55006ec94d6..5aaa35b4c20e 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java @@ -1207,7 +1207,7 @@ public void testPositionIncrements() throws Exception { IndexSearcher s = newSearcher(r); Query q = getQuery("\"wizard of ozzy\"", a); - assertEquals(1, s.search(q, 1).totalHits.value); + assertEquals(1, s.search(q, 1).totalHits.value()); r.close(); dir.close(); } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestCoreParser.java 
b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestCoreParser.java index 3aa20b57b77b..dbd3bcde0542 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestCoreParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestCoreParser.java @@ -130,7 +130,7 @@ public void testUserQueryXML() throws ParserException, IOException { public void testCustomFieldUserQueryXML() throws ParserException, IOException { Query q = parse("UserInputQueryCustomField.xml"); - long h = searcher().search(q, 1000).totalHits.value; + long h = searcher().search(q, 1000).totalHits.value(); assertEquals("UserInputQueryCustomField should produce 0 result ", 0, h); } @@ -149,7 +149,7 @@ public void testSpanTermXML() throws Exception { public void testSpanPositionRangeQueryXML() throws Exception { Query q = parse("SpanPositionRangeQuery.xml"); - long h = searcher().search(q, 10).totalHits.value; + long h = searcher().search(q, 10).totalHits.value(); assertEquals("SpanPositionRangeQuery should produce 2 result ", 2, h); SpanQuery sq = parseAsSpan("SpanPositionRangeQuery.xml"); dumpResults("SpanPositionRangeQuery", sq, 5); @@ -323,7 +323,7 @@ protected void dumpResults(String qType, Query q, int numDocs) throws IOExceptio } final IndexSearcher searcher = searcher(); TopDocs hits = searcher.search(q, numDocs); - final boolean producedResults = (hits.totalHits.value > 0); + final boolean producedResults = (hits.totalHits.value() > 0); if (!producedResults) { System.out.println( "TEST: qType=" @@ -338,7 +338,7 @@ protected void dumpResults(String qType, Query q, int numDocs) throws IOExceptio if (VERBOSE) { ScoreDoc[] scoreDocs = hits.scoreDocs; StoredFields storedFields = searcher.storedFields(); - for (int i = 0; i < Math.min(numDocs, hits.totalHits.value); i++) { + for (int i = 0; i < Math.min(numDocs, hits.totalHits.value()); i++) { Document ldoc = storedFields.document(scoreDocs[i].doc); System.out.println("[" + ldoc.get("date") + "]" + ldoc.get("contents")); } diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyJob.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyJob.java index 236e61c1ee34..c91febf73257 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyJob.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyJob.java @@ -137,7 +137,7 @@ private synchronized void _transferAndCancel(CopyJob prevJob) throws IOException if (prevTmpFileName != null) { // This fileName is common to both jobs, and the old job already finished copying it (to a // temp file), so we keep it: - long fileLength = ent.getValue().length; + long fileLength = ent.getValue().length(); bytesAlreadyCopied += fileLength; dest.message( "xfer: carry over already-copied file " @@ -187,7 +187,7 @@ private synchronized void _transferAndCancel(CopyJob prevJob) throws IOException prevJob.current = null; - totBytes += current.metaData.length; + totBytes += current.metaData.length(); // So it's not in our copy list anymore: it.remove(); diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyOneFile.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyOneFile.java index d8329ff761c7..5ee74ed5ee97 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyOneFile.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyOneFile.java @@ -53,7 +53,7 @@ public CopyOneFile( // last 8 bytes are checksum, which we write ourselves after copying 
all bytes and confirming // checksum: - bytesToCopy = metaData.length - Long.BYTES; + bytesToCopy = metaData.length() - Long.BYTES; if (Node.VERBOSE_FILES) { dest.message( @@ -97,7 +97,7 @@ public boolean visit() throws IOException { long bytesLeft = bytesToCopy - bytesCopied; if (bytesLeft == 0) { long checksum = out.getChecksum(); - if (checksum != metaData.checksum) { + if (checksum != metaData.checksum()) { // Bits flipped during copy! dest.message( "file " @@ -105,7 +105,7 @@ public boolean visit() throws IOException { + ": checksum mismatch after copy (bits flipped during network copy?) after-copy checksum=" + checksum + " vs expected=" - + metaData.checksum + + metaData.checksum() + "; cancel job"); throw new IOException("file " + name + ": checksum mismatch after file copy"); } @@ -134,7 +134,7 @@ public boolean visit() throws IOException { Locale.ROOT, "file %s: done copying [%s, %.3fms]", name, - Node.bytesToString(metaData.length), + Node.bytesToString(metaData.length()), (System.nanoTime() - copyStartNS) / (double) TimeUnit.MILLISECONDS.toNanos(1))); } diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyState.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyState.java index c4ef3135c387..b0ee9deb4b3f 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyState.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/CopyState.java @@ -25,36 +25,22 @@ /** * Holds incRef'd file level details for one point-in-time segment infos on the primary node. * + * @param infos only non-null on the primary node * @lucene.experimental */ -public class CopyState { +public record CopyState( + Map files, + long version, + long gen, + byte[] infosBytes, + Set completedMergeFiles, + long primaryGen, + SegmentInfos infos) { - public final Map files; - public final long version; - public final long gen; - public final byte[] infosBytes; - public final Set completedMergeFiles; - public final long primaryGen; - - // only non-null on the primary node - public final SegmentInfos infos; - - public CopyState( - Map files, - long version, - long gen, - byte[] infosBytes, - Set completedMergeFiles, - long primaryGen, - SegmentInfos infos) { + public CopyState { assert completedMergeFiles != null; - this.files = Collections.unmodifiableMap(files); - this.version = version; - this.gen = gen; - this.infosBytes = infosBytes; - this.completedMergeFiles = Collections.unmodifiableSet(completedMergeFiles); - this.primaryGen = primaryGen; - this.infos = infos; + files = Collections.unmodifiableMap(files); + completedMergeFiles = Collections.unmodifiableSet(completedMergeFiles); } @Override diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/FileMetaData.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/FileMetaData.java index d7eeebf66460..48cebe2d6e9f 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/FileMetaData.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/FileMetaData.java @@ -19,31 +19,19 @@ /** * Holds metadata details about a single file that we use to confirm two files (one remote, one - * local) are in fact "identical". + * local) are in fact "identical". 
Header and footer of the file must be identical between primary + * and replica to consider the files equal: * + * @param header Header of the file + * @param footer Footer of the file + * @param length Length of the file + * @param checksum Used to ensure no bit flips when copying the file: * @lucene.experimental */ -public class FileMetaData { - - // Header and footer of the file must be identical between primary and replica to consider the - // files equal: - public final byte[] header; - public final byte[] footer; - - public final long length; - - // Used to ensure no bit flips when copying the file: - public final long checksum; - - public FileMetaData(byte[] header, byte[] footer, long length, long checksum) { - this.header = header; - this.footer = footer; - this.length = length; - this.checksum = checksum; - } +public record FileMetaData(byte[] header, byte[] footer, long length, long checksum) { @Override public String toString() { - return "FileMetaData(length=" + length + ")"; + return "FileMetaData(length=" + length + " checksum=" + checksum + ")"; } } diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java index d9605a9018db..6674d9f10d2f 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java @@ -181,7 +181,7 @@ public boolean flushAndRefresh() throws IOException { } public long getCopyStateVersion() { - return copyState.version; + return copyState.version(); } public synchronized long getLastCommitVersion() { @@ -222,7 +222,7 @@ public void commit() throws IOException { // TODO (opto): it's a bit wasteful that we put "last refresh" version here, not the actual // version we are committing, because it means // on xlog replay we are replaying more ops than necessary. 
- commitData.put(VERSION_KEY, Long.toString(copyState.version)); + commitData.put(VERSION_KEY, Long.toString(copyState.version())); message("top: commit commitData=" + commitData); writer.setLiveCommitData(commitData.entrySet(), false); writer.commit(); @@ -233,8 +233,8 @@ public synchronized CopyState getCopyState() throws IOException { ensureOpen(false); // message("top: getCopyState replicaID=" + replicaID + " replicaNodeID=" + replicaNodeID + " // version=" + curInfos.getVersion() + " infos=" + curInfos.toString()); - assert curInfos == copyState.infos; - writer.incRefDeleter(copyState.infos); + assert curInfos == copyState.infos(); + writer.incRefDeleter(copyState.infos()); int count = copyingCount.incrementAndGet(); assert count > 0; return copyState; @@ -243,8 +243,8 @@ public synchronized CopyState getCopyState() throws IOException { /** Called once replica is done (or failed) copying an NRT point */ public void releaseCopyState(CopyState copyState) throws IOException { // message("top: releaseCopyState version=" + copyState.version); - assert copyState.infos != null; - writer.decRefDeleter(copyState.infos); + assert copyState.infos() != null; + writer.decRefDeleter(copyState.infos()); int count = copyingCount.decrementAndGet(); assert count >= 0; } diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java index c3d79b84843a..e4a88e3dd031 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java @@ -249,7 +249,7 @@ protected synchronized void start(long curPrimaryGen) throws IOException { null); job.start(); - message("top: init: sync sis.version=" + job.getCopyState().version); + message("top: init: sync sis.version=" + job.getCopyState().version()); // Force this copy job to finish while we wait, now. Note that this can be very time // consuming! 
@@ -272,17 +272,17 @@ protected synchronized void start(long curPrimaryGen) throws IOException { } } - lastPrimaryGen = job.getCopyState().primaryGen; + lastPrimaryGen = job.getCopyState().primaryGen(); SegmentInfos syncInfos = SegmentInfos.readCommit( - dir, toIndexInput(job.getCopyState().infosBytes), job.getCopyState().gen); + dir, toIndexInput(job.getCopyState().infosBytes()), job.getCopyState().gen()); // Must always commit to a larger generation than what's currently in the index: syncInfos.updateGeneration(infos); infos = syncInfos; - assert infos.getVersion() == job.getCopyState().version; + assert infos.getVersion() == job.getCopyState().version(); message(" version=" + infos.getVersion() + " segments=" + infos.toString()); message("top: init: incRef nrtFiles=" + job.getFileNames()); deleter.incRef(job.getFileNames()); @@ -293,7 +293,7 @@ protected synchronized void start(long curPrimaryGen) throws IOException { lastNRTFiles.addAll(job.getFileNames()); message("top: init: set lastNRTFiles=" + lastNRTFiles); - lastFileMetaData = job.getCopyState().files; + lastFileMetaData = job.getCopyState().files(); message( String.format( Locale.ROOT, @@ -301,7 +301,7 @@ protected synchronized void start(long curPrimaryGen) throws IOException { id, (System.nanoTime() - initSyncStartNS) / (double) TimeUnit.SECONDS.toNanos(1), bytesToString(job.getTotalBytesCopied()), - job.getCopyState().version)); + job.getCopyState().version())); doCommit = true; } else { @@ -406,7 +406,7 @@ protected void finishNRTCopy(CopyJob job, long startNS) throws IOException { CopyState copyState = job.getCopyState(); message( "top: finishNRTCopy: version=" - + copyState.version + + copyState.version() + (job.getFailed() ? " FAILED" : "") + " job=" + job); @@ -437,8 +437,8 @@ protected void finishNRTCopy(CopyJob job, long startNS) throws IOException { // Turn byte[] back to SegmentInfos: SegmentInfos infos = - SegmentInfos.readCommit(dir, toIndexInput(copyState.infosBytes), copyState.gen); - assert infos.getVersion() == copyState.version; + SegmentInfos.readCommit(dir, toIndexInput(copyState.infosBytes()), copyState.gen()); + assert infos.getVersion() == copyState.version(); message(" version=" + infos.getVersion() + " segments=" + infos.toString()); @@ -447,7 +447,7 @@ protected void finishNRTCopy(CopyJob job, long startNS) throws IOException { // Must first incRef new NRT files, then decRef old ones, to make sure we don't remove an NRT // file that's in common to both: - Collection newFiles = copyState.files.keySet(); + Collection newFiles = copyState.files().keySet(); message("top: incRef newNRTFiles=" + newFiles); deleter.incRef(newFiles); @@ -467,9 +467,10 @@ protected void finishNRTCopy(CopyJob job, long startNS) throws IOException { // finishes, copies its files out to us, but is then merged away (or dropped due to 100% // deletions) before we ever cutover to it // in an NRT point: - if (copyState.completedMergeFiles.isEmpty() == false) { - message("now remove-if-not-ref'd completed merge files: " + copyState.completedMergeFiles); - for (String fileName : copyState.completedMergeFiles) { + if (copyState.completedMergeFiles().isEmpty() == false) { + message( + "now remove-if-not-ref'd completed merge files: " + copyState.completedMergeFiles()); + for (String fileName : copyState.completedMergeFiles()) { if (pendingMergeFiles.contains(fileName)) { pendingMergeFiles.remove(fileName); deleter.deleteIfNoRef(fileName); @@ -477,7 +478,7 @@ protected void finishNRTCopy(CopyJob job, long startNS) throws IOException { } } 
- lastFileMetaData = copyState.files; + lastFileMetaData = copyState.files(); } int markerCount; @@ -494,7 +495,7 @@ protected void finishNRTCopy(CopyJob job, long startNS) throws IOException { "top: done sync: took %.3fs for %s, opened NRT reader version=%d markerCount=%d", (System.nanoTime() - startNS) / (double) TimeUnit.SECONDS.toNanos(1), bytesToString(job.getTotalBytesCopied()), - copyState.version, + copyState.version(), markerCount)); } @@ -609,7 +610,7 @@ public void run(CopyJob job) { return null; } - assert newPrimaryGen == job.getCopyState().primaryGen; + assert newPrimaryGen == job.getCopyState().primaryGen(); Collection newNRTFiles = job.getFileNames(); @@ -865,8 +866,8 @@ private boolean fileIsIdentical(String fileName, FileMetaData srcMetaData) throw return false; } - if (Arrays.equals(destMetaData.header, srcMetaData.header) == false - || Arrays.equals(destMetaData.footer, srcMetaData.footer) == false) { + if (Arrays.equals(destMetaData.header(), srcMetaData.header()) == false + || Arrays.equals(destMetaData.footer(), srcMetaData.footer()) == false) { // Segment name was reused! This is rare but possible and otherwise devastating: if (Node.VERBOSE_FILES) { message("file " + fileName + ": will copy [header/footer is different]"); diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleCopyJob.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleCopyJob.java index d85b644c0543..0cd8e85420b6 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleCopyJob.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleCopyJob.java @@ -66,13 +66,13 @@ public synchronized void start() throws IOException { c.out.writeByte((byte) 0); c.out.writeString(current.name); c.out.writeVLong(current.getBytesCopied()); - totBytes += current.metaData.length; + totBytes += current.metaData.length(); } for (Map.Entry ent : toCopy) { String fileName = ent.getKey(); FileMetaData metaData = ent.getValue(); - totBytes += metaData.length; + totBytes += metaData.length(); c.out.writeByte((byte) 0); c.out.writeString(fileName); c.out.writeVLong(0); @@ -86,12 +86,12 @@ public synchronized void start() throws IOException { // socket buffering waiting for primary to // send us this length: long len = c.in.readVLong(); - if (len != current.metaData.length) { + if (len != current.metaData.length()) { throw new IllegalStateException( "file " + current.name + ": meta data says length=" - + current.metaData.length + + current.metaData.length() + " but c.in says " + len); } @@ -197,12 +197,12 @@ synchronized boolean visit() throws IOException { FileMetaData metaData = next.getValue(); String fileName = next.getKey(); long len = c.in.readVLong(); - if (len != metaData.length) { + if (len != metaData.length()) { throw new IllegalStateException( "file " + fileName + ": meta data says length=" - + metaData.length + + metaData.length() + " but c.in says " + len); } diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java index 0822b53947d0..8c1f5fd71e3a 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java @@ -388,17 +388,17 @@ private void handleFlush(DataInput topIn, DataOutput topOut, BufferedOutputStrea private static void writeCopyState(CopyState state, DataOutput out) throws 
IOException { // TODO (opto): we could encode to byte[] once when we created the copyState, and then just send // same byts to all replicas... - out.writeVInt(state.infosBytes.length); - out.writeBytes(state.infosBytes, 0, state.infosBytes.length); - out.writeVLong(state.gen); - out.writeVLong(state.version); - TestSimpleServer.writeFilesMetaData(out, state.files); - - out.writeVInt(state.completedMergeFiles.size()); - for (String fileName : state.completedMergeFiles) { + out.writeVInt(state.infosBytes().length); + out.writeBytes(state.infosBytes(), 0, state.infosBytes().length); + out.writeVLong(state.gen()); + out.writeVLong(state.version()); + TestSimpleServer.writeFilesMetaData(out, state.files()); + + out.writeVInt(state.completedMergeFiles().size()); + for (String fileName : state.completedMergeFiles()) { out.writeString(fileName); } - out.writeVLong(state.primaryGen); + out.writeVLong(state.primaryGen()); } /** Called when another node (replica) wants to copy files from us */ @@ -417,7 +417,7 @@ private boolean handleFetchFiles( } else if (b == 1) { // Caller does not have CopyState; we pull the latest one: copyState = getCopyState(); - Thread.currentThread().setName("send-R" + replicaID + "-" + copyState.version); + Thread.currentThread().setName("send-R" + replicaID + "-" + copyState.version()); } else { // Protocol error: throw new IllegalArgumentException("invalid CopyState byte=" + b); diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java index 3af0915ca254..03cb5a6bfdd2 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java @@ -136,7 +136,7 @@ protected CopyJob newCopyJob( c.out.writeByte((byte) 1); c.flush(); copyState = TestSimpleServer.readCopyState(c.in); - files = copyState.files; + files = copyState.files(); } else { c.out.writeByte((byte) 0); copyState = null; diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestSimpleServer.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestSimpleServer.java index edcb09a01b7b..82adf086bf20 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestSimpleServer.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestSimpleServer.java @@ -141,12 +141,12 @@ static void writeFilesMetaData(DataOutput out, Map files) out.writeString(ent.getKey()); FileMetaData fmd = ent.getValue(); - out.writeVLong(fmd.length); - out.writeVLong(fmd.checksum); - out.writeVInt(fmd.header.length); - out.writeBytes(fmd.header, 0, fmd.header.length); - out.writeVInt(fmd.footer.length); - out.writeBytes(fmd.footer, 0, fmd.footer.length); + out.writeVLong(fmd.length()); + out.writeVLong(fmd.checksum()); + out.writeVInt(fmd.header().length); + out.writeBytes(fmd.header(), 0, fmd.header().length); + out.writeVInt(fmd.footer().length); + out.writeBytes(fmd.footer(), 0, fmd.footer().length); } } diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index 93f01e3f96dc..72762fe1c3d2 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -22,5 +22,6 @@ description = 'Various third party contributions and new ideas' dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') + moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') } diff 
--git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 3daace50cee4..f40a05af433a 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -19,6 +19,7 @@ module org.apache.lucene.sandbox { requires org.apache.lucene.core; requires org.apache.lucene.queries; + requires org.apache.lucene.facet; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; @@ -27,6 +28,12 @@ exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; exports org.apache.lucene.sandbox.index; + exports org.apache.lucene.sandbox.facet; + exports org.apache.lucene.sandbox.facet.recorders; + exports org.apache.lucene.sandbox.facet.cutters.ranges; + exports org.apache.lucene.sandbox.facet.iterators; + exports org.apache.lucene.sandbox.facet.cutters; + exports org.apache.lucene.sandbox.facet.labels; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionPostingsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionPostingsWriter.java index 477fa2e4a6c5..1be42194dab5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionPostingsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionPostingsWriter.java @@ -71,7 +71,7 @@ public void setField(FieldInfo fieldInfo) { // LUCENE-5693: because CheckIndex cross-checks term vectors with postings even for deleted // docs, and because our PF only indexes the // non-deleted documents on flush, CheckIndex will see this as corruption: - if (fieldInfo.hasVectors()) { + if (fieldInfo.hasTermVectors()) { throw new IllegalArgumentException( "field cannot index term vectors: CheckIndex will report this as index corruption"); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/VersionBlockTreeTermsWriter.java index b0abc0b5059f..6d498ba950a6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/VersionBlockTreeTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -143,15 +143,14 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { final PostingsWriterBase postingsWriter; final FieldInfos fieldInfos; - private static class FieldMetaData { - public final FieldInfo fieldInfo; - public final Pair rootCode; - public final long numTerms; - public final long indexStartFP; - public final BytesRef minTerm; - public final BytesRef maxTerm; - - public FieldMetaData( + private record FieldMetaData( + FieldInfo fieldInfo, + Pair rootCode, + long numTerms, + long indexStartFP, + BytesRef minTerm, + BytesRef maxTerm) { + private FieldMetaData( FieldInfo fieldInfo, Pair rootCode, long numTerms, diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java index bb9d3ca63df5..88d2adba5fad 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java @@ -25,11 +25,11 @@ import java.util.List; import java.util.Random; import 
java.util.Set; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** KMeans clustering algorithm for vectors */ public class KMeans { @@ -38,7 +38,7 @@ public class KMeans { public static final int DEFAULT_ITRS = 10; public static final int DEFAULT_SAMPLE_SIZE = 100_000; - private final RandomAccessVectorValues.Floats vectors; + private final FloatVectorValues vectors; private final int numVectors; private final int numCentroids; private final Random random; @@ -57,9 +57,7 @@ public class KMeans { * @throws IOException when if there is an error accessing vectors */ public static Results cluster( - RandomAccessVectorValues.Floats vectors, - VectorSimilarityFunction similarityFunction, - int numClusters) + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction, int numClusters) throws IOException { return cluster( vectors, @@ -93,7 +91,7 @@ public static Results cluster( * @throws IOException if there is error accessing vectors */ public static Results cluster( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, int numClusters, boolean assignCentroidsToVectors, long seed, @@ -124,7 +122,7 @@ public static Results cluster( if (numClusters == 1) { centroids = new float[1][vectors.dimension()]; } else { - RandomAccessVectorValues.Floats sampleVectors = + FloatVectorValues sampleVectors = vectors.size() <= sampleSize ? vectors : createSampleReader(vectors, sampleSize, seed); KMeans kmeans = new KMeans(sampleVectors, numClusters, random, initializationMethod, restarts, iters); @@ -142,7 +140,7 @@ public static Results cluster( } private KMeans( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, int numCentroids, Random random, KmeansInitializationMethod initializationMethod, @@ -276,7 +274,7 @@ private float[][] initializePlusPlus() throws IOException { * @throws IOException if there is an error accessing vector values */ private static double runKMeansStep( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, float[][] centroids, short[] docCentroids, boolean useKahanSummation, @@ -348,9 +346,7 @@ private static double runKMeansStep( * descending distance to the current centroid set */ static void assignCentroids( - RandomAccessVectorValues.Floats vectors, - float[][] centroids, - List unassignedCentroidsIdxs) + FloatVectorValues vectors, float[][] centroids, List unassignedCentroidsIdxs) throws IOException { int[] assignedCentroidsIdxs = new int[centroids.length - unassignedCentroidsIdxs.size()]; int assignedIndex = 0; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java index 9a718c811017..684c9fac838f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java @@ -20,18 +20,18 @@ import java.io.IOException; import java.util.Random; import java.util.function.IntUnaryOperator; +import org.apache.lucene.codecs.lucene95.HasIndexSlice; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; -import 
org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** A reader of vector values that samples a subset of the vectors. */ -public class SampleReader implements RandomAccessVectorValues.Floats { - private final RandomAccessVectorValues.Floats origin; +public class SampleReader extends FloatVectorValues implements HasIndexSlice { + private final FloatVectorValues origin; private final int sampleSize; private final IntUnaryOperator sampleFunction; - SampleReader( - RandomAccessVectorValues.Floats origin, int sampleSize, IntUnaryOperator sampleFunction) { + SampleReader(FloatVectorValues origin, int sampleSize, IntUnaryOperator sampleFunction) { this.origin = origin; this.sampleSize = sampleSize; this.sampleFunction = sampleFunction; @@ -48,13 +48,13 @@ public int dimension() { } @Override - public Floats copy() throws IOException { + public FloatVectorValues copy() throws IOException { throw new IllegalStateException("Not supported"); } @Override public IndexInput getSlice() { - return origin.getSlice(); + return ((HasIndexSlice) origin).getSlice(); } @Override @@ -77,8 +77,7 @@ public Bits getAcceptOrds(Bits acceptDocs) { throw new IllegalStateException("Not supported"); } - public static SampleReader createSampleReader( - RandomAccessVectorValues.Floats origin, int k, long seed) { + public static SampleReader createSampleReader(FloatVectorValues origin, int k, long seed) { int[] samples = reservoirSample(origin.size(), k, seed); return new SampleReader(origin, samples.length, i -> samples[i]); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/document/FloatPointNearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/document/FloatPointNearestNeighbor.java index bb18ecc1bcb6..0bd424d93ce0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/document/FloatPointNearestNeighbor.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/document/FloatPointNearestNeighbor.java @@ -40,15 +40,12 @@ */ public class FloatPointNearestNeighbor { - static class Cell implements Comparable { - final int readerIndex; - final byte[] minPacked; - final byte[] maxPacked; - final PointTree index; - - /** The closest possible distance^2 of all points in this cell */ - final double distanceSquared; - + /** + * @param distanceSquared The closest possible distance^2 of all points in this cell + */ + record Cell( + PointTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceSquared) + implements Comparable { Cell( PointTree index, int readerIndex, diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/ComparableUtils.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/ComparableUtils.java new file mode 100644 index 000000000000..aa6e8fd0775a --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/ComparableUtils.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import org.apache.lucene.sandbox.facet.cutters.LongValueFacetCutter; +import org.apache.lucene.sandbox.facet.iterators.ComparableSupplier; +import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.LongAggregationsFacetRecorder; +import org.apache.lucene.util.InPlaceMergeSorter; + +/** + * Collection of static methods to provide most common comparables for sandbox faceting. You can + * also use it as an example for creating your own {@link ComparableSupplier} to enable custom + * facets top-n and sorting. + * + * @lucene.experimental + */ +public final class ComparableUtils { + private ComparableUtils() {} + + /** {@link ComparableSupplier} to sort by ords (ascending). */ + public static ComparableSupplier byOrdinal() { + return new ComparableSupplier<>() { + public void reuseComparable(int ord, ByOrdinalComparable reuse) { + reuse.ord = ord; + } + + public ByOrdinalComparable createComparable(int ord) { + ByOrdinalComparable result = new ByOrdinalComparable(); + result.ord = ord; + return result; + } + }; + } + + /** Used for {@link #byOrdinal} result. */ + public static class ByOrdinalComparable implements Comparable { + + private int ord; + + @Override + public int compareTo(ByOrdinalComparable o) { + return Integer.compare(o.ord, ord); + } + } + + /** + * {@link ComparableSupplier} to sort ordinals by count (descending) with ord as a tie-break + * (ascending) using provided {@link CountFacetRecorder}. + */ + public static ComparableSupplier byCount(CountFacetRecorder recorder) { + return new ComparableSupplier<>() { + public void reuseComparable(int ord, ByCountComparable reuse) { + reuse.ord = ord; + reuse.count = recorder.getCount(ord); + } + + public ByCountComparable createComparable(int ord) { + ByCountComparable result = new ByCountComparable(); + result.ord = ord; + result.count = recorder.getCount(ord); + return result; + } + }; + } + + /** Used for {@link #byCount} result. */ + public static class ByCountComparable implements Comparable { + private ByCountComparable() {} + + private int count; + private int ord; + + @Override + public int compareTo(ByCountComparable o) { + int cmp = Integer.compare(count, o.count); + if (cmp == 0) { + cmp = Integer.compare(o.ord, ord); + } + return cmp; + } + } + + /** + * {@link ComparableSupplier} to sort ordinals by long aggregation (descending) with tie-break by + * count (descending) or by ordinal (ascending) using provided {@link CountFacetRecorder} and + * {@link LongAggregationsFacetRecorder}. 
+ */ + public static ComparableSupplier byAggregatedValue( + CountFacetRecorder countRecorder, + LongAggregationsFacetRecorder longAggregationsFacetRecorder, + int aggregationId) { + return new ComparableSupplier<>() { + public void reuseComparable(int ord, ByAggregatedValueComparable reuse) { + reuse.ord = ord; + reuse.secondaryRank = countRecorder.getCount(ord); + reuse.primaryRank = longAggregationsFacetRecorder.getRecordedValue(ord, aggregationId); + } + + public ByAggregatedValueComparable createComparable(int ord) { + ByAggregatedValueComparable result = new ByAggregatedValueComparable(); + reuseComparable(ord, result); + return result; + } + }; + } + + /** Used for {@link #byAggregatedValue} result. */ + public static class ByAggregatedValueComparable + implements Comparable { + private ByAggregatedValueComparable() {} + + private int ord; + private int secondaryRank; + private long primaryRank; + + @Override + public int compareTo(ByAggregatedValueComparable o) { + int cmp = Long.compare(primaryRank, o.primaryRank); + if (cmp == 0) { + cmp = Integer.compare(secondaryRank, o.secondaryRank); + if (cmp == 0) { + cmp = Integer.compare(o.ord, ord); + } + } + return cmp; + } + } + + /** + * {@link ComparableSupplier} to sort ordinals by long value from {@link LongValueFacetCutter} + * (descending). + */ + public static ComparableSupplier byLongValue( + LongValueFacetCutter longValueFacetCutter) { + return new ComparableSupplier<>() { + public void reuseComparable(int ord, ByLongValueComparable reuse) { + reuse.value = longValueFacetCutter.getValue(ord); + } + + public ByLongValueComparable createComparable(int ord) { + ByLongValueComparable result = new ByLongValueComparable(); + result.value = longValueFacetCutter.getValue(ord); + return result; + } + }; + } + + /** Used for {@link #byLongValue} result. */ + public static final class ByLongValueComparable implements Comparable { + private ByLongValueComparable() {} + + private long value; + + @Override + public int compareTo(ByLongValueComparable o) { + return Long.compare(o.value, value); + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof ByLongValueComparable other) { + return other.value == value; + } + return false; + } + + @Override + public int hashCode() { + return Objects.hash(value); + } + } + + /** + * {@link ComparableSupplier} to sort ordinals by count (descending) from {@link + * CountFacetRecorder} with tie-break by long value (ascending) from {@link LongValueFacetCutter}. + */ + public static ComparableSupplier byCount( + CountFacetRecorder countFacetRecorder, LongValueFacetCutter longValueFacetCutter) { + return new ComparableSupplier<>() { + public void reuseComparable(int ord, ByCountAndLongValueComparable reuse) { + reuse.value = longValueFacetCutter.getValue(ord); + reuse.count = countFacetRecorder.getCount(ord); + } + + public ByCountAndLongValueComparable createComparable(int ord) { + ByCountAndLongValueComparable result = new ByCountAndLongValueComparable(); + reuseComparable(ord, result); + return result; + } + }; + } + + /** Used for {@link #byCount(CountFacetRecorder, LongValueFacetCutter)} result. 
*/ + public static class ByCountAndLongValueComparable + implements Comparable { + private ByCountAndLongValueComparable() {} + + private int count; + private long value; + + @Override + public int compareTo(ByCountAndLongValueComparable o) { + int cmp = Integer.compare(count, o.count); + if (cmp == 0) { + cmp = Long.compare(o.value, value); + } + return cmp; + } + } + + /** + * Sort array of ordinals. + * + *
<p>
    To get top-n ordinals use {@link + * org.apache.lucene.sandbox.facet.iterators.TopnOrdinalIterator} instead. + * + * @param ordinals array of ordinals to sort + * @param comparableSupplier defines sort order + */ + public static > void sort( + int[] ordinals, ComparableSupplier comparableSupplier) throws IOException { + List comparables = new ArrayList<>(ordinals.length); + for (int i = 0; i < ordinals.length; i++) { + comparables.add(comparableSupplier.createComparable(ordinals[i])); + } + new InPlaceMergeSorter() { + @Override + protected void swap(int i, int j) { + int tmp = ordinals[i]; + ordinals[i] = ordinals[j]; + ordinals[j] = tmp; + Collections.swap(comparables, i, j); + } + + @Override + protected int compare(int i, int j) { + return comparables.get(j).compareTo(comparables.get(i)); + } + }.sort(0, ordinals.length); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldCollector.java new file mode 100644 index 000000000000..90158bd06502 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldCollector.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet; + +import java.io.IOException; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.recorders.FacetRecorder; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.LeafCollector; +import org.apache.lucene.search.ScoreMode; + +/** + * {@link Collector} that brings together {@link FacetCutter} and {@link FacetRecorder} to compute + * facets during collection phase. + * + * @lucene.experimental + */ +public final class FacetFieldCollector implements Collector { + private final FacetCutter facetCutter; + private final FacetRecorder facetRecorder; + + /** Collector for cutter+recorder pair. */ + public FacetFieldCollector(FacetCutter facetCutter, FacetRecorder facetRecorder) { + this.facetCutter = facetCutter; + this.facetRecorder = facetRecorder; + } + + @Override + public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { + return new FacetFieldLeafCollector(context, facetCutter, facetRecorder); + } + + @Override + public ScoreMode scoreMode() { + // TODO: Some FacetRecorders might need scores, e.g. to get associated numeric values, see for + // example TaxonomyFacetFloatAssociations. Not sure if anyone actually uses it, because + // FacetsCollectorManager creates FacetsCollector with keepScores: false. 
But if someone needs + // it, we can add boolean needScores method to FacetRecorder interface, return + // ScoreMode.COMPLETE here when the method returns true. FacetRecorders#needScores should be + // implemented on case by case basis, e.g. LongAggregationsFacetRecorder can take it as a + // constuctor argument, and when it's true call LongValues#getValues with the scores. + return ScoreMode.COMPLETE_NO_SCORES; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldCollectorManager.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldCollectorManager.java new file mode 100644 index 000000000000..f78aebcff242 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldCollectorManager.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet; + +import java.io.IOException; +import java.util.Collection; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.recorders.FacetRecorder; +import org.apache.lucene.search.CollectorManager; + +/** + * Collector manager for {@link FacetFieldCollector}. Returns the same extension of {@link + * FacetRecorder} that was used to collect results. + * + * @lucene.experimental + */ +public final class FacetFieldCollectorManager + implements CollectorManager { + + private final FacetCutter facetCutter; + private final V facetRecorder; + + /** Create collector for a cutter + recorder pair */ + public FacetFieldCollectorManager(FacetCutter facetCutter, V facetRecorder) { + this.facetCutter = facetCutter; + this.facetRecorder = facetRecorder; + } + + @Override + public FacetFieldCollector newCollector() throws IOException { + return new FacetFieldCollector(facetCutter, facetRecorder); + } + + @Override + public V reduce(Collection collectors) throws IOException { + facetRecorder.reduce(facetCutter); + return this.facetRecorder; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldLeafCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldLeafCollector.java new file mode 100644 index 000000000000..3361022be8b4 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/FacetFieldLeafCollector.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet; + +import java.io.IOException; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; +import org.apache.lucene.sandbox.facet.recorders.FacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.LeafFacetRecorder; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.LeafCollector; +import org.apache.lucene.search.Scorable; + +/** + * {@link LeafCollector} that for each facet ordinal from {@link LeafFacetCutter} records data with + * {@link LeafFacetRecorder}. + */ +final class FacetFieldLeafCollector implements LeafCollector { + + private final LeafReaderContext context; + private final FacetCutter cutter; + private final FacetRecorder recorder; + private LeafFacetCutter leafCutter; + + private LeafFacetRecorder leafRecorder; + + FacetFieldLeafCollector(LeafReaderContext context, FacetCutter cutter, FacetRecorder recorder) { + this.context = context; + this.cutter = cutter; + this.recorder = recorder; + } + + @Override + public void setScorer(Scorable scorer) throws IOException { + // TODO: see comment in FacetFieldCollector#scoreMode + } + + @Override + public void collect(int doc) throws IOException { + if (leafCutter == null) { + leafCutter = cutter.createLeafCutter(context); + assert leafRecorder == null; + leafRecorder = recorder.getLeafRecorder(context); + } + if (leafCutter.advanceExact(doc)) { + for (int curOrd = leafCutter.nextOrd(); + curOrd != LeafFacetCutter.NO_MORE_ORDS; + curOrd = leafCutter.nextOrd()) { + leafRecorder.record(doc, curOrd); + } + } + } + + @Override + public DocIdSetIterator competitiveIterator() throws IOException { + // TODO: any ideas? + // 1. Docs that have values for the index field we about to facet on + // 2. TK + return LeafCollector.super.competitiveIterator(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/FacetCutter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/FacetCutter.java new file mode 100644 index 000000000000..7f33af313e06 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/FacetCutter.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
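Taken together, FacetFieldCollector and FacetFieldCollectorManager above are the collection-time entry point of this API: the caller picks a FacetCutter (how documents are sliced into ordinals) and a FacetRecorder (what is aggregated per ordinal) and gets the populated recorder back from the searcher. A minimal wiring sketch, assuming the manager is parameterized by the recorder type as its javadoc describes; concrete recorder implementations live elsewhere in this patch and are supplied by the caller:

import java.io.IOException;
import org.apache.lucene.sandbox.facet.FacetFieldCollectorManager;
import org.apache.lucene.sandbox.facet.cutters.FacetCutter;
import org.apache.lucene.sandbox.facet.recorders.FacetRecorder;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

final class FacetCollectionSketch {
  private FacetCollectionSketch() {}

  /** Runs the query and returns the same recorder, populated and reduced across slices. */
  static <R extends FacetRecorder> R collect(
      IndexSearcher searcher, Query query, FacetCutter cutter, R recorder) throws IOException {
    return searcher.search(query, new FacetFieldCollectorManager<>(cutter, recorder));
  }
}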
+ */ +package org.apache.lucene.sandbox.facet.cutters; + +import java.io.IOException; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; + +/** + * Creates {@link LeafFacetCutter} for each leaf. + * + *

    TODO: do we need FacetCutterManager similar to CollectorManager, e.g. is createLeafCutter + * always thread safe? + * + * @lucene.experimental + */ +public interface FacetCutter { + + /** Get cutter for the leaf. */ + LeafFacetCutter createLeafCutter(LeafReaderContext context) throws IOException; + + /** + * For facets that have hierarchy (levels), return all top level dimension ordinals that require + * rollup. + * + *

Rollup is an optimization for facet types that support hierarchy: if a single document + * belongs to at most one node in the hierarchy, we can first record data for these nodes only, + * and then roll up values to parent ordinals. + * + *

    Default implementation returns null, which means that rollup is not needed. + */ + default OrdinalIterator getOrdinalsToRollup() throws IOException { + return null; + } + + /** For facets that have hierarchy (levels), get all children ordinals for given ord. */ + default OrdinalIterator getChildrenOrds(int ord) throws IOException { + return null; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/LeafFacetCutter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/LeafFacetCutter.java new file mode 100644 index 000000000000..fdfe1b3600cb --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/LeafFacetCutter.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.cutters; + +import java.io.IOException; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; + +/** + * Interface to be implemented to cut documents into facets for an index segment (leaf). + * + *

    When {@link #advanceExact(int)} returns true, {@link #nextOrd()} yields all facet ordinals for + * the current document. It is illegal to call {@link #nextOrd()} if {@link #advanceExact(int)} + * returns false. + * + * @lucene.experimental + */ +public interface LeafFacetCutter extends OrdinalIterator { + /** advance to the next doc */ + boolean advanceExact(int doc) throws IOException; +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/LongValueFacetCutter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/LongValueFacetCutter.java new file mode 100644 index 000000000000..1ec32c863755 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/LongValueFacetCutter.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.cutters; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.IntSupplier; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.internal.hppc.IntLongHashMap; +import org.apache.lucene.internal.hppc.LongIntHashMap; +import org.apache.lucene.sandbox.facet.labels.OrdToLabel; + +/** + * {@link FacetCutter} and {@link OrdToLabel} for distinct long values. + * + *

    TODO: This class is quite inefficient. Will optimise later. + * + *

    TODO: add support for other value sources e.g: LongValues + * + * @lucene.experimental + */ +public final class LongValueFacetCutter implements FacetCutter, OrdToLabel { + private final String field; + // TODO: consider alternatives if this is a bottleneck + private final LongIntHashMapSyncCompute valueToOrdMap; + private IntLongHashMap ordToValueMap; + private final AtomicInteger maxOrdinal; + + /** + * Constructor. + * + * @param field field name to read long values from. + */ + public LongValueFacetCutter(String field) { + this.field = field; + valueToOrdMap = new LongIntHashMapSyncCompute(); + ordToValueMap = null; + maxOrdinal = new AtomicInteger(-1); + } + + @Override + public LeafFacetCutter createLeafCutter(LeafReaderContext context) throws IOException { + SortedNumericDocValues docValues = DocValues.getSortedNumeric(context.reader(), field); + return new LeafFacetCutter() { + int docValueCount; + long lastDocValue; + int docValueCursor; + + @Override + public boolean advanceExact(int doc) throws IOException { + if (docValues.advanceExact(doc)) { + docValueCount = docValues.docValueCount(); + docValueCursor = 0; + return true; + } + return false; + } + + @Override + public int nextOrd() throws IOException { + while (docValueCursor++ < docValueCount) { + long value = docValues.nextValue(); + // SortedNumericDocValues can have duplicates, but values are sorted, so we only need to + // check previous value to remove duplicates + if (docValueCursor == 1 || value != lastDocValue) { + lastDocValue = value; + return valueToOrdMap.computeIfAbsent(value, maxOrdinal::incrementAndGet); + } + } + return NO_MORE_ORDS; + } + }; + } + + @Override + public FacetLabel getLabel(int ordinal) { + if (ordToValueMap == null) { + buildOrdToValueMap(); + } + if (ordToValueMap.containsKey(ordinal)) { + return new FacetLabel(String.valueOf(ordToValueMap.get(ordinal))); + } + assert false + : "ordinal=" + + ordinal + + ", ordToValueMap.size=" + + ordToValueMap.size() + + ", valueToOrdMap.size=" + + valueToOrdMap.size(); + return null; + } + + /** + * Get value by ordinal. Should only be called after collection phase. + * + *

    TODO: we need it to tie break sort by value. Alternatively we can sort by label (then we + * don't need this method), but we would have to convert FacetLabel to "long" to have the same + * order... Overall, it is probably not important to tie break by value, and we can tie break by + * ord same as for other facets; but for now we don't want to change results order just in case. + * + * @param ordinal facet ordinal. + * @return long value + */ + public long getValue(int ordinal) { + // TODO: do we want to create #finish method that called by #reduce to build the map? + if (ordToValueMap == null) { + buildOrdToValueMap(); + } + return ordToValueMap.get(ordinal); + } + + private void buildOrdToValueMap() { + ordToValueMap = new IntLongHashMap(valueToOrdMap.size()); + for (LongIntHashMap.LongIntCursor cursor : valueToOrdMap) { + ordToValueMap.put(cursor.value, cursor.key); + } + } + + @Override + public FacetLabel[] getLabels(int[] ordinals) throws IOException { + FacetLabel[] facetLabels = new FacetLabel[ordinals.length]; + for (int i = 0; i < ordinals.length; i++) { + facetLabels[i] = getLabel(ordinals[i]); + } + return facetLabels; + } + + /** {@link LongIntHashMap} with threadsafe computeIfAbsent method */ + private static class LongIntHashMapSyncCompute extends LongIntHashMap { + private final ReentrantReadWriteLock rwl = new ReentrantReadWriteLock(); + private final Lock r = rwl.readLock(); + private final Lock w = rwl.writeLock(); + + /** + * If key exists in the map return its value, otherwise insert value from the value supplier and + * return it. + * + *

    The method is threadsafe, and it allows concurrent reading from the map, but it locks the + * map to insert a new value as it might require rehashing. + */ + public int computeIfAbsent(long key, IntSupplier valueSupplier) { + r.lock(); + int value; + try { + value = super.getOrDefault(key, -1); + } finally { + r.unlock(); + } + if (value == -1) { + w.lock(); + try { + int index = super.indexOf(key); + if (super.indexExists(index)) { + return super.indexGet(index); + } else { + value = valueSupplier.getAsInt(); + super.indexInsert(index, key, value); + return value; + } + } finally { + w.unlock(); + } + } else { + return value; + } + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/TaxonomyFacetsCutter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/TaxonomyFacetsCutter.java new file mode 100644 index 000000000000..cbefdd8feeba --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/TaxonomyFacetsCutter.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.cutters; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Map; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; + +/** + * {@link FacetCutter} for facets that use taxonomy side-car index. + * + * @lucene.experimental + */ +public final class TaxonomyFacetsCutter implements FacetCutter { + + private final FacetsConfig facetsConfig; + private final TaxonomyReader taxoReader; + private final String indexFieldName; + private final boolean disableRollup; + + private ParallelTaxonomyArrays.IntArray children; + private ParallelTaxonomyArrays.IntArray siblings; + + /** Create {@link FacetCutter} for taxonomy facets. */ + public TaxonomyFacetsCutter( + String indexFieldName, FacetsConfig facetsConfig, TaxonomyReader taxoReader) { + this(indexFieldName, facetsConfig, taxoReader, false); + } + + /** + * Expert: Create {@link FacetCutter} for taxonomy facets. + * + * @param disableRollup if set to true, rollup is disabled. In most cases users should not use it. + * Setting it to true silently leads to incorrect results for dimensions that require rollup. + * At the same time, if you are sure that there are no dimensions that require rollup, setting + * it to true might improve performance. 
+ */ + public TaxonomyFacetsCutter( + String indexFieldName, + FacetsConfig facetsConfig, + TaxonomyReader taxoReader, + boolean disableRollup) { + this.facetsConfig = facetsConfig; + this.indexFieldName = indexFieldName; + this.taxoReader = taxoReader; + this.disableRollup = disableRollup; + } + + /** + * Returns int[] mapping each ordinal to its first child; this is a large array and is computed + * (and then saved) the first time this method is invoked. + */ + ParallelTaxonomyArrays.IntArray getChildren() throws IOException { + if (children == null) { + children = taxoReader.getParallelTaxonomyArrays().children(); + } + return children; + } + + /** + * Returns int[] mapping each ordinal to its next sibling; this is a large array and is computed + * (and then saved) the first time this method is invoked. + */ + ParallelTaxonomyArrays.IntArray getSiblings() throws IOException { + if (siblings == null) { + siblings = taxoReader.getParallelTaxonomyArrays().siblings(); + } + return siblings; + } + + @Override + public LeafFacetCutter createLeafCutter(LeafReaderContext context) throws IOException { + SortedNumericDocValues multiValued = + DocValues.getSortedNumeric(context.reader(), indexFieldName); + // DocValues.getSortedNumeric never returns null + assert multiValued != null; + // TODO: if multiValued is emptySortedNumeric we can throw CollectionTerminatedException + // in FacetFieldLeafCollector and save some CPU cycles. + TaxonomyLeafFacetCutterMultiValue leafCutter = + new TaxonomyLeafFacetCutterMultiValue(multiValued); + return leafCutter; + + // TODO: does unwrapping Single valued make things any faster? We still need to wrap it into + // LeafFacetCutter + // NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued); + } + + @Override + public OrdinalIterator getOrdinalsToRollup() throws IOException { + if (disableRollup) { + return null; + } + + // Rollup any necessary dims: + Iterator> dimensions = + facetsConfig.getDimConfigs().entrySet().iterator(); + + ArrayList dimsToRollup = new ArrayList<>(); + + while (dimensions.hasNext()) { + Map.Entry ent = dimensions.next(); + String dim = ent.getKey(); + FacetsConfig.DimConfig ft = ent.getValue(); + if (ft.hierarchical && ft.multiValued == false && ft.indexFieldName.equals(indexFieldName)) { + dimsToRollup.add(new FacetLabel(dim)); + } + } + + int[] dimOrdToRollup = taxoReader.getBulkOrdinals(dimsToRollup.toArray(new FacetLabel[0])); + + return new OrdinalIterator() { + int currentIndex = 0; + + @Override + public int nextOrd() throws IOException { + for (; currentIndex < dimOrdToRollup.length; currentIndex++) { + // It can be invalid if this field was declared in the + // config but never indexed + if (dimOrdToRollup[currentIndex] != TaxonomyReader.INVALID_ORDINAL) { + return dimOrdToRollup[currentIndex++]; + } + } + return NO_MORE_ORDS; + } + }; + } + + @Override + public OrdinalIterator getChildrenOrds(final int parentOrd) throws IOException { + ParallelTaxonomyArrays.IntArray children = getChildren(); + ParallelTaxonomyArrays.IntArray siblings = getSiblings(); + return new OrdinalIterator() { + int currentChild = parentOrd; + + @Override + public int nextOrd() { + if (currentChild == parentOrd) { + currentChild = children.get(currentChild); + } else { + currentChild = siblings.get(currentChild); + } + if (currentChild != TaxonomyReader.INVALID_ORDINAL) { + return currentChild; + } + return NO_MORE_ORDS; + } + }; + } + + private static class TaxonomyLeafFacetCutterMultiValue implements LeafFacetCutter { + private final 
SortedNumericDocValues multiValued; + private int ordsInDoc; + + private TaxonomyLeafFacetCutterMultiValue(SortedNumericDocValues multiValued) { + this.multiValued = multiValued; + } + + @Override + public int nextOrd() throws IOException { + if (ordsInDoc > 0) { + ordsInDoc--; + return (int) multiValued.nextValue(); + } + return LeafFacetCutter.NO_MORE_ORDS; + } + + @Override + public boolean advanceExact(int doc) throws IOException { + if (multiValued.advanceExact(doc)) { + ordsInDoc = multiValued.docValueCount(); + return true; + } + return false; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/package-info.java new file mode 100644 index 000000000000..83e72212fe29 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Sandbox faceting: facet cutters, see {@link org.apache.lucene.sandbox.facet.cutters.FacetCutter} + * for details. + * + * @lucene.experimental + */ +package org.apache.lucene.sandbox.facet.cutters; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/DoubleRangeFacetCutter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/DoubleRangeFacetCutter.java new file mode 100644 index 000000000000..69c295355a33 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/DoubleRangeFacetCutter.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
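As a usage sketch for the taxonomy cutter above: construction only needs the sidecar index field name, the FacetsConfig, and a TaxonomyReader, with rollup left enabled. This assumes facets were indexed through FacetsConfig into the default sidecar field and that the application supplies the reader:

import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.sandbox.facet.cutters.TaxonomyFacetsCutter;

final class TaxonomyCutterSketch {
  private TaxonomyCutterSketch() {}

  /** Cutter over the default "$facets" sidecar field; rollup stays enabled (the safe default). */
  static TaxonomyFacetsCutter defaultTaxonomyCutter(FacetsConfig config, TaxonomyReader taxoReader) {
    return new TaxonomyFacetsCutter(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, config, taxoReader);
  }
}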
+ */ +package org.apache.lucene.sandbox.facet.cutters.ranges; + +import java.io.IOException; +import org.apache.lucene.facet.MultiDoubleValuesSource; +import org.apache.lucene.facet.MultiLongValuesSource; +import org.apache.lucene.facet.range.DoubleRange; +import org.apache.lucene.facet.range.DoubleRangeFacetCounts; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; +import org.apache.lucene.search.DoubleValuesSource; +import org.apache.lucene.search.LongValuesSource; +import org.apache.lucene.util.NumericUtils; + +/** + * {@link FacetCutter} for ranges of double values. + * + *

    Based on {@link DoubleRangeFacetCounts}, this class translates double ranges to long ranges + * using {@link NumericUtils#doubleToSortableLong} and delegates faceting work to a {@link + * LongRangeFacetCutter}. + * + * @lucene.experimental + */ +public final class DoubleRangeFacetCutter implements FacetCutter { + + private final LongRangeFacetCutter longRangeFacetCutter; + + /** Constructor. */ + public DoubleRangeFacetCutter( + MultiDoubleValuesSource multiDoubleValuesSource, DoubleRange[] doubleRanges) { + super(); + DoubleValuesSource singleDoubleValuesSource = + MultiDoubleValuesSource.unwrapSingleton(multiDoubleValuesSource); + LongValuesSource singleLongValuesSource; + MultiLongValuesSource multiLongValuesSource; + if (singleDoubleValuesSource != null) { + singleLongValuesSource = singleDoubleValuesSource.toSortableLongDoubleValuesSource(); + multiLongValuesSource = null; + } else { + singleLongValuesSource = null; + multiLongValuesSource = multiDoubleValuesSource.toSortableMultiLongValuesSource(); + } + LongRange[] longRanges = mapDoubleRangesToSortableLong(doubleRanges); + // TODO: instead of relying on either single value source or multi value source to be null, we + // should create different factory methods for single and multi valued versions and use the + // right one + this.longRangeFacetCutter = + LongRangeFacetCutter.createSingleOrMultiValued( + multiLongValuesSource, singleLongValuesSource, longRanges); + } + + @Override + public LeafFacetCutter createLeafCutter(LeafReaderContext context) throws IOException { + return longRangeFacetCutter.createLeafCutter(context); + } + + // TODO: it is exactly the same as DoubleRangeFacetCounts#getLongRanges (protected), we should + // dedup + private LongRange[] mapDoubleRangesToSortableLong(DoubleRange[] doubleRanges) { + LongRange[] longRanges = new LongRange[doubleRanges.length]; + for (int i = 0; i < longRanges.length; i++) { + DoubleRange dr = doubleRanges[i]; + longRanges[i] = + new LongRange( + dr.label, + NumericUtils.doubleToSortableLong(dr.min), + true, + NumericUtils.doubleToSortableLong(dr.max), + true); + } + return longRanges; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/IntervalTracker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/IntervalTracker.java new file mode 100644 index 000000000000..f3b11f56296f --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/IntervalTracker.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
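A sketch of building the double-range cutter above. The field name and range boundaries are hypothetical, and MultiDoubleValuesSource.fromDoubleField is assumed to be the existing facet value-source factory for a multi-valued double doc values field:

import org.apache.lucene.facet.MultiDoubleValuesSource;
import org.apache.lucene.facet.range.DoubleRange;
import org.apache.lucene.sandbox.facet.cutters.ranges.DoubleRangeFacetCutter;

final class DoubleRangeCutterSketch {
  private DoubleRangeCutterSketch() {}

  /** Three requested ranges over a hypothetical "price" field; their ordinals are 0, 1, 2 in request order. */
  static DoubleRangeFacetCutter priceRangesCutter() {
    DoubleRange[] ranges = {
      new DoubleRange("cheap", 0.0, true, 10.0, false),
      new DoubleRange("moderate", 10.0, true, 100.0, false),
      new DoubleRange("expensive", 100.0, true, Double.POSITIVE_INFINITY, true),
    };
    return new DoubleRangeFacetCutter(MultiDoubleValuesSource.fromDoubleField("price"), ranges);
  }
}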
+ */ +package org.apache.lucene.sandbox.facet.cutters.ranges; + +import java.io.IOException; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; +import org.apache.lucene.util.FixedBitSet; + +/** + * A specialised ordinal iterator that supports write (set and clear) operations. Clients can write + * data and freeze the state before reading data from it like any other OrdinalIterator. Instances + * may be reused by clearing the current iterator E.g. LongRangeFacetCutter uses IntervalTracker + * instances to map ranges to ordinals and track per-range data and retrieve recorded ranges for a + * data set. + * + * @lucene.experimental + */ +interface IntervalTracker extends OrdinalIterator { + /** track information for the seen input interval * */ + void set(int i); + + /** clear recorded information on this tracker. * */ + void clear(); + + /** check if any data for the interval has been recorded * */ + boolean get(int index); + + /** finalise any state before read operations can be performed on this OrdinalIterator */ + void freeze(); + + /** + * Interval Tracker that tracks data for multiple intervals. The interval is recorded only once + * iff data belonging to the interval is encountered * + */ + class MultiIntervalTracker implements IntervalTracker { + + private FixedBitSet tracker; + private int trackerState; + private int bitFrom; + + private int intervalsWithHit; + + MultiIntervalTracker(int size) { + tracker = new FixedBitSet(size); + } + + @Override + public void set(int i) { + tracker.set(i); + } + + @Override + public void clear() { + tracker.clear(); + bitFrom = 0; + trackerState = 0; + intervalsWithHit = 0; + } + + @Override + public boolean get(int index) { + return tracker.get(index); + } + + @Override + public void freeze() { + intervalsWithHit = tracker.cardinality(); + } + + @Override + public int nextOrd() throws IOException { + if (trackerState == intervalsWithHit) { + return NO_MORE_ORDS; + } + trackerState++; + int nextSetBit = tracker.nextSetBit(bitFrom); + bitFrom = nextSetBit + 1; + return nextSetBit; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/LongRangeFacetCutter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/LongRangeFacetCutter.java new file mode 100644 index 000000000000..49fd4b1317f0 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/LongRangeFacetCutter.java @@ -0,0 +1,320 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.facet.cutters.ranges; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import org.apache.lucene.facet.MultiLongValues; +import org.apache.lucene.facet.MultiLongValuesSource; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; +import org.apache.lucene.search.LongValues; +import org.apache.lucene.search.LongValuesSource; + +/** + * {@link FacetCutter} for ranges of long values. It's based on LongRangeCounter class. + * + * @lucene.experimental + */ +public abstract class LongRangeFacetCutter implements FacetCutter { + + final MultiLongValuesSource valuesSource; + + // TODO: refactor - weird that we have both multi and single here. + final LongValuesSource singleValues; + final LongRangeAndPos[] sortedRanges; + + final int requestedRangeCount; + + final List elementaryIntervals; + + /** elementary interval boundaries used for efficient counting (bsearch to find interval) */ + final long[] boundaries; + + final int[] pos; + + // Default interval position, when elementary interval is mapped to this interval + // it is skipped. + static final int SKIP_INTERVAL_POSITION = -1; + + /** Create {@link FacetCutter} for provided value source and long ranges. */ + static LongRangeFacetCutter createSingleOrMultiValued( + MultiLongValuesSource longValuesSource, + LongValuesSource singleLongValuesSource, + LongRange[] longRanges) { + if (areOverlappingRanges(longRanges)) { + return new OverlappingLongRangeFacetCutter( + longValuesSource, singleLongValuesSource, longRanges); + } + return new NonOverlappingLongRangeFacetCutter( + longValuesSource, singleLongValuesSource, longRanges); + } + + public static LongRangeFacetCutter create( + MultiLongValuesSource longValuesSource, LongRange[] longRanges) { + return createSingleOrMultiValued(longValuesSource, null, longRanges); + } + + // caller handles conversion of Doubles and DoubleRange to Long and LongRange + // ranges need not be sorted + LongRangeFacetCutter( + MultiLongValuesSource longValuesSource, + LongValuesSource singleLongValuesSource, + LongRange[] longRanges) { + super(); + valuesSource = longValuesSource; + if (singleLongValuesSource != null) { + singleValues = singleLongValuesSource; + } else { + singleValues = MultiLongValuesSource.unwrapSingleton(valuesSource); + } + + sortedRanges = new LongRangeAndPos[longRanges.length]; + requestedRangeCount = longRanges.length; + + for (int i = 0; i < longRanges.length; i++) { + sortedRanges[i] = new LongRangeAndPos(longRanges[i], i); + } + + Arrays.sort(this.sortedRanges, Comparator.comparingLong(r -> r.range.min)); + elementaryIntervals = buildElementaryIntervals(); + + // Keep track of elementary interval boundary ends (for binary search) along with the requested + // range they map back to (and -1 when they map to a "gap" range in case of ExclusiveRanges): + boundaries = new long[elementaryIntervals.size()]; + pos = new int[elementaryIntervals.size()]; + Arrays.fill(pos, SKIP_INTERVAL_POSITION); + int currRange = 0; + for (int i = 0; i < boundaries.length; i++) { + boundaries[i] = elementaryIntervals.get(i).end; + if (currRange < sortedRanges.length) { + LongRangeAndPos curr = sortedRanges[currRange]; + if (boundaries[i] == curr.range.max) { + pos[i] = curr.pos; + currRange++; + } + } + } + } + + /** + * Generates non-overlapping intervals that cover requested ranges and gaps 
in-between. Each + * elementary range refers to a gap, single requested range, or multiple requested ranges when + * they overlap. + */ + abstract List buildElementaryIntervals(); + + private static boolean areOverlappingRanges(LongRange[] ranges) { + if (ranges.length == 0) { + return false; + } + + // Copy before sorting so we don't mess with the caller's original ranges: + // TODO: We're going to do this again in the constructor. Can't we come up with a clever way to + // avoid doing it twice? + LongRange[] sortedRanges = new LongRange[ranges.length]; + System.arraycopy(ranges, 0, sortedRanges, 0, ranges.length); + Arrays.sort(sortedRanges, Comparator.comparingLong(r -> r.min)); + + long previousMax = sortedRanges[0].max; + for (int i = 1; i < sortedRanges.length; i++) { + // Ranges overlap if the next min is <= the previous max (note that LongRange models + // closed ranges, so equal limit points are considered overlapping): + if (sortedRanges[i].min <= previousMax) { + return true; + } + previousMax = sortedRanges[i].max; + } + + return false; + } + + abstract static class LongRangeMultivaluedLeafFacetCutter implements LeafFacetCutter { + private final MultiLongValues multiLongValues; + private final long[] boundaries; + final int[] pos; + final IntervalTracker elementaryIntervalTracker; + + // TODO: we need it only for overlapping ranges, should not handle it in advanceExact for + // exclusive ranges. + IntervalTracker requestedIntervalTracker; + + LongRangeMultivaluedLeafFacetCutter(MultiLongValues longValues, long[] boundaries, int[] pos) { + this.multiLongValues = longValues; + this.boundaries = boundaries; + this.pos = pos; + elementaryIntervalTracker = new IntervalTracker.MultiIntervalTracker(boundaries.length); + } + + @Override + public boolean advanceExact(int doc) throws IOException { + if (multiLongValues.advanceExact(doc) == false) { + return false; + } + + elementaryIntervalTracker.clear(); + + if (requestedIntervalTracker != null) { + requestedIntervalTracker.clear(); + } + + long numValues = multiLongValues.getValueCount(); + + int lastIntervalSeen = -1; + + for (int i = 0; i < numValues; i++) { + lastIntervalSeen = processValue(multiLongValues.nextValue(), lastIntervalSeen); + assert lastIntervalSeen >= 0 && lastIntervalSeen < boundaries.length; + elementaryIntervalTracker.set(lastIntervalSeen); + if (lastIntervalSeen == boundaries.length - 1) { + // we've already reached the end of all possible intervals for this doc + break; + } + } + maybeRollUp(requestedIntervalTracker); + + elementaryIntervalTracker.freeze(); + + if (requestedIntervalTracker != null) { + requestedIntervalTracker.freeze(); + } + + return true; + } + + // Returns the value of the interval v belongs or lastIntervalSeen + // if no processing is done, it returns the lastIntervalSeen + private int processValue(long v, int lastIntervalSeen) { + int lo = 0, hi = boundaries.length - 1; + + if (lastIntervalSeen != -1) { + // this is the multivalued doc case, we need to set lo correctly + if (v <= boundaries[lastIntervalSeen]) { + // we've already counted something for this interval and doc + // we don't need to process v + return lastIntervalSeen; + } + + lo = lastIntervalSeen + 1; + if (lo == boundaries.length) { + // we've already counted the last elementary interval. If so, there's nothing + // else to count for this doc + // TODO: does it make sense to return something else? 
+ return lastIntervalSeen; + } + } + int lowerBound = lo; + + while (true) { + int mid = (lo + hi) >>> 1; + if (v <= boundaries[mid]) { + if (mid == lowerBound) { + return mid; + } else { + hi = mid - 1; + } + } else if (v > boundaries[mid + 1]) { + lo = mid + 1; + } else { + return mid + 1; + } + } + } + + void maybeRollUp(IntervalTracker rollUpInto) {} + } + + abstract static class LongRangeSingleValuedLeafFacetCutter implements LeafFacetCutter { + private final LongValues longValues; + private final long[] boundaries; + final int[] pos; + int elementaryIntervalOrd; + + IntervalTracker requestedIntervalTracker; + + LongRangeSingleValuedLeafFacetCutter(LongValues longValues, long[] boundaries, int[] pos) { + this.longValues = longValues; + this.boundaries = boundaries; + this.pos = pos; + } + + @Override + public boolean advanceExact(int doc) throws IOException { + if (longValues.advanceExact(doc) == false) { + return false; + } + if (requestedIntervalTracker != null) { + requestedIntervalTracker.clear(); + } + elementaryIntervalOrd = processValue(longValues.longValue()); + maybeRollUp(requestedIntervalTracker); + if (requestedIntervalTracker != null) { + requestedIntervalTracker.freeze(); + } + + return true; + } + + // Returns the value of the interval v belongs or lastIntervalSeen + // if no processing is done, it returns the lastIntervalSeen + private int processValue(long v) { + int lo = 0, hi = boundaries.length - 1; + + int lowerBound = lo; + + while (true) { + int mid = (lo + hi) >>> 1; + if (v <= boundaries[mid]) { + if (mid == lowerBound) { + return mid; + } else { + hi = mid - 1; + } + } else if (v > boundaries[mid + 1]) { + lo = mid + 1; + } else { + return mid + 1; + } + } + } + + void maybeRollUp(IntervalTracker rollUpInto) {} + } + + record LongRangeAndPos(LongRange range, int pos) { + @Override + public String toString() { + return "LongRangeAndPos[" + "range=" + range + ", " + "pos=" + pos + ']'; + } + } + + /** + * Similar to InclusiveRange from LongRangeCounter. + * + *

    TODO: dedup + */ + record InclusiveRange(long start, long end) { + + @Override + public String toString() { + return start + " to " + end; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/LongRangeNode.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/LongRangeNode.java new file mode 100644 index 000000000000..576d6ecf465d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/LongRangeNode.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.cutters.ranges; + +import org.apache.lucene.internal.hppc.IntArrayList; + +/** + * Holds one node of the segment tree. + * + *

    TODO: dedup existing LongRangeNode. + */ +final class LongRangeNode { + final LongRangeNode left; + final LongRangeNode right; + + // Our range, inclusive: + final long start; + final long end; + + // Which range indices to output when a query goes + // through this node: + IntArrayList outputs; + + /** add doc * */ + LongRangeNode(long start, long end, LongRangeNode left, LongRangeNode right) { + this.start = start; + this.end = end; + this.left = left; + this.right = right; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + toString(sb, 0); + return sb.toString(); + } + + private static void indent(StringBuilder sb, int depth) { + sb.append(" ".repeat(depth)); + } + + /** Recursively assigns range outputs to each node. */ + public void addOutputs(LongRangeFacetCutter.LongRangeAndPos range) { + if (start >= range.range().min && end <= range.range().max) { + // Our range is fully included in the incoming + // range; add to our output list: + if (outputs == null) { + outputs = new IntArrayList(); + } + outputs.add(range.pos()); + } else if (left != null) { + assert right != null; + // Recurse: + left.addOutputs(range); + right.addOutputs(range); + } + } + + private void toString(StringBuilder sb, int depth) { + indent(sb, depth); + if (left == null) { + assert right == null; + sb.append("leaf: ").append(start).append(" to ").append(end); + } else { + sb.append("node: ").append(start).append(" to ").append(end); + } + if (outputs != null) { + sb.append(" outputs="); + sb.append(outputs); + } + sb.append('\n'); + + if (left != null) { + assert right != null; + left.toString(sb, depth + 1); + right.toString(sb, depth + 1); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/NonOverlappingLongRangeFacetCutter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/NonOverlappingLongRangeFacetCutter.java new file mode 100644 index 000000000000..3d657a96570d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/NonOverlappingLongRangeFacetCutter.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.cutters.ranges; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.facet.MultiLongValues; +import org.apache.lucene.facet.MultiLongValuesSource; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; +import org.apache.lucene.search.LongValues; +import org.apache.lucene.search.LongValuesSource; + +/** {@link LongRangeFacetCutter} for ranges of long value that don't overlap. 
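To make the elementary-interval construction below concrete, here is a small worked example with hypothetical ranges: for requested ranges [0..10] and [20..30], buildElementaryIntervals produces gap intervals around and between the requested ones, and only intervals whose upper boundary equals a requested range's max map back to a requested ordinal; the rest keep SKIP_INTERVAL_POSITION.

// Hypothetical requested ranges (request order defines the facet ordinals 0 and 1):
//   "low"  = [0 .. 10]
//   "high" = [20 .. 30]
//
// Elementary intervals built by the non-overlapping cutter below:
//   [Long.MIN_VALUE .. -1]   -> gap, pos = SKIP_INTERVAL_POSITION
//   [0 .. 10]                -> pos = 0 ("low")
//   [11 .. 19]               -> gap, pos = SKIP_INTERVAL_POSITION
//   [20 .. 30]               -> pos = 1 ("high")
//   [31 .. Long.MAX_VALUE]   -> gap, pos = SKIP_INTERVAL_POSITION
//
// boundaries[] holds the interval ends {-1, 10, 19, 30, Long.MAX_VALUE}; a binary search over
// boundaries maps each document value to one elementary interval, and pos[] maps that interval
// back to the requested range ordinal (or skips it for gaps).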
* */ +class NonOverlappingLongRangeFacetCutter extends LongRangeFacetCutter { + NonOverlappingLongRangeFacetCutter( + MultiLongValuesSource longValuesSource, + LongValuesSource singleLongValuesSource, + LongRange[] longRanges) { + super(longValuesSource, singleLongValuesSource, longRanges); + } + + /** + * TODO: it's identical to private ExclusiveLongRangeCounter#buildElementaryIntervals, let's + * dedup. + */ + @Override + List buildElementaryIntervals() { + List elementaryIntervals = new ArrayList<>(); + long prev = Long.MIN_VALUE; + for (LongRangeAndPos range : sortedRanges) { + if (range.range().min > prev) { + // add a "gap" range preceding requested range if necessary: + elementaryIntervals.add(new InclusiveRange(prev, range.range().min - 1)); + } + // add the requested range: + elementaryIntervals.add(new InclusiveRange(range.range().min, range.range().max)); + prev = range.range().max + 1; + } + if (elementaryIntervals.isEmpty() == false) { + long lastEnd = elementaryIntervals.get(elementaryIntervals.size() - 1).end(); + if (lastEnd < Long.MAX_VALUE) { + elementaryIntervals.add(new InclusiveRange(lastEnd + 1, Long.MAX_VALUE)); + } + } else { + // If no ranges were requested, create a single entry from MIN_VALUE to MAX_VALUE: + elementaryIntervals.add(new InclusiveRange(Long.MIN_VALUE, Long.MAX_VALUE)); + } + + return elementaryIntervals; + } + + @Override + public LeafFacetCutter createLeafCutter(LeafReaderContext context) throws IOException { + if (singleValues != null) { + LongValues values = singleValues.getValues(context, null); + return new NonOverlappingLongRangeSingleValueLeafFacetCutter(values, boundaries, pos); + } else { + MultiLongValues values = valuesSource.getValues(context); + return new NonOverlappingLongRangeMultiValueLeafFacetCutter(values, boundaries, pos); + } + } + + /** + * TODO: dedup NonOverlappingLongRangeMultiValueLeafFacetCutter and + * NonOverlappingLongRangeSingleValueLeafFacetCutter code - they are similar but they extend + * different base classes. + */ + static class NonOverlappingLongRangeMultiValueLeafFacetCutter + extends LongRangeMultivaluedLeafFacetCutter { + + NonOverlappingLongRangeMultiValueLeafFacetCutter( + MultiLongValues longValues, long[] boundaries, int[] pos) { + super(longValues, boundaries, pos); + } + + @Override + public int nextOrd() throws IOException { + while (true) { + int ordinal = elementaryIntervalTracker.nextOrd(); + if (ordinal == NO_MORE_ORDS) { + return NO_MORE_ORDS; + } + int result = pos[ordinal]; + if (result != SKIP_INTERVAL_POSITION) { + return result; + } + } + } + } + + static class NonOverlappingLongRangeSingleValueLeafFacetCutter + extends LongRangeSingleValuedLeafFacetCutter { + NonOverlappingLongRangeSingleValueLeafFacetCutter( + LongValues longValues, long[] boundaries, int[] pos) { + super(longValues, boundaries, pos); + } + + @Override + public int nextOrd() throws IOException { + if (elementaryIntervalOrd == NO_MORE_ORDS) { + return NO_MORE_ORDS; + } + int result = pos[elementaryIntervalOrd]; + elementaryIntervalOrd = NO_MORE_ORDS; + return result != SKIP_INTERVAL_POSITION ? 
result : NO_MORE_ORDS; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/OverlappingLongRangeFacetCutter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/OverlappingLongRangeFacetCutter.java new file mode 100644 index 000000000000..58586db892f7 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/OverlappingLongRangeFacetCutter.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.cutters.ranges; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.lucene.facet.MultiLongValues; +import org.apache.lucene.facet.MultiLongValuesSource; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.internal.hppc.IntCursor; +import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; +import org.apache.lucene.search.LongValues; +import org.apache.lucene.search.LongValuesSource; + +/** + * {@link LongRangeFacetCutter} for ranges of long value that overlap. Uses segment tree + * optimisation to find all matching ranges for a given value fast-range-faceting- + * using-segment-trees.html + */ +class OverlappingLongRangeFacetCutter extends LongRangeFacetCutter { + private final LongRangeNode root; + + OverlappingLongRangeFacetCutter( + MultiLongValuesSource longValuesSource, + LongValuesSource singleLongValuesSource, + LongRange[] longRanges) { + super(longValuesSource, singleLongValuesSource, longRanges); + + // Build binary tree on top of intervals: + root = split(0, elementaryIntervals.size(), elementaryIntervals); + + // Set outputs, so we know which range to output for each node in the tree: + for (LongRangeAndPos range : sortedRanges) { + root.addOutputs(range); + } + } + + /** + * TODO: it's identical to private OverlappingLongRangeCounter#buildElementaryIntervals, let's + * dedup. + */ + @Override + List buildElementaryIntervals() { + // Maps all range inclusive endpoints to int flags; 1 + // = start of interval, 2 = end of interval. 
We need to + // track the start vs end case separately because if a + // given point is both, then it must be its own + // elementary interval: + Map endsMap = new HashMap<>(); + + endsMap.put(Long.MIN_VALUE, 1); + endsMap.put(Long.MAX_VALUE, 2); + + for (LongRangeAndPos rangeAndPos : sortedRanges) { + Integer cur = endsMap.get(rangeAndPos.range().min); + if (cur == null) { + endsMap.put(rangeAndPos.range().min, 1); + } else { + endsMap.put(rangeAndPos.range().min, cur | 1); + } + cur = endsMap.get(rangeAndPos.range().max); + if (cur == null) { + endsMap.put(rangeAndPos.range().max, 2); + } else { + endsMap.put(rangeAndPos.range().max, cur | 2); + } + } + + List endsList = new ArrayList<>(endsMap.keySet()); + Collections.sort(endsList); + + // Build elementaryIntervals (a 1D Venn diagram): + List elementaryIntervals = new ArrayList<>(); + int upto = 1; + long v = endsList.get(0); + long prev; + if (endsMap.get(v) == 3) { + elementaryIntervals.add(new InclusiveRange(v, v)); + prev = v + 1; + } else { + prev = v; + } + + while (upto < endsList.size()) { + v = endsList.get(upto); + int flags = endsMap.get(v); + if (flags == 3) { + // This point is both an end and a start; we need to + // separate it: + if (v > prev) { + elementaryIntervals.add(new InclusiveRange(prev, v - 1)); + } + elementaryIntervals.add(new InclusiveRange(v, v)); + prev = v + 1; + } else if (flags == 1) { + // This point is only the start of an interval; + // attach it to next interval: + if (v > prev) { + elementaryIntervals.add(new InclusiveRange(prev, v - 1)); + } + prev = v; + } else { + assert flags == 2; + // This point is only the end of an interval; attach + // it to last interval: + elementaryIntervals.add(new InclusiveRange(prev, v)); + prev = v + 1; + } + upto++; + } + + return elementaryIntervals; + } + + private static LongRangeNode split(int start, int end, List elementaryIntervals) { + if (start == end - 1) { + // leaf + InclusiveRange range = elementaryIntervals.get(start); + return new LongRangeNode(range.start(), range.end(), null, null); + } else { + int mid = (start + end) >>> 1; + LongRangeNode left = split(start, mid, elementaryIntervals); + LongRangeNode right = split(mid, end, elementaryIntervals); + return new LongRangeNode(left.start, right.end, left, right); + } + } + + @Override + public LeafFacetCutter createLeafCutter(LeafReaderContext context) throws IOException { + if (singleValues != null) { + LongValues values = singleValues.getValues(context, null); + return new OverlappingSingleValuedRangeLeafFacetCutter( + values, boundaries, pos, requestedRangeCount, root); + } else { + MultiLongValues values = valuesSource.getValues(context); + return new OverlappingMultivaluedRangeLeafFacetCutter( + values, boundaries, pos, requestedRangeCount, root); + } + } + + /** + * TODO: dedup OverlappingMultivaluedRangeLeafFacetCutter and + * OverlappingSingleValuedRangeLeafFacetCutter code - they are identical but they extend different + * base classes. 
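For the overlapping case above, a small worked example with hypothetical ranges: requested ranges [0..10] and [5..15] share the points 5..10, so the endpoint map yields elementary intervals that form a 1D Venn diagram, and the segment tree lets a single matching elementary interval roll up into every requested range that covers it.

// Hypothetical overlapping ranges (ordinals 0 and 1 in request order):
//   "a" = [0 .. 10]
//   "b" = [5 .. 15]
//
// Elementary intervals produced by buildElementaryIntervals above:
//   [Long.MIN_VALUE .. -1], [0 .. 4], [5 .. 10], [11 .. 15], [16 .. Long.MAX_VALUE]
//
// The segment-tree leaves are these intervals; range "a" is attached to the nodes covering
// [0 .. 4] and [5 .. 10], range "b" to [5 .. 10] and [11 .. 15]. A document value of 7 hits
// the [5 .. 10] leaf, and rollup reports both ordinals 0 and 1 for that document.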
+ */ + static class OverlappingMultivaluedRangeLeafFacetCutter + extends LongRangeMultivaluedLeafFacetCutter { + + private final LongRangeNode elementaryIntervalRoot; + + private int elementaryIntervalUpto; + + OverlappingMultivaluedRangeLeafFacetCutter( + MultiLongValues longValues, + long[] boundaries, + int[] pos, + int requestedRangeCount, + LongRangeNode elementaryIntervalRoot) { + super(longValues, boundaries, pos); + requestedIntervalTracker = new IntervalTracker.MultiIntervalTracker(requestedRangeCount); + this.elementaryIntervalRoot = elementaryIntervalRoot; + } + + @Override + void maybeRollUp(IntervalTracker rollUpInto) { + elementaryIntervalUpto = 0; + rollupMultiValued(elementaryIntervalRoot); + } + + private boolean rollupMultiValued(LongRangeNode node) { + boolean containedHit; + if (node.left != null) { + containedHit = rollupMultiValued(node.left); + containedHit |= rollupMultiValued(node.right); + } else { + // Leaf: + containedHit = elementaryIntervalTracker.get(elementaryIntervalUpto); + elementaryIntervalUpto++; + } + if (containedHit && node.outputs != null) { + for (IntCursor rangeIndex : node.outputs) { + requestedIntervalTracker.set(rangeIndex.value); + } + } + + return containedHit; + } + + @Override + public int nextOrd() throws IOException { + if (requestedIntervalTracker == null) { + return NO_MORE_ORDS; + } + return requestedIntervalTracker.nextOrd(); + } + } + + static class OverlappingSingleValuedRangeLeafFacetCutter + extends LongRangeSingleValuedLeafFacetCutter { + + private final LongRangeNode elementaryIntervalRoot; + + private int elementaryIntervalUpto; + + OverlappingSingleValuedRangeLeafFacetCutter( + LongValues longValues, + long[] boundaries, + int[] pos, + int requestedRangeCount, + LongRangeNode elementaryIntervalRoot) { + super(longValues, boundaries, pos); + requestedIntervalTracker = new IntervalTracker.MultiIntervalTracker(requestedRangeCount); + this.elementaryIntervalRoot = elementaryIntervalRoot; + } + + @Override + void maybeRollUp(IntervalTracker rollUpInto) { + // TODO: for single valued we can rollup after collecting all documents, e.g. in reduce + // method. Maybe we can use FacetCutter rollup methods to handle this case too? 
+ elementaryIntervalUpto = 0; + rollupSingleValued(elementaryIntervalRoot); + } + + // Note: combined rollUpSingleValued and rollUpMultiValued from OverlappingLongRangeCounter into + // 1 rollUp method + private boolean rollupSingleValued(LongRangeNode node) { + boolean containedHit; + if (node.left != null) { + containedHit = rollupSingleValued(node.left); + containedHit |= rollupSingleValued(node.right); + } else { + // Leaf: + containedHit = elementaryIntervalUpto == elementaryIntervalOrd; + elementaryIntervalUpto++; + } + if (containedHit && node.outputs != null) { + for (IntCursor rangeIndex : node.outputs) { + requestedIntervalTracker.set(rangeIndex.value); + } + } + + return containedHit; + } + + @Override + public int nextOrd() throws IOException { + if (requestedIntervalTracker == null) { + return NO_MORE_ORDS; + } + return requestedIntervalTracker.nextOrd(); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/package-info.java new file mode 100644 index 000000000000..7d76f6218f43 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/cutters/ranges/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Sandbox faceting: Range Faceting + * + * @lucene.experimental + */ +package org.apache.lucene.sandbox.facet.cutters.ranges; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/CandidateSetOrdinalIterator.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/CandidateSetOrdinalIterator.java new file mode 100644 index 000000000000..503530513c73 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/CandidateSetOrdinalIterator.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.facet.iterators; + +import java.io.IOException; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.sandbox.facet.labels.LabelToOrd; +import org.apache.lucene.sandbox.facet.recorders.FacetRecorder; + +/** + * {@link OrdinalIterator} that filters out ordinals from delegate if they are not in the candidate + * set. Can be handy to get results only for specific facets. + * + * @lucene.experimental + */ +public final class CandidateSetOrdinalIterator implements OrdinalIterator { + + private final OrdinalIterator candidateOrdinalIterator; + private final FacetRecorder facetRecorder; + + /** Constructor. */ + public CandidateSetOrdinalIterator( + FacetRecorder facetRecorder, FacetLabel[] candidateLabels, LabelToOrd labelToOrd) + throws IOException { + // TODO: if candidates size >> number of ordinals in facetRecorder, it is more efficient to + // iterate ordinals from FacetRecorder, and check if candidates contain them + if (facetRecorder.isEmpty()) { + // Getting ordinals for labels might be expensive, e.g. it requires reading index for taxonomy + // facets, so we make sure we don't do it for empty facet recorder. + this.candidateOrdinalIterator = OrdinalIterator.EMPTY; + } else { + this.candidateOrdinalIterator = + OrdinalIterator.fromArray(labelToOrd.getOrds(candidateLabels)); + } + this.facetRecorder = facetRecorder; + } + + @Override + public int nextOrd() throws IOException { + for (int ord = candidateOrdinalIterator.nextOrd(); + ord != NO_MORE_ORDS; + ord = candidateOrdinalIterator.nextOrd()) { + if (facetRecorder.contains(ord)) { + return ord; + } + } + return NO_MORE_ORDS; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/ComparableSupplier.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/ComparableSupplier.java new file mode 100644 index 000000000000..d31e25757705 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/ComparableSupplier.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.iterators; + +/** + * Generates {@link Comparable} for provided ordinal. For example, it can be used to find topN facet + * ordinals. + * + * @param something ordinals can be compared by. + * @lucene.experimental + */ +public interface ComparableSupplier> { + + /** + * For given ordinal, get something it can be compared by. + * + * @param ord ordinal. + * @param reuse object to reuse for building result. Must not be null. + */ + void reuseComparable(int ord, T reuse); + + /** + * For given ordinal, create something it can be compared by. + * + * @param ord ordinal. + * @return Comparable. 
+ */ + T createComparable(int ord); +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/OrdinalIterator.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/OrdinalIterator.java new file mode 100644 index 000000000000..c5e0bfc28226 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/OrdinalIterator.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.iterators; + +import java.io.IOException; +import org.apache.lucene.internal.hppc.IntArrayList; + +/** + * Iterate over ordinals. + * + * @lucene.experimental + */ +public interface OrdinalIterator { + + /** This const is returned by nextOrd when there are no more ordinals. */ + int NO_MORE_ORDS = -1; + + /** Returns next ord or {@link #NO_MORE_ORDS}. * */ + int nextOrd() throws IOException; + + /** + * Convert to int array. Note that after this method is called original OrdinalIterator is + * exhausted. + */ + default int[] toArray() throws IOException { + IntArrayList resultList = new IntArrayList(); + for (int ord = this.nextOrd(); ord != NO_MORE_ORDS; ord = this.nextOrd()) { + resultList.add(ord); + } + return resultList.toArray(); + } + + /** Convert int array to ordinal iterator. */ + static OrdinalIterator fromArray(int[] source) { + return new OrdinalIterator() { + int cursor; + + @Override + public int nextOrd() throws IOException { + int ord; + while (cursor < source.length) { + ord = source[cursor++]; + // NO_MORE_ORDS should be returned only after we read the entire array. + if (ord != NO_MORE_ORDS) { + return ord; + } + } + return NO_MORE_ORDS; + } + }; + } + + /** Return empty ordinal iterator */ + OrdinalIterator EMPTY = () -> NO_MORE_ORDS; +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/TaxonomyChildrenOrdinalIterator.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/TaxonomyChildrenOrdinalIterator.java new file mode 100644 index 000000000000..84ab5875467f --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/TaxonomyChildrenOrdinalIterator.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
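To make the ComparableSupplier contract concrete, here is a minimal sketch (not part of the patch; the class and field names are invented) of a supplier that orders ordinals by the count recorded in a CountFacetRecorder, which is added later in this change. Fed into TopnOrdinalIterator below, it makes the most frequent ordinals come out first.

import org.apache.lucene.sandbox.facet.iterators.ComparableSupplier;
import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder;

final class ByCountSupplier implements ComparableSupplier<ByCountSupplier.OrdCount> {

  /** Mutable holder so the top-n queue can reuse one instance per insertion. */
  static final class OrdCount implements Comparable<OrdCount> {
    int count;

    @Override
    public int compareTo(OrdCount other) {
      // Larger counts compare as greater and therefore survive in the top-n queue.
      return Integer.compare(count, other.count);
    }
  }

  private final CountFacetRecorder recorder;

  ByCountSupplier(CountFacetRecorder recorder) {
    this.recorder = recorder;
  }

  @Override
  public void reuseComparable(int ord, OrdCount reuse) {
    reuse.count = recorder.getCount(ord);
  }

  @Override
  public OrdCount createComparable(int ord) {
    OrdCount result = new OrdCount();
    result.count = recorder.getCount(ord);
    return result;
  }
}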
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.iterators; + +import java.io.IOException; +import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays; +import org.apache.lucene.sandbox.facet.labels.LabelToOrd; + +/** + * Facets results selector to get children for selected parent. Source ordinals order is preserved. + * + * @lucene.experimental + */ +public final class TaxonomyChildrenOrdinalIterator implements OrdinalIterator { + + // TODO: do we want to have something like ChainOrdinalIterators to chain multiple iterators? + // Or are we fine with chaining them manually every time? + private final OrdinalIterator sourceOrds; + private final ParallelTaxonomyArrays.IntArray parents; + private final int parentOrd; + + /** Create */ + public TaxonomyChildrenOrdinalIterator( + OrdinalIterator sourceOrds, ParallelTaxonomyArrays.IntArray parents, int parentOrd) { + this.sourceOrds = sourceOrds; + this.parents = parents; + assert parentOrd != LabelToOrd.INVALID_ORD : "Parent Ordinal is not valid"; + this.parentOrd = parentOrd; + } + + @Override + public int nextOrd() throws IOException { + // TODO: in some cases it might be faster to traverse children of selected parent + // (children/siblings IntArrays) and check if source ords contain them. We can think of some + // heuristics to decide which approach to use on case by case basis? There is similar comment in + // TaxonomyFacets#getTopChildrenForPath + for (int ord = sourceOrds.nextOrd(); ord != NO_MORE_ORDS; ord = sourceOrds.nextOrd()) { + if (parents.get(ord) == parentOrd) { + return ord; + } + } + return NO_MORE_ORDS; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/TopnOrdinalIterator.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/TopnOrdinalIterator.java new file mode 100644 index 000000000000..684afec5f5de --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/TopnOrdinalIterator.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.iterators; + +import java.io.IOException; +import org.apache.lucene.util.PriorityQueue; + +/** + * Class that consumes incoming ordinals, sorts them by provided Comparable, and returns first top N + * ordinals only. 
+ * + * @lucene.experimental + */ +public final class TopnOrdinalIterator> implements OrdinalIterator { + + private final ComparableSupplier comparableSupplier; + private final OrdinalIterator sourceOrds; + private final int topN; + private int[] result; + private int currentIndex; + + /** Constructor. */ + public TopnOrdinalIterator( + OrdinalIterator sourceOrds, ComparableSupplier comparableSupplier, int topN) { + if (topN <= 0) { + throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")"); + } + this.sourceOrds = sourceOrds; + this.comparableSupplier = comparableSupplier; + this.topN = topN; + } + + private void getTopN() throws IOException { + assert result == null; + // TODO: current taxonomy implementations limit queue size by taxo reader size too, but it + // probably doesn't make sense for large enough taxonomy indexes? + // e.g. TopOrdAndIntQueue q = new TopComparableQueue(Math.min(taxoReader.getSize(), topN)); + // TODO: create queue lazily - skip if first nextOrd is NO_MORE_ORDS ? + TopComparableQueue queue = new TopComparableQueue<>(topN); + OrdComparablePair reuse = null; + for (int ord = sourceOrds.nextOrd(); ord != NO_MORE_ORDS; ord = sourceOrds.nextOrd()) { + if (reuse == null) { + reuse = new OrdComparablePair<>(ord, comparableSupplier.createComparable(ord)); + } else { + reuse.ordinal = ord; + comparableSupplier.reuseComparable(ord, reuse.comparable); + } + reuse = queue.insertWithOverflow(reuse); + } + // Now we need to read from the queue as well as the queue gives the least element, not the top. + result = new int[queue.size()]; + for (int i = result.length - 1; i >= 0; i--) { + result[i] = queue.pop().ordinal; + } + currentIndex = 0; + } + + @Override + public int nextOrd() throws IOException { + if (result == null) { + getTopN(); + } + assert result != null; + if (currentIndex >= result.length) { + return NO_MORE_ORDS; + } + return result[currentIndex++]; + } + + /** Keeps top N results ordered by Comparable. */ + private static class TopComparableQueue> + extends PriorityQueue> { + + /** Sole constructor. */ + public TopComparableQueue(int topN) { + super(topN); + } + + @Override + protected boolean lessThan(OrdComparablePair a, OrdComparablePair b) { + return a.lessThan(b); + } + } + + /** Pair of ordinal and comparable to use in TopComparableQueue */ + private static class OrdComparablePair> { + int ordinal; + T comparable; + + private OrdComparablePair(int ordinal, T comparable) { + this.ordinal = ordinal; + this.comparable = comparable; + } + + boolean lessThan(OrdComparablePair other) { + return comparable.compareTo(other.comparable) < 0; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/package-info.java new file mode 100644 index 000000000000..c80222fedfc1 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/iterators/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
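Putting the iterators together, a minimal sketch (not part of the patch) of selecting the top-n children of one taxonomy dimension and resolving their labels; it assumes the recorder has already been reduced, and the ComparableSupplier argument stands for any ordering, for example one backed by recorded counts as sketched earlier.

// Assumed to live in org.apache.lucene.sandbox.facet.iterators, so sibling iterator
// classes need no imports.
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.sandbox.facet.labels.OrdToLabel;
import org.apache.lucene.sandbox.facet.recorders.FacetRecorder;

final class TopChildrenSketch {
  static <T extends Comparable<T>> FacetLabel[] topChildren(
      TaxonomyReader taxoReader,
      FacetRecorder recorder, // must already be reduced
      ComparableSupplier<T> order,
      OrdToLabel ordToLabel,
      String dimension,
      int n)
      throws IOException {
    ParallelTaxonomyArrays.IntArray parents = taxoReader.getParallelTaxonomyArrays().parents();
    int dimOrd = taxoReader.getOrdinal(new FacetLabel(dimension));
    // Keep only collected ordinals that are direct children of the dimension ordinal:
    OrdinalIterator children =
        new TaxonomyChildrenOrdinalIterator(recorder.recordedOrds(), parents, dimOrd);
    // Order them by the supplied comparable and keep the first n:
    OrdinalIterator top = new TopnOrdinalIterator<>(children, order, n);
    return ordToLabel.getLabels(top.toArray());
  }
}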
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Sandbox faceting: facet ordinals. + * + * @lucene.experimental + */ +package org.apache.lucene.sandbox.facet.iterators; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/LabelToOrd.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/LabelToOrd.java new file mode 100644 index 000000000000..7bc537429d3d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/LabelToOrd.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.labels; + +import java.io.IOException; +import org.apache.lucene.facet.taxonomy.FacetLabel; + +/** + * Label to ord mapping interface. + * + *

    TODO: move FacetLabel out of taxonomy folder to use it for any facets, not just taxonomy? + * + *

    TODO: there is some overlap with {@link + * org.apache.lucene.facet.taxonomy.writercache.LabelToOrdinal}, can we reuse something? + * + * @lucene.experimental + */ +public interface LabelToOrd { + + /** + * Ordinal to return if facet label doesn't exist in {@link #getOrd(FacetLabel)} and {@link + * #getOrds(FacetLabel[])} + */ + int INVALID_ORD = -1; + + /** get ord for one label */ + int getOrd(FacetLabel label) throws IOException; + + /** get ords for multiple labels */ + int[] getOrds(FacetLabel[] labels) throws IOException; +} diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/RandomAccessQuantizedByteVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/OrdToLabel.java similarity index 61% rename from lucene/core/src/java/org/apache/lucene/util/quantization/RandomAccessQuantizedByteVectorValues.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/OrdToLabel.java index b86009a690e1..88e598444255 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/RandomAccessQuantizedByteVectorValues.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/OrdToLabel.java @@ -14,23 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.util.quantization; +package org.apache.lucene.sandbox.facet.labels; import java.io.IOException; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.apache.lucene.facet.taxonomy.FacetLabel; /** - * Random access values for byte[], but also includes accessing the score correction - * constant for the current vector in the buffer. + * Ordinal to label mapping interface. + * + *

    TODO: move FacetLabel out of taxonomy folder to use it for any facets, not just taxonomy? * * @lucene.experimental */ -public interface RandomAccessQuantizedByteVectorValues extends RandomAccessVectorValues.Bytes { - - ScalarQuantizer getScalarQuantizer(); - - float getScoreCorrectionConstant(int vectorOrd) throws IOException; +public interface OrdToLabel { + /** get label of one ord TODO: what do we return when ordinal is not valid? null? */ + FacetLabel getLabel(int ordinal) throws IOException; - @Override - RandomAccessQuantizedByteVectorValues copy() throws IOException; + /** get labels for multiple ords */ + FacetLabel[] getLabels(int[] ordinals) throws IOException; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/RangeOrdToLabel.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/RangeOrdToLabel.java new file mode 100644 index 000000000000..3c0da42fbf4c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/RangeOrdToLabel.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.labels; + +import org.apache.lucene.facet.range.Range; +import org.apache.lucene.facet.taxonomy.FacetLabel; + +/** + * {@link OrdToLabel} for ranges. + * + * @lucene.experimental + */ +public class RangeOrdToLabel implements OrdToLabel { + + private final Range[] ranges; + + /** Constructor that takes array of Range objects as input */ + public RangeOrdToLabel(Range[] inputRanges) { + ranges = inputRanges; + } + + @Override + public FacetLabel getLabel(int ordinal) { + if (ordinal >= 0 && ordinal < ranges.length) { + return new FacetLabel(ranges[ordinal].label); + } + return null; + } + + @Override + public FacetLabel[] getLabels(int[] ordinals) { + FacetLabel[] facetLabels = new FacetLabel[ordinals.length]; + for (int i = 0; i < ordinals.length; i++) { + facetLabels[i] = getLabel(ordinals[i]); + } + return facetLabels; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/TaxonomyOrdLabelBiMap.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/TaxonomyOrdLabelBiMap.java new file mode 100644 index 000000000000..e57df7cc139f --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/TaxonomyOrdLabelBiMap.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.labels; + +import java.io.IOException; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Map taxonomy labels to ordinals. + * + * @lucene.experimental + */ +public final class TaxonomyOrdLabelBiMap implements OrdToLabel, LabelToOrd { + + private final TaxonomyReader taxoReader; + + /** Construct */ + public TaxonomyOrdLabelBiMap(TaxonomyReader taxoReader) { + this.taxoReader = taxoReader; + } + + @Override + public FacetLabel getLabel(int ordinal) throws IOException { + return taxoReader.getPath(ordinal); + } + + @Override + public FacetLabel[] getLabels(int[] ordinals) throws IOException { + return taxoReader.getBulkPath( + ordinals.clone()); // Have to clone because getBulkPath shuffles its input array. + } + + @Override + public int getOrd(FacetLabel label) throws IOException { + return taxoReader.getOrdinal(label); + } + + @Override + public int[] getOrds(FacetLabel[] labels) throws IOException { + return taxoReader.getBulkOrdinals(labels); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/package-info.java new file mode 100644 index 000000000000..8d4c1c725f0f --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/labels/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Sandbox faceting: facet labels, see {@link org.apache.lucene.sandbox.facet.labels.OrdToLabel} for + * details. + * + * @lucene.experimental + */ +package org.apache.lucene.sandbox.facet.labels; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/package-info.java new file mode 100644 index 000000000000..f0e116bf68c1 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/package-info.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
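A short usage sketch (not part of the patch) tying the label mappers to CandidateSetOrdinalIterator above: TaxonomyOrdLabelBiMap turns candidate labels into ordinals for filtering and turns the surviving ordinals back into labels for display. CountFacetRecorder and its getCount method are added later in this change; the label values are made up.

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.sandbox.facet.iterators.CandidateSetOrdinalIterator;
import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator;
import org.apache.lucene.sandbox.facet.labels.TaxonomyOrdLabelBiMap;
import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder;

final class CandidateCountsSketch {
  static void printCandidateCounts(TaxonomyReader taxoReader, CountFacetRecorder recorder)
      throws IOException {
    TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader);
    FacetLabel[] candidates = {
      new FacetLabel("Author", "Lisa"), new FacetLabel("Author", "Bob")
    };
    // Only ordinals that are both in the candidate set and present in the recorder survive:
    OrdinalIterator filtered = new CandidateSetOrdinalIterator(recorder, candidates, ordLabels);
    for (int ord = filtered.nextOrd();
        ord != OrdinalIterator.NO_MORE_ORDS;
        ord = filtered.nextOrd()) {
      System.out.println(ordLabels.getLabel(ord) + " -> " + recorder.getCount(ord));
    }
  }
}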
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Sandbox faceting - Collectors that compute facets. Facet Ordinals/Ids: Each doc may have + * different facets and therefore, different facet ordinals. For example a book can have Author, + * Publish Date, Page Count etc. as facets. The specific value for each of these Facets for a book + * can be mapped to an ordinal. Facet ordinals may be common across different book documents. + * FacetCutter: Can interpret Facets of a specific type for a doc type and output all the Facet + * Ordinals for the type for the doc. Facet Recorders: record data per ordinal. Some recorders may + * compute aggregations and record per ordinal data aggregated across an index. + * + *

    See SandboxFacetsExample for examples. + * + * @lucene.experimental + */ +package org.apache.lucene.sandbox.facet; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/CountFacetRecorder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/CountFacetRecorder.java new file mode 100644 index 000000000000..0fab6dfb663c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/CountFacetRecorder.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.recorders; + +import static org.apache.lucene.sandbox.facet.iterators.OrdinalIterator.NO_MORE_ORDS; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.internal.hppc.IntCursor; +import org.apache.lucene.internal.hppc.IntIntHashMap; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; + +/** + * {@link FacetRecorder} to count facets. + * + *

    TODO: add an option to keep counts in an array, to improve performance for facets with small + * number of ordinals e.g. range facets. Options: - {@link LeafFacetCutter} can inform {@link + * LeafFacetRecorder} about expected number of facet ordinals ({@link + * org.apache.lucene.sandbox.facet.FacetFieldCollector} can orchestrate that). If expected facet ord + * number is below some threshold - use array instead of a map? - first 100/1k counts in array, the + * rest - in a map; the limit can also be provided in a constructor? It is similar to what + * LongValuesFacetCounts does today. + * + *

    TODO: We can also consider collecting 2 (3, 4, ..., can be parameterized) slices to a single + * sync map which can reduce thread contention compared to a single sync map for all slices; at the + * same time there will be less work for the reduce method. So far reduce wasn't a bottleneck for us, + * but it is definitely not free. + * + *

    TODO: If we come back to some form of synchronized count maps, we should be more careful what + * we acquire locks for - we used to lock addTo method itself, but it could be faster if we only + * synchronized after computing the key's hash; or we can lock the entire map only if we need to + * insert key, and lock single key otherwise? + * + * @lucene.experimental + */ +public final class CountFacetRecorder implements FacetRecorder { + private IntIntHashMap values; + private final List<IntIntHashMap> perLeafValues; + + /** Create. */ + public CountFacetRecorder() { + // Has to be synchronizedList as we have one recorder per all slices. + perLeafValues = Collections.synchronizedList(new ArrayList<>()); + } + + /** Get count for provided ordinal. */ + public int getCount(int ord) { + return values.get(ord); + } + + @Override + public LeafFacetRecorder getLeafRecorder(LeafReaderContext context) { + // TODO: we are planning to do some experiments with how hash maps are assigned to leaf or slice + // recorders, see other TODOs in this class. When we make the decision, we can collect + // leaf/slice recorders themselves, not the hashmaps? + IntIntHashMap leafValues = new IntIntHashMap(); + perLeafValues.add(leafValues); + return new CountLeafFacetRecorder(leafValues); + } + + @Override + public OrdinalIterator recordedOrds() { + // TODO: even if this is called before collection started, we want it to use results from the + // time when nextOrd is first called. Does ordIterator work like that? I've run some tests that + // confirmed expected behavior, but I'm not sure IntIntMap guarantees that. We should at least + // add a unit test to make sure it always works that way. + Iterator<IntCursor> ordIterator = values.keys().iterator(); + return new OrdinalIterator() { + @Override + public int nextOrd() { + if (ordIterator.hasNext()) { + return ordIterator.next().value; + } else { + return NO_MORE_ORDS; + } + } + }; + } + + @Override + public boolean isEmpty() { + return values.isEmpty(); + } + + @Override + public void reduce(FacetCutter facetCutter) throws IOException { + boolean firstElement = true; + for (IntIntHashMap leafRecords : perLeafValues) { + if (firstElement) { + values = leafRecords; + firstElement = false; + } else { + for (IntIntHashMap.IntIntCursor elem : leafRecords) { + values.addTo(elem.key, elem.value); + } + } + } + if (firstElement) { + // TODO: do we need empty map by default?
+ values = new IntIntHashMap(); + } + + OrdinalIterator dimOrds = facetCutter.getOrdinalsToRollup(); + if (dimOrds != null) { + for (int dimOrd = dimOrds.nextOrd(); dimOrd != NO_MORE_ORDS; dimOrd = dimOrds.nextOrd()) { + int rolledUp = rollup(dimOrd, facetCutter); + if (rolledUp > 0) { + values.addTo(dimOrd, rolledUp); + } + } + } + } + + @Override + public boolean contains(int ordinal) { + return values.containsKey(ordinal); + } + + private int rollup(int ord, FacetCutter facetCutter) throws IOException { + OrdinalIterator childOrds = facetCutter.getChildrenOrds(ord); + int accum = 0; + for (int nextChild = childOrds.nextOrd(); + nextChild != NO_MORE_ORDS; + nextChild = childOrds.nextOrd()) { + int rolledUp = rollup(nextChild, facetCutter); + // Don't rollup zeros to not add ordinals that we don't actually have counts for to the map + if (rolledUp > 0) { + accum += values.addTo(nextChild, rolledUp); + } else { + accum += values.get(nextChild); + } + } + return accum; + } + + private static class CountLeafFacetRecorder implements LeafFacetRecorder { + + private final IntIntHashMap values; + + public CountLeafFacetRecorder(IntIntHashMap values) { + this.values = values; + } + + @Override + public void record(int docId, int facetOrd) { + this.values.addTo(facetOrd, 1); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/FacetRecorder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/FacetRecorder.java new file mode 100644 index 000000000000..f86fe7bf8dec --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/FacetRecorder.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.recorders; + +import java.io.IOException; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; + +/** + * Record data for each facet of each doc. + * + *
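For orientation, a hand-written sketch (not part of the patch) of the recorder lifecycle that the facet collector normally drives: per-leaf recording, a single reduce, then reading results. It assumes LeafFacetCutter exposes advanceExact(doc) plus the inherited nextOrd(), as elsewhere in this PR, and it visits every document of every segment instead of only the documents matching a query.

// Assumed to live in org.apache.lucene.sandbox.facet.recorders, so the recorder
// classes need no imports.
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.sandbox.facet.cutters.FacetCutter;
import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter;
import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator;

final class RecorderLifecycleSketch {
  static CountFacetRecorder countAll(FacetCutter cutter, IndexReader reader) throws IOException {
    CountFacetRecorder recorder = new CountFacetRecorder();
    for (LeafReaderContext context : reader.leaves()) {
      LeafFacetCutter leafCutter = cutter.createLeafCutter(context);
      LeafFacetRecorder leafRecorder = recorder.getLeafRecorder(context);
      for (int doc = 0; doc < context.reader().maxDoc(); doc++) {
        if (leafCutter.advanceExact(doc)) { // assumed LeafFacetCutter API
          for (int ord = leafCutter.nextOrd();
              ord != OrdinalIterator.NO_MORE_ORDS;
              ord = leafCutter.nextOrd()) {
            leafRecorder.record(doc, ord);
          }
        }
      }
    }
    // Merge the per-leaf maps and roll up hierarchical ordinals if the cutter asks for it:
    recorder.reduce(cutter);
    return recorder;
  }
}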

    TODO: In the next iteration we can add an extra layer between FacetRecorder and + * LeafFacetRecorder, e.g. SliceFacetRecorder. The new layer will be created per {@link + * org.apache.lucene.search.Collector}, which means that collecting multiple leaves (segments) + * within a slice is sequential and can be done into a single non-sync map to improve performance and + * reduce memory consumption. We already tried that, but didn't see any performance improvement. + * Given that it also makes lazy leaf recorder init in {@link + * org.apache.lucene.sandbox.facet.FacetFieldCollector} trickier, it was decided to roll back the + * initial attempt and try again later, in the next iteration. + * + * @lucene.experimental + */ +public interface FacetRecorder { + /** Get leaf recorder. */ + LeafFacetRecorder getLeafRecorder(LeafReaderContext context) throws IOException; + + /** Return an iterator over collected ordinals; {@link LeafFacetCutter#NO_MORE_ORDS} signals the end. */ + OrdinalIterator recordedOrds(); + + /** True if there are no records */ + boolean isEmpty(); + + /** + * Reduce leaf recorder results into this recorder. If {@link FacetCutter#getOrdinalsToRollup()} + * result is not null, it also rolls up values. + * + *

    After this method is called, it's illegal to add values to recorder, i.e. calling {@link + * #getLeafRecorder} or {@link LeafFacetRecorder#record} on its leaf recorders. + * + * @throws UnsupportedOperationException if {@link FacetCutter#getOrdinalsToRollup()} returns not + * null but this recorder doesn't support rollup. + */ + void reduce(FacetCutter facetCutter) throws IOException; + + /** Check if any data was recorded for provided facet ordinal. */ + boolean contains(int ordinal); +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/LeafFacetRecorder.java similarity index 60% rename from lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/LeafFacetRecorder.java index eac514a7bb88..eff0f01c8ff5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/LeafFacetRecorder.java @@ -14,24 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.bloom; +package org.apache.lucene.sandbox.facet.recorders; -import org.apache.lucene.util.BytesRef; +import java.io.IOException; /** - * Base class for hashing functions that can be referred to by name. Subclasses are expected to - * provide threadsafe implementations of the hash function on the range of bytes referenced in the - * provided {@link BytesRef} + * Record data for each facet of each doc of a leaf (segment). * * @lucene.experimental */ -public abstract class HashFunction { +public interface LeafFacetRecorder { /** - * Hashes the contents of the referenced bytes + * TODO: Rename: collect? accumulate? * - * @param bytes the data to be hashed - * @return the hash of the bytes referenced by bytes.offset and length bytes.length + * @param docId document ID + * @param facetOrd facet ordinal */ - public abstract long hash(BytesRef bytes); + void record(int docId, int facetOrd) throws IOException; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/LongAggregationsFacetRecorder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/LongAggregationsFacetRecorder.java new file mode 100644 index 000000000000..c7f475b4fdbd --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/LongAggregationsFacetRecorder.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.facet.recorders; + +import static org.apache.lucene.sandbox.facet.iterators.OrdinalIterator.NO_MORE_ORDS; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.internal.hppc.IntCursor; +import org.apache.lucene.internal.hppc.IntObjectHashMap; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; +import org.apache.lucene.search.LongValues; +import org.apache.lucene.search.LongValuesSource; + +/** + * {@link FacetRecorder} that computes multiple long aggregations per facet. + * + *

    TODO: [premature optimization idea] if instead of one array we keep aggregations in two + * LongVector (one for MAX aggregation and one for SUM) we can benefit from SIMD? + * + * @lucene.experimental + */ +public final class LongAggregationsFacetRecorder implements FacetRecorder { + + private IntObjectHashMap values; + private final List> leafValues; + + private final LongValuesSource[] longValuesSources; + private final Reducer[] reducers; + + /** Constructor. */ + public LongAggregationsFacetRecorder(LongValuesSource[] longValuesSources, Reducer[] reducers) { + assert longValuesSources.length == reducers.length; + this.longValuesSources = longValuesSources; + this.reducers = reducers; + leafValues = Collections.synchronizedList(new ArrayList<>()); + } + + @Override + public LeafFacetRecorder getLeafRecorder(LeafReaderContext context) throws IOException { + LongValues[] longValues = new LongValues[longValuesSources.length]; + for (int i = 0; i < longValuesSources.length; i++) { + longValues[i] = longValuesSources[i].getValues(context, null); + } + IntObjectHashMap valuesRecorder = new IntObjectHashMap<>(); + leafValues.add(valuesRecorder); + return new LongAggregationsLeafFacetRecorder(longValues, reducers, valuesRecorder); + } + + @Override + public OrdinalIterator recordedOrds() { + Iterator ordIterator = values.keys().iterator(); + return new OrdinalIterator() { + @Override + public int nextOrd() throws IOException { + if (ordIterator.hasNext()) { + return ordIterator.next().value; + } else { + return NO_MORE_ORDS; + } + } + }; + } + + @Override + public boolean isEmpty() { + return values.isEmpty(); + } + + @Override + public void reduce(FacetCutter facetCutter) throws IOException { + boolean firstElement = true; + for (IntObjectHashMap leafValue : leafValues) { + if (firstElement) { + values = leafValue; + firstElement = false; + } else { + for (IntObjectHashMap.IntObjectCursor elem : leafValue) { + long[] vals = values.get(elem.key); + if (vals == null) { + values.put(elem.key, elem.value); + } else { + for (int i = 0; i < longValuesSources.length; i++) { + vals[i] = reducers[i].reduce(vals[i], elem.value[i]); + } + } + } + } + } + if (firstElement) { + // TODO: do we need empty map by default? + values = new IntObjectHashMap<>(); + } + + OrdinalIterator dimOrds = facetCutter.getOrdinalsToRollup(); + if (dimOrds != null) { + for (int dimOrd = dimOrds.nextOrd(); dimOrd != NO_MORE_ORDS; dimOrd = dimOrds.nextOrd()) { + rollup(values.get(dimOrd), dimOrd, facetCutter); + } + } + } + + @Override + public boolean contains(int ordinal) { + return values.containsKey(ordinal); + } + + /** + * Rollup all child values of ord to accum, and return accum. Accum param can be null. In this + * case, if recursive rollup for every child returns null, this method returns null. Otherwise, + * accum is initialized. 
+ */ + private long[] rollup(long[] accum, int ord, FacetCutter facetCutter) throws IOException { + OrdinalIterator childOrds = facetCutter.getChildrenOrds(ord); + for (int nextChild = childOrds.nextOrd(); + nextChild != NO_MORE_ORDS; + nextChild = childOrds.nextOrd()) { + long[] current = rollup(values.get(nextChild), nextChild, facetCutter); + if (current != null) { + if (accum == null) { + accum = new long[longValuesSources.length]; + values.put(ord, accum); + } + for (int i = 0; i < longValuesSources.length; i++) { + accum[i] = reducers[i].reduce(accum[i], current[i]); + } + } + } + return accum; + } + + /** Return aggregated value for facet ordinal and aggregation ID, or zero as default. */ + public long getRecordedValue(int ord, int valuesId) { + if (valuesId < 0 || valuesId >= longValuesSources.length) { + throw new IllegalArgumentException("Invalid request for ordinal values"); + } + long[] valuesForOrd = values.get(ord); + if (valuesForOrd != null) { + return valuesForOrd[valuesId]; + } + // There are a few options what we can return here e.g. throw an exception, return hardcoded or + // provided default value. It might be better API to do that instead of returning zero, but + // there are two reasons why I think returning zero is the right compromise: + // 1) recorder result is a map-like structure, and maps in java usually return default value + // e.g. null or 0 rather than throw an exception when a key is missing. + // 2) Handling correctly all missing value cases might be expensive, e.g. what if only one + // aggregation for selected facet ordinal is missing, i.e. no docs that belong to this facet + // ordinal have a value to aggregate? To handle that we would have to maintain missing values + // during collection instead of using default array value - zero. I believe it is excessive and + // most users are not going to use it anyway. Worst case scenario, we can add another public get + // method that handles missing values later. + return 0; + } + + private static class LongAggregationsLeafFacetRecorder implements LeafFacetRecorder { + + private final LongValues[] longValues; + + private final Reducer[] reducers; + private final IntObjectHashMap perOrdinalValues; + + LongAggregationsLeafFacetRecorder( + LongValues[] longValues, Reducer[] reducers, IntObjectHashMap perOrdinalValues) { + this.longValues = longValues; + this.reducers = reducers; + this.perOrdinalValues = perOrdinalValues; + } + + @Override + public void record(int docId, int facetOrd) throws IOException { + long[] valuesForOrd = perOrdinalValues.get(facetOrd); + if (valuesForOrd == null) { + valuesForOrd = new long[longValues.length]; + perOrdinalValues.put(facetOrd, valuesForOrd); + } + + LongValues values; + for (int i = 0; i < longValues.length; i++) { + // TODO: cache advance/longValue results for current doc? Skipped for now as LongValues + // themselves can keep the cache. 
+ values = longValues[i]; + if (values.advanceExact(docId)) { + valuesForOrd[i] = reducers[i].reduce(valuesForOrd[i], values.longValue()); + } + } + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/MultiFacetsRecorder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/MultiFacetsRecorder.java new file mode 100644 index 000000000000..db336e382078 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/MultiFacetsRecorder.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.recorders; + +import java.io.IOException; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; + +/** + * {@link FacetRecorder} that contains multiple FacetRecorders. + * + * @lucene.experimental + */ +public final class MultiFacetsRecorder implements FacetRecorder { + + private final FacetRecorder[] delegates; + + /** Constructor */ + public MultiFacetsRecorder(FacetRecorder... 
delegates) { + this.delegates = delegates; + } + + @Override + public LeafFacetRecorder getLeafRecorder(LeafReaderContext context) throws IOException { + LeafFacetRecorder[] leafDelegates = new LeafFacetRecorder[delegates.length]; + for (int i = 0; i < delegates.length; i++) { + leafDelegates[i] = delegates[i].getLeafRecorder(context); + } + return new MultiFacetsLeafRecorder(leafDelegates); + } + + @Override + public OrdinalIterator recordedOrds() { + throw new UnsupportedOperationException( + "Not supported, call recordedOrds for sub-recorders instead"); + } + + @Override + public boolean isEmpty() { + throw new UnsupportedOperationException( + "Not supported, call isEmpty for sub-recorders instead"); + } + + @Override + public void reduce(FacetCutter facetCutter) throws IOException { + for (FacetRecorder recorder : delegates) { + recorder.reduce(facetCutter); + } + } + + @Override + public boolean contains(int ordinal) { + throw new UnsupportedOperationException( + "Not supported, call contains for sub-recorders instead"); + } + + private static final class MultiFacetsLeafRecorder implements LeafFacetRecorder { + + private final LeafFacetRecorder[] delegates; + + private MultiFacetsLeafRecorder(LeafFacetRecorder[] delegates) { + this.delegates = delegates; + } + + @Override + public void record(int docId, int facetOrd) throws IOException { + for (LeafFacetRecorder leafRecorder : delegates) { + leafRecorder.record(docId, facetOrd); + } + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/Reducer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/Reducer.java new file mode 100644 index 000000000000..b10e20dce186 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/Reducer.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.recorders; + +/** + * Reducer for numeric values. + * + * @lucene.experimental + */ +public interface Reducer { + + /** Int values reducer. */ + int reduce(int a, int b); + + /** Long values reducer. */ + long reduce(long a, long b); + + /** Float values reducer. */ + float reduce(float a, float b); + + /** Double values reducer. */ + double reduce(double a, double b); + + /** Reducer that returns MAX of two values. */ + Reducer MAX = + new Reducer() { + @Override + public int reduce(int a, int b) { + return Math.max(a, b); + } + + @Override + public long reduce(long a, long b) { + return Math.max(a, b); + } + + @Override + public float reduce(float a, float b) { + return Math.max(a, b); + } + + @Override + public double reduce(double a, double b) { + return Math.max(a, b); + } + }; + + /** Reducer that returns SUM of two values. 
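As a usage sketch (not part of the patch): wiring the SUM and MAX reducers into a LongAggregationsFacetRecorder next to a plain count recorder. The field names and the someOrd placeholder are assumptions, and any LongValuesSource works.

// Assumed to live in org.apache.lucene.sandbox.facet.recorders, so the recorder and
// Reducer classes need no imports.
import org.apache.lucene.search.LongValuesSource;

final class AggregationsSketch {
  static void example(int someOrd) {
    LongValuesSource[] sources = {
      LongValuesSource.fromLongField("units"), // aggregation id 0
      LongValuesSource.fromLongField("popularity") // aggregation id 1
    };
    Reducer[] reducers = {Reducer.SUM, Reducer.MAX};
    LongAggregationsFacetRecorder aggregations =
        new LongAggregationsFacetRecorder(sources, reducers);

    // One MultiFacetsRecorder lets a single collection pass feed counts and aggregations:
    CountFacetRecorder counts = new CountFacetRecorder();
    MultiFacetsRecorder recorder = new MultiFacetsRecorder(counts, aggregations);

    // ... collect, call recorder.reduce(cutter), then read per-ordinal values back:
    long unitsSum = aggregations.getRecordedValue(someOrd, 0); // Reducer.SUM of "units"
    long maxPopularity = aggregations.getRecordedValue(someOrd, 1); // Reducer.MAX of "popularity"
    System.out.println(unitsSum + " / " + maxPopularity);
  }
}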
*/ + Reducer SUM = + new Reducer() { + @Override + public int reduce(int a, int b) { + return Math.addExact(a, b); + } + + @Override + public long reduce(long a, long b) { + return Math.addExact(a, b); + } + + @Override + public float reduce(float a, float b) { + return a + b; + } + + @Override + public double reduce(double a, double b) { + return a + b; + } + }; +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/package-info.java new file mode 100644 index 000000000000..26c35d50575d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/recorders/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Sandbox faceting: classes that can record per ordinal data E.g. aggregations per facet ordinals + * can be recorded. + * + * @lucene.experimental + */ +package org.apache.lucene.sandbox.facet.recorders; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 35b9e8dc78d7..8f21c8e1850d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -45,7 +45,6 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Matches; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; @@ -147,28 +146,7 @@ public CombinedFieldQuery build() { } } - static class FieldAndWeight { - final String field; - final float weight; - - FieldAndWeight(String field, float weight) { - this.field = field; - this.weight = weight; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - FieldAndWeight that = (FieldAndWeight) o; - return Float.compare(that.weight, weight) == 0 && Objects.equals(field, that.field); - } - - @Override - public int hashCode() { - return Objects.hash(field, weight); - } - } + record FieldAndWeight(String field, float weight) {} // sorted map for fields. 
private final TreeMap fieldAndWeights; @@ -423,14 +401,12 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti MultiNormsLeafSimScorer scoringSimScorer = new MultiNormsLeafSimScorer(simWeight, context.reader(), fieldAndWeights.values(), true); - LeafSimScorer nonScoringSimScorer = - new LeafSimScorer(simWeight, context.reader(), "pseudo_field", false); // we use termscorers + disjunction as an impl detail DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); for (int i = 0; i < iterators.size(); i++) { float weight = fields.get(i).weight; queue.add( - new WeightedDisiWrapper(new TermScorer(iterators.get(i), nonScoringSimScorer), weight)); + new WeightedDisiWrapper(new TermScorer(iterators.get(i), simWeight, null), weight)); } // Even though it is called approximation, it is accurate since none of // the sub iterators are two-phase iterators. diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LargeNumHitsTopDocsCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LargeNumHitsTopDocsCollector.java index e0f8ff6acffa..d56984f7847f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LargeNumHitsTopDocsCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LargeNumHitsTopDocsCollector.java @@ -129,7 +129,7 @@ public TopDocs topDocs(int howMany) { */ protected void populateResults(ScoreDoc[] results, int howMany) { if (pq != null) { - assert totalHits >= requestedHitCount; + assert totalHits > requestedHitCount; for (int i = howMany - 1; i >= 0; i--) { results[i] = pq.pop(); } @@ -137,7 +137,7 @@ protected void populateResults(ScoreDoc[] results, int howMany) { } // Total number of hits collected were less than requestedHitCount - assert totalHits < requestedHitCount; + assert totalHits <= requestedHitCount; Collections.sort( hits, Comparator.comparing((ScoreDoc scoreDoc) -> scoreDoc.score) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java index ebc98df31afb..026fa4130d3c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.sandbox.search; -import static org.apache.lucene.sandbox.search.CombinedFieldQuery.FieldAndWeight; - import java.io.IOException; import java.util.ArrayList; import java.util.Collection; @@ -27,13 +25,13 @@ import java.util.Set; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.sandbox.search.CombinedFieldQuery.FieldAndWeight; import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.SmallFloat; /** - * Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. + * Scorer that sums document's norms from multiple fields. * *
<p>
    For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also * requires that either all fields or no fields have norms enabled. Having only some fields with @@ -65,15 +63,15 @@ final class MultiNormsLeafSimScorer { final List weightList = new ArrayList<>(); final Set duplicateCheckingSet = new HashSet<>(); for (FieldAndWeight field : normFields) { - assert duplicateCheckingSet.add(field.field) + assert duplicateCheckingSet.add(field.field()) : "There is a duplicated field [" - + field.field + + field.field() + "] used to construct MultiNormsLeafSimScorer"; - NumericDocValues norms = reader.getNormValues(field.field); + NumericDocValues norms = reader.getNormValues(field.field()); if (norms != null) { normsList.add(norms); - weightList.add(field.weight); + weightList.add(field.weight()); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/ProfilerCollectorManager.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/ProfilerCollectorManager.java new file mode 100644 index 000000000000..900d0b642fd8 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/ProfilerCollectorManager.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.search; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.CollectorManager; + +/** Collector manager for {@link ProfilerCollector} */ +public abstract class ProfilerCollectorManager + implements CollectorManager { + + private final String reason; + + /** + * Creates a profiler collector manager provided a certain reason + * + * @param reason the reason for the collection + */ + public ProfilerCollectorManager(String reason) { + this.reason = reason; + } + + /** Creates the collector to be wrapped with a {@link ProfilerCollector} */ + protected abstract Collector createCollector() throws IOException; + + @Override + public final ProfilerCollector newCollector() throws IOException { + return new ProfilerCollector(createCollector(), reason, List.of()); + } + + @Override + public ProfilerCollectorResult reduce(Collection collectors) + throws IOException { + String name = null; + String reason = null; + long time = 0; + + for (ProfilerCollector collector : collectors) { + assert name == null || name.equals(collector.getName()); + name = collector.getName(); + assert reason == null || reason.equals(collector.getReason()); + reason = collector.getReason(); + ProfilerCollectorResult profileResult = collector.getProfileResult(); + assert profileResult.getTime() == collector.getTime(); + time += profileResult.getTime(); + } + + return new ProfilerCollectorResult(name, reason, time, List.of()); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java index 46386b522498..1ec261139608 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java @@ -24,6 +24,7 @@ import java.util.List; import java.util.Map; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; @@ -35,7 +36,6 @@ import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; @@ -429,9 +429,8 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti } if (any) { - scorer = - new TermAutomatonScorer( - this, enums, anyTermID, new LeafSimScorer(stats, context.reader(), field, true)); + NumericDocValues norms = context.reader().getNormValues(field); + scorer = new TermAutomatonScorer(this, enums, anyTermID, stats, norms); } else { return null; } @@ -456,15 +455,20 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio } float score = scorer.score(); - LeafSimScorer leafSimScorer = ((TermAutomatonScorer) scorer).getLeafSimScorer(); EnumAndScorer[] originalSubsOnDoc = ((TermAutomatonScorer) scorer).getOriginalSubsOnDoc(); + NumericDocValues norms = context.reader().getNormValues(field); + long norm = 1L; + if (norms != null && norms.advanceExact(doc)) { + norm = norms.longValue(); + } + List termExplanations = new ArrayList<>(); for (EnumAndScorer enumAndScorer : 
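Note: one possible way to plug the new ProfilerCollectorManager into IndexSearcher; the wrapped TopScoreDocCollectorManager and the searcher/query variables are illustrative only and not part of this change:

  // Profile an arbitrary collector by subclassing the new manager.
  ProfilerCollectorManager profilerManager =
      new ProfilerCollectorManager("top_hits") {
        @Override
        protected Collector createCollector() {
          // Any Collector can be wrapped; a plain top-docs collector is used as an example here.
          return new TopScoreDocCollectorManager(10, 1000).newCollector();
        }
      };
  ProfilerCollectorResult profile = searcher.search(query, profilerManager);
  long elapsedNanos = profile.getTime(); // summed across per-slice collectors by reduce()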
originalSubsOnDoc) { if (enumAndScorer != null) { PostingsEnum postingsEnum = enumAndScorer.posEnum; if (postingsEnum.docID() == doc) { - float termScore = leafSimScorer.score(doc, postingsEnum.freq()); + float termScore = stats.score(postingsEnum.freq(), norm); termExplanations.add( Explanation.match( postingsEnum.freq(), @@ -482,7 +486,7 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio Explanation freqExplanation = Explanation.match(score, "TermAutomatonQuery, sum of:", termExplanations); - return leafSimScorer.explain(doc, freqExplanation); + return stats.explain(freqExplanation, norm); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java index 2d0b7ecb85bd..7c6d6c9b6bbd 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java @@ -17,11 +17,12 @@ package org.apache.lucene.sandbox.search; import java.io.IOException; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.sandbox.search.TermAutomatonQuery.EnumAndScorer; import org.apache.lucene.sandbox.search.TermAutomatonQuery.TermAutomatonWeight; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; @@ -44,7 +45,8 @@ class TermAutomatonScorer extends Scorer { // This is -1 if wildcard (null) terms were not used, else it's the id // of the wildcard term: private final int anyTermID; - private final LeafSimScorer docScorer; + private final SimScorer scorer; + private final NumericDocValues norms; private int numSubsOnDoc; @@ -61,11 +63,16 @@ class TermAutomatonScorer extends Scorer { private final EnumAndScorer[] originalSubsOnDoc; public TermAutomatonScorer( - TermAutomatonWeight weight, EnumAndScorer[] subs, int anyTermID, LeafSimScorer docScorer) + TermAutomatonWeight weight, + EnumAndScorer[] subs, + int anyTermID, + SimScorer scorer, + NumericDocValues norms) throws IOException { // System.out.println(" automaton:\n" + weight.automaton.toDot()); this.runAutomaton = new TermRunAutomaton(weight.automaton, subs.length); - this.docScorer = docScorer; + this.scorer = scorer; + this.norms = norms; this.docIDQueue = new DocIDQueue(subs.length); this.posQueue = new PositionQueue(subs.length); this.anyTermID = anyTermID; @@ -356,10 +363,6 @@ EnumAndScorer[] getOriginalSubsOnDoc() { return originalSubsOnDoc; } - LeafSimScorer getLeafSimScorer() { - return docScorer; - } - @Override public int docID() { return docID; @@ -369,12 +372,16 @@ public int docID() { public float score() throws IOException { // TODO: we could probably do better here, e.g. 
look @ freqs of actual terms involved in this // doc and score differently - return docScorer.score(docID, freq); + long norm = 1L; + if (norms != null && norms.advanceExact(docID)) { + norm = norms.longValue(); + } + return scorer.score(freq, norm); } @Override public float getMaxScore(int upTo) throws IOException { - return docScorer.getSimScorer().score(Float.MAX_VALUE, 1L); + return scorer.score(Float.MAX_VALUE, 1L); } static class TermRunAutomaton extends RunAutomaton { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/idversion/TestIDVersionPostingsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/idversion/TestIDVersionPostingsFormat.java index 7f67e4767f65..a34fbfa6db53 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/idversion/TestIDVersionPostingsFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/idversion/TestIDVersionPostingsFormat.java @@ -343,7 +343,7 @@ public PerThreadVersionPKLookup(IndexReader r, String field) throws IOException /** Returns docID if found, else -1. */ public int lookup(BytesRef id, long version) throws IOException { - for (int seg = 0; seg < numSegs; seg++) { + for (int seg = 0; seg < numEnums; seg++) { if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) { if (VERBOSE) { System.out.println(" found in seg=" + termsEnums[seg]); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java index 61c0e58c91ef..3669079b719d 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java @@ -20,9 +20,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; public class TestKMeans extends LuceneTestCase { @@ -32,7 +32,7 @@ public void testKMeansAPI() throws IOException { int dims = random().nextInt(2, 20); int randIdx = random().nextInt(VectorSimilarityFunction.values().length); VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.values()[randIdx]; - RandomAccessVectorValues.Floats vectors = generateData(nVectors, dims, nClusters); + FloatVectorValues vectors = generateData(nVectors, dims, nClusters); // default case { @@ -75,7 +75,7 @@ public void testKMeansSpecialCases() throws IOException { // nClusters > nVectors int nClusters = 20; int nVectors = 10; - RandomAccessVectorValues.Floats vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectors = generateData(nVectors, 5, nClusters); KMeans.Results results = KMeans.cluster(vectors, VectorSimilarityFunction.EUCLIDEAN, nClusters); // assert that we get 1 centroid, as nClusters will be adjusted @@ -87,7 +87,7 @@ public void testKMeansSpecialCases() throws IOException { int sampleSize = 2; int nClusters = 2; int nVectors = 300; - RandomAccessVectorValues.Floats vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectors = generateData(nVectors, 5, nClusters); KMeans.KmeansInitializationMethod initializationMethod = KMeans.KmeansInitializationMethod.PLUS_PLUS; KMeans.Results results = @@ -108,7 +108,7 @@ public void testKMeansSpecialCases() throws IOException { // 
test unassigned centroids int nClusters = 4; int nVectors = 400; - RandomAccessVectorValues.Floats vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectors = generateData(nVectors, 5, nClusters); KMeans.Results results = KMeans.cluster(vectors, VectorSimilarityFunction.EUCLIDEAN, nClusters); float[][] centroids = results.centroids(); @@ -118,8 +118,7 @@ public void testKMeansSpecialCases() throws IOException { } } - private static RandomAccessVectorValues.Floats generateData( - int nSamples, int nDims, int nClusters) { + private static FloatVectorValues generateData(int nSamples, int nDims, int nClusters) { List vectors = new ArrayList<>(nSamples); float[][] centroids = new float[nClusters][nDims]; // Generate random centroids @@ -137,6 +136,6 @@ private static RandomAccessVectorValues.Floats generateData( } vectors.add(vector); } - return RandomAccessVectorValues.fromFloats(vectors, nDims); + return FloatVectorValues.fromFloats(vectors, nDims); } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/SandboxFacetTestCase.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/SandboxFacetTestCase.java new file mode 100644 index 000000000000..560a3d4bc303 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/SandboxFacetTestCase.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.sandbox.facet.iterators.CandidateSetOrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.ComparableSupplier; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.TaxonomyChildrenOrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.TopnOrdinalIterator; +import org.apache.lucene.sandbox.facet.labels.OrdToLabel; +import org.apache.lucene.sandbox.facet.labels.TaxonomyOrdLabelBiMap; +import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; +import org.apache.lucene.tests.util.LuceneTestCase; + +public abstract class SandboxFacetTestCase extends LuceneTestCase { + // TODO: We don't have access to overall count for all facets from count recorder, and we can't + // compute it as a SUM of values for each facet ordinal because we need to respect cases where the + // same doc belongs to multiple facets (e.g. overlapping ranges or multi value fields). In most + // cases we can already access the value. E.g. 
for facets with hierarchy (taxonomy or SSDV) we can + // read value for parent facet ordinal. I believe the only case that requires code changes is + // range facets. To solve it we can add a parameter to range FacetCutter to assign/yeild special + // facet ordinal for every document that matches at least one range from the list. Overall, + // sandbox facet tests don't have to use FacetResult, so we change it to assert facet labels and + // recorded results directly and avoid need for this constant. + static final int VALUE_CANT_BE_COMPUTED = Integer.MIN_VALUE; + + protected void assertNumericValuesEquals(Number a, Number b) { + assertTrue(a.getClass().isInstance(b)); + if (a instanceof Float) { + assertEquals(a.floatValue(), b.floatValue(), a.floatValue() / 1e5); + } else if (a instanceof Double) { + assertEquals(a.doubleValue(), b.doubleValue(), a.doubleValue() / 1e5); + } else { + assertEquals(a, b); + } + } + + protected void assertFacetResult( + FacetResult result, + String expectedDim, + String[] expectedPath, + int expectedChildCount, + Number expectedValue, + LabelAndValue... expectedChildren) { + assertEquals(expectedDim, result.dim); + assertArrayEquals(expectedPath, result.path); + assertEquals(expectedChildCount, result.childCount); + assertNumericValuesEquals(expectedValue, result.value); + assertEquals(expectedChildren.length, result.labelValues.length); + // assert children equal with no assumption of the children ordering + assertTrue(Arrays.asList(result.labelValues).containsAll(Arrays.asList(expectedChildren))); + } + + FacetResult getTopChildrenByCount( + CountFacetRecorder countFacetRecorder, + TaxonomyReader taxoReader, + int topN, + String dimension, + String... path) + throws IOException { + ComparableSupplier countComparable = + ComparableUtils.byCount(countFacetRecorder); + TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader); + FacetLabel parentLabel = new FacetLabel(dimension, path); + OrdinalIterator childrenIterator = + new TaxonomyChildrenOrdinalIterator( + countFacetRecorder.recordedOrds(), + taxoReader.getParallelTaxonomyArrays().parents(), + ordLabels.getOrd(parentLabel)); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(childrenIterator, countComparable, topN); + // Get array of final ordinals - we need to use all of them to get labels first, and then to get + // counts, + // but OrdinalIterator only allows reading ordinals once. + int[] resultOrdinals = topByCountOrds.toArray(); + + FacetLabel[] labels = ordLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + int childCount = 0; + for (int i = 0; i < resultOrdinals.length; i++) { + int count = countFacetRecorder.getCount(resultOrdinals[i]); + labelsAndValues.add(new LabelAndValue(labels[i].lastComponent(), count)); + childCount++; + } + // int value = countFacetRecorder.getCount(parentOrdinal); + return new FacetResult( + dimension, + path, + VALUE_CANT_BE_COMPUTED, + labelsAndValues.toArray(new LabelAndValue[0]), + childCount); + } + + FacetResult getAllChildren( + CountFacetRecorder countFacetRecorder, + TaxonomyReader taxoReader, + String dimension, + String... 
path) + throws IOException { + TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader); + FacetLabel parentLabel = new FacetLabel(dimension, path); + int parentOrdinal = ordLabels.getOrd(parentLabel); + OrdinalIterator childrenIternator = + new TaxonomyChildrenOrdinalIterator( + countFacetRecorder.recordedOrds(), + taxoReader.getParallelTaxonomyArrays().parents(), + parentOrdinal); + // Get array of final ordinals - we need to use all of them to get labels first, and then to get + // counts, + // but OrdinalIterator only allows reading ordinals once. + int[] resultOrdinals = childrenIternator.toArray(); + + FacetLabel[] labels = ordLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + int childCount = 0; + for (int i = 0; i < resultOrdinals.length; i++) { + int count = countFacetRecorder.getCount(resultOrdinals[i]); + labelsAndValues.add(new LabelAndValue(labels[i].lastComponent(), count)); + childCount++; + } + // int value = countFacetRecorder.getCount(parentOrdinal); + return new FacetResult( + dimension, + path, + VALUE_CANT_BE_COMPUTED, + labelsAndValues.toArray(new LabelAndValue[0]), + childCount); + } + + FacetResult getAllSortByOrd( + int[] resultOrdinals, + CountFacetRecorder countFacetRecorder, + String dimension, + OrdToLabel ordLabels) + throws IOException { + ComparableUtils.sort(resultOrdinals, ComparableUtils.byOrdinal()); + FacetLabel[] labels = ordLabels.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + int childCount = 0; + for (int i = 0; i < resultOrdinals.length; i++) { + int count = countFacetRecorder.getCount(resultOrdinals[i]); + labelsAndValues.add(new LabelAndValue(labels[i].lastComponent(), count)); + childCount++; + } + + return new FacetResult( + dimension, + new String[0], + VALUE_CANT_BE_COMPUTED, + labelsAndValues.toArray(new LabelAndValue[0]), + childCount); + } + + int getSpecificValue( + CountFacetRecorder countFacetRecorder, TaxonomyReader taxoReader, String... path) + throws IOException { + TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader); + FacetLabel label = new FacetLabel(path); + int facetOrd = ordLabels.getOrd(label); + return countFacetRecorder.getCount(facetOrd); + } + + int[] getCountsForRecordedCandidates( + CountFacetRecorder countFacetRecorder, TaxonomyReader taxoReader, FacetLabel[] candidates) + throws IOException { + int[] resultOrds = + new CandidateSetOrdinalIterator( + countFacetRecorder, candidates, new TaxonomyOrdLabelBiMap(taxoReader)) + .toArray(); + int[] counts = new int[resultOrds.length]; + for (int i = 0; i < resultOrds.length; i++) { + counts[i] = countFacetRecorder.getCount(resultOrds[i]); + } + return counts; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestCandidateSetOrdinalIterator.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestCandidateSetOrdinalIterator.java new file mode 100644 index 000000000000..5b8f55b96a51 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestCandidateSetOrdinalIterator.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
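Note: a condensed sketch of the counting flow that the helpers above wrap; searcher, query, config and taxoReader are assumed to be set up as in the tests, and the dimension name is illustrative:

  // One counting pass over taxonomy facets.
  TaxonomyFacetsCutter cutter =
      new TaxonomyFacetsCutter(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, config, taxoReader);
  CountFacetRecorder counts = new CountFacetRecorder();
  searcher.search(query, new FacetFieldCollectorManager<>(cutter, counts));

  // Resolve the top-n children of a dimension from the recorded ordinals.
  TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader);
  OrdinalIterator children = new TaxonomyChildrenOrdinalIterator(
      counts.recordedOrds(),
      taxoReader.getParallelTaxonomyArrays().parents(),
      ordLabels.getOrd(new FacetLabel("Author")));
  int[] topOrds = new TopnOrdinalIterator<>(children, ComparableUtils.byCount(counts), 10).toArray();
  FacetLabel[] labels = ordLabels.getLabels(topOrds);
  for (int i = 0; i < topOrds.length; i++) {
    System.out.println(labels[i].lastComponent() + " -> " + counts.getCount(topOrds[i]));
  }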
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.facet; + +import java.io.IOException; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.cutters.FacetCutter; +import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; +import org.apache.lucene.sandbox.facet.iterators.CandidateSetOrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; +import org.apache.lucene.sandbox.facet.labels.LabelToOrd; +import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.FacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.LeafFacetRecorder; +import org.apache.lucene.tests.util.LuceneTestCase; + +/** Tests for {@link CandidateSetOrdinalIterator}. */ +public class TestCandidateSetOrdinalIterator extends LuceneTestCase { + + /** LabelToOrd that parses label's string to get int ordinal */ + private LabelToOrd mockLabelToOrd = + new LabelToOrd() { + @Override + public int getOrd(FacetLabel label) { + return Integer.valueOf(label.lastComponent()); + } + + @Override + public int[] getOrds(FacetLabel[] labels) { + int[] result = new int[labels.length]; + for (int i = 0; i < result.length; i++) { + result[i] = getOrd(labels[i]); + } + return result; + } + }; + + private FacetCutter mockFacetCutter = + new FacetCutter() { + @Override + public LeafFacetCutter createLeafCutter(LeafReaderContext context) throws IOException { + return null; + } + }; + + public void testBasic() throws IOException { + FacetRecorder recorder = new CountFacetRecorder(); + LeafFacetRecorder leafRecorder = recorder.getLeafRecorder(null); + leafRecorder.record(0, 0); + leafRecorder.record(0, 3); + recorder.reduce(mockFacetCutter); + + FacetLabel[] candidates = + new FacetLabel[] { + new FacetLabel("0"), + new FacetLabel("1"), + new FacetLabel(String.valueOf(LabelToOrd.INVALID_ORD)), + new FacetLabel("3") + }; + + // Note that "1" is filtered out as it was never recorded + assertArrayEquals( + new int[] {0, 3}, + new CandidateSetOrdinalIterator(recorder, candidates, mockLabelToOrd).toArray()); + } + + public void testEmptyRecorder() throws IOException { + FacetRecorder recorder = new CountFacetRecorder(); + recorder.reduce(mockFacetCutter); + + FacetLabel[] candidates = + new FacetLabel[] { + new FacetLabel("0"), + new FacetLabel("1"), + new FacetLabel(String.valueOf(LabelToOrd.INVALID_ORD)), + new FacetLabel("3") + }; + + // Note that "1" is filtered out as it was never recorded + assertEquals( + OrdinalIterator.NO_MORE_ORDS, + new CandidateSetOrdinalIterator(recorder, candidates, mockLabelToOrd).nextOrd()); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestFacetRecorders.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestFacetRecorders.java new file mode 100644 index 000000000000..41d73e16f989 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestFacetRecorders.java @@ -0,0 +1,478 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet; + +import static org.apache.lucene.facet.FacetsConfig.DEFAULT_INDEX_FIELD_NAME; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoubleDocValuesField; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.facet.FacetField; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.sandbox.facet.cutters.TaxonomyFacetsCutter; +import org.apache.lucene.sandbox.facet.iterators.CandidateSetOrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.ComparableSupplier; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.TaxonomyChildrenOrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.TopnOrdinalIterator; +import org.apache.lucene.sandbox.facet.labels.TaxonomyOrdLabelBiMap; +import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.FacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.LongAggregationsFacetRecorder; +import org.apache.lucene.sandbox.facet.recorders.MultiFacetsRecorder; +import org.apache.lucene.sandbox.facet.recorders.Reducer; +import org.apache.lucene.search.DoubleValuesSource; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LongValuesSource; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.util.IOUtils; + +/** Test for {@link FacetRecorder} */ +public class TestFacetRecorders extends SandboxFacetTestCase { + + public void testCountAndLongAggregationRecordersBasic() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the + // main index: + DirectoryTaxonomyWriter taxoWriter = + new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("Publish Date", true); + config.setMultiValued("Publish Date", random().nextBoolean()); + + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + 
Document doc = new Document(); + doc.add(new FacetField("Author", "Bob")); + doc.add(new FacetField("Publish Date", "2010", "10", "15")); + doc.add(new NumericDocValuesField("Units", 9)); + doc.add(new DoubleDocValuesField("Popularity", 3.5d)); + doc.add(new StringField("Availability", "yes", Field.Store.NO)); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2010")); + doc.add(new NumericDocValuesField("Units", 2)); + doc.add(new DoubleDocValuesField("Popularity", 4.1D)); + doc.add(new StringField("Availability", "yes", Field.Store.NO)); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2012", "1", "1")); + doc.add(new NumericDocValuesField("Units", 5)); + doc.add(new DoubleDocValuesField("Popularity", 3.9D)); + doc.add(new StringField("Availability", "yes", Field.Store.NO)); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Susan")); + doc.add(new FacetField("Publish Date", "2012", "1", "7")); + doc.add(new NumericDocValuesField("Units", 7)); + doc.add(new DoubleDocValuesField("Popularity", 4D)); + doc.add(new StringField("Availability", "yes", Field.Store.NO)); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Frank")); + doc.add(new FacetField("Publish Date", "1999", "5", "5")); + doc.add(new NumericDocValuesField("Units", 6)); + doc.add(new DoubleDocValuesField("Popularity", 7.9D)); + doc.add(new StringField("Availability", "yes", Field.Store.NO)); + writer.addDocument(config.build(taxoWriter, doc)); + + // Add a document that is not returned by a query + doc = new Document(); + doc.add(new FacetField("Author", "John")); + doc.add(new FacetField("Publish Date", "2024", "11", "12")); + doc.add(new NumericDocValuesField("Units", 200)); + doc.add(new DoubleDocValuesField("Popularity", 13D)); + doc.add(new StringField("Availability", "no", Field.Store.NO)); + writer.addDocument(config.build(taxoWriter, doc)); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + + Query query = new TermQuery(new Term("Availability", "yes")); + + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader); + + LongValuesSource[] longValuesSources = new LongValuesSource[2]; + Reducer[] reducers = new Reducer[2]; + // popularity:max + longValuesSources[0] = DoubleValuesSource.fromDoubleField("Popularity").toLongValuesSource(); + reducers[0] = Reducer.MAX; + // units:sum + longValuesSources[1] = LongValuesSource.fromLongField("Units"); + reducers[1] = Reducer.SUM; + + LongAggregationsFacetRecorder longAggregationsFacetRecorder = + new LongAggregationsFacetRecorder(longValuesSources, reducers); + + final CountFacetRecorder countRecorder = new CountFacetRecorder(); + // Compute both counts and aggregations + MultiFacetsRecorder multiFacetsRecorder = + new MultiFacetsRecorder(countRecorder, longAggregationsFacetRecorder); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, multiFacetsRecorder); + searcher.search(query, collectorManager); + + int[] ordsFromCounts = countRecorder.recordedOrds().toArray(); + Arrays.sort(ordsFromCounts); + int[] 
ordsFromAggregations = longAggregationsFacetRecorder.recordedOrds().toArray(); + Arrays.sort(ordsFromAggregations); + assertArrayEquals(ordsFromCounts, ordsFromAggregations); + + // Retrieve & verify results: + assertEquals( + "dim=Publish Date path=[]\n" + + " 2010 (2, agg0=4 agg1=11)\n" + + " 2012 (2, agg0=4 agg1=12)\n" + + " 1999 (1, agg0=7 agg1=6)\n", + getTopChildrenWithLongAggregations( + countRecorder, taxoReader, 10, 2, longAggregationsFacetRecorder, null, "Publish Date")); + assertEquals( + "dim=Author path=[]\n" + + " Lisa (2, agg0=4 agg1=7)\n" + + " Bob (1, agg0=3 agg1=9)\n" + + " Susan (1, agg0=4 agg1=7)\n" + + " Frank (1, agg0=7 agg1=6)\n", + getTopChildrenWithLongAggregations( + countRecorder, taxoReader, 10, 2, longAggregationsFacetRecorder, null, "Author")); + + assertArrayEquals( + new long[] {11, 6}, + getAggregationForRecordedCandidates( + longAggregationsFacetRecorder, + 1, + taxoReader, + new FacetLabel[] { + new FacetLabel("Publish Date", "2010"), + // Not in the index - skipped + new FacetLabel("Publish Date", "2025"), + // Not matched by the query - skipped + new FacetLabel("Publish Date", "2024"), + new FacetLabel("Publish Date", "1999"), + })); + + assertArrayEquals( + new long[] {7, 6}, + getAggregationForRecordedCandidates( + longAggregationsFacetRecorder, + 1, + taxoReader, + new FacetLabel[] { + new FacetLabel("Author", "Lisa"), + // Not in the index - skipped + new FacetLabel("Author", "Christofer"), + // Not matched by the query - skipped + new FacetLabel("Author", "John"), + new FacetLabel("Author", "Frank"), + })); + + writer.close(); + IOUtils.close(taxoWriter, searcher.getIndexReader(), taxoReader, taxoDir, dir); + } + + /** + * Test that counts and long aggregations are correct when different index segments have different + * facet ordinals. 
+ */ + public void testCountAndLongAggregationRecordersMultipleSegments() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the + // main index: + DirectoryTaxonomyWriter taxoWriter = + new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("Publish Date", true); + config.setMultiValued("Publish Date", random().nextBoolean()); + + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + doc.add(new FacetField("Author", "Bob")); + doc.add(new FacetField("Publish Date", "2010", "10", "15")); + doc.add(new NumericDocValuesField("Units", 9)); + doc.add(new DoubleDocValuesField("Popularity", 3.5d)); + writer.addDocument(config.build(taxoWriter, doc)); + writer.commit(); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2012", "10", "20")); + doc.add(new NumericDocValuesField("Units", 2)); + doc.add(new DoubleDocValuesField("Popularity", 4.1D)); + writer.addDocument(config.build(taxoWriter, doc)); + writer.commit(); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + + Query query = new MatchAllDocsQuery(); + + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader); + + LongValuesSource[] longValuesSources = new LongValuesSource[2]; + Reducer[] reducers = new Reducer[2]; + // popularity:max + longValuesSources[0] = DoubleValuesSource.fromDoubleField("Popularity").toLongValuesSource(); + reducers[0] = Reducer.MAX; + // units:sum + longValuesSources[1] = LongValuesSource.fromLongField("Units"); + reducers[1] = Reducer.SUM; + + LongAggregationsFacetRecorder longAggregationsFacetRecorder = + new LongAggregationsFacetRecorder(longValuesSources, reducers); + + final CountFacetRecorder countRecorder = new CountFacetRecorder(); + // Compute both counts and aggregations + MultiFacetsRecorder multiFacetsRecorder = + new MultiFacetsRecorder(countRecorder, longAggregationsFacetRecorder); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, multiFacetsRecorder); + searcher.search(query, collectorManager); + + // Retrieve & verify results: + assertEquals( + "dim=Publish Date path=[]\n" + + " 2010 (1, agg0=3 agg1=9)\n" + + " 2012 (1, agg0=4 agg1=2)\n", + getTopChildrenWithLongAggregations( + countRecorder, taxoReader, 10, 2, longAggregationsFacetRecorder, null, "Publish Date")); + assertEquals( + "dim=Author path=[]\n" + " Bob (1, agg0=3 agg1=9)\n" + " Lisa (1, agg0=4 agg1=2)\n", + getTopChildrenWithLongAggregations( + countRecorder, taxoReader, 10, 2, longAggregationsFacetRecorder, null, "Author")); + + writer.close(); + IOUtils.close(taxoWriter, searcher.getIndexReader(), taxoReader, taxoDir, dir); + } + + public void testSortByLongAggregation() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the + // main index: + DirectoryTaxonomyWriter taxoWriter = + new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("Publish Date", true); + config.setMultiValued("Publish Date", random().nextBoolean()); + + RandomIndexWriter 
writer = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + doc.add(new FacetField("Author", "Bob")); + doc.add(new FacetField("Publish Date", "2010", "10", "15")); + doc.add(new NumericDocValuesField("Units", 9)); + doc.add(new DoubleDocValuesField("Popularity", 3.5d)); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2010", "10", "20")); + doc.add(new NumericDocValuesField("Units", 2)); + doc.add(new DoubleDocValuesField("Popularity", 4.1D)); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2012", "1", "1")); + doc.add(new NumericDocValuesField("Units", 5)); + doc.add(new DoubleDocValuesField("Popularity", 3.9D)); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Susan")); + doc.add(new FacetField("Publish Date", "2012", "1", "7")); + doc.add(new NumericDocValuesField("Units", 7)); + doc.add(new DoubleDocValuesField("Popularity", 4D)); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Frank")); + doc.add(new FacetField("Publish Date", "1999", "5", "5")); + doc.add(new NumericDocValuesField("Units", 6)); + doc.add(new DoubleDocValuesField("Popularity", 7.9D)); + writer.addDocument(config.build(taxoWriter, doc)); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + + Query query = new MatchAllDocsQuery(); + + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader); + + LongValuesSource[] longValuesSources = new LongValuesSource[2]; + Reducer[] reducers = new Reducer[2]; + // popularity:max + longValuesSources[0] = DoubleValuesSource.fromDoubleField("Popularity").toLongValuesSource(); + reducers[0] = Reducer.MAX; + // units:sum + longValuesSources[1] = LongValuesSource.fromLongField("Units"); + reducers[1] = Reducer.SUM; + + LongAggregationsFacetRecorder longAggregationsFacetRecorder = + new LongAggregationsFacetRecorder(longValuesSources, reducers); + + final CountFacetRecorder countRecorder = new CountFacetRecorder(); + // Compute both counts and aggregations + MultiFacetsRecorder multiFacetsRecorder = + new MultiFacetsRecorder(countRecorder, longAggregationsFacetRecorder); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, multiFacetsRecorder); + searcher.search(query, collectorManager); + + // Retrieve & verify results: + assertEquals( + "dim=Publish Date path=[]\n" + + " 2012 (2, agg0=4 agg1=12)\n" + + " 2010 (2, agg0=4 agg1=11)\n" + + " 1999 (1, agg0=7 agg1=6)\n", + getTopChildrenWithLongAggregations( + countRecorder, taxoReader, 10, 2, longAggregationsFacetRecorder, 1, "Publish Date")); + assertEquals( + "dim=Author path=[]\n" + + " Frank (1, agg0=7 agg1=6)\n" + + " Lisa (2, agg0=4 agg1=7)\n" + + " Susan (1, agg0=4 agg1=7)\n" + + " Bob (1, agg0=3 agg1=9)\n", + getTopChildrenWithLongAggregations( + countRecorder, taxoReader, 10, 2, longAggregationsFacetRecorder, 0, "Author")); + + writer.close(); + IOUtils.close(taxoWriter, searcher.getIndexReader(), taxoReader, taxoDir, dir); + } + + private String getTopChildrenWithLongAggregations( + CountFacetRecorder countFacetRecorder, + 
TaxonomyReader taxoReader, + int topN, + int numOfAggregations, + LongAggregationsFacetRecorder longAggregationsFacetRecorder, + Integer sortByLongAggregationId, + String dimension, + String... path) + throws IOException { + StringBuilder resultBuilder = new StringBuilder(); + resultBuilder.append("dim="); + resultBuilder.append(dimension); + resultBuilder.append(" path="); + resultBuilder.append(Arrays.toString(path)); + resultBuilder.append('\n'); + + TaxonomyOrdLabelBiMap ordLabels = new TaxonomyOrdLabelBiMap(taxoReader); + FacetLabel parentLabel = new FacetLabel(dimension, path); + OrdinalIterator childrenIternator = + new TaxonomyChildrenOrdinalIterator( + countFacetRecorder.recordedOrds(), + taxoReader.getParallelTaxonomyArrays().parents(), + ordLabels.getOrd(parentLabel)); + final int[] resultOrdinals; + if (sortByLongAggregationId != null) { + ComparableSupplier comparableSupplier = + ComparableUtils.byAggregatedValue( + countFacetRecorder, longAggregationsFacetRecorder, sortByLongAggregationId); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(childrenIternator, comparableSupplier, topN); + resultOrdinals = topByCountOrds.toArray(); + } else { + ComparableSupplier countComparable = + ComparableUtils.byCount(countFacetRecorder); + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(childrenIternator, countComparable, topN); + resultOrdinals = topByCountOrds.toArray(); + } + + FacetLabel[] labels = ordLabels.getLabels(resultOrdinals); + for (int i = 0; i < resultOrdinals.length; i++) { + int facetOrdinal = resultOrdinals[i]; + int count = countFacetRecorder.getCount(facetOrdinal); + resultBuilder.append(" "); + resultBuilder.append(labels[i].lastComponent()); + resultBuilder.append(" ("); + resultBuilder.append(count); + resultBuilder.append(", "); + for (int a = 0; a < numOfAggregations; a++) { + resultBuilder.append(" agg"); + resultBuilder.append(a); + resultBuilder.append("="); + resultBuilder.append(longAggregationsFacetRecorder.getRecordedValue(facetOrdinal, a)); + } + resultBuilder.append(")"); + resultBuilder.append("\n"); + } + // int value = countFacetRecorder.getCount(parentOrdinal); + return resultBuilder.toString(); + } + + long[] getAggregationForRecordedCandidates( + LongAggregationsFacetRecorder aggregationsRecorder, + int aggregationId, + TaxonomyReader taxoReader, + FacetLabel[] candidates) + throws IOException { + int[] resultOrds = + new CandidateSetOrdinalIterator( + aggregationsRecorder, candidates, new TaxonomyOrdLabelBiMap(taxoReader)) + .toArray(); + long[] result = new long[resultOrds.length]; + for (int i = 0; i < resultOrds.length; i++) { + result[i] = aggregationsRecorder.getRecordedValue(resultOrds[i], aggregationId); + } + return result; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestLongValueFacet.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestLongValueFacet.java new file mode 100644 index 000000000000..2c2f82fc9e6c --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestLongValueFacet.java @@ -0,0 +1,841 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
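Note: the recorder wiring exercised by these tests, reduced to a sketch; searcher, query, config, taxoReader and the field names are assumed from the test setup above:

  // One collection pass feeding both a CountFacetRecorder and a
  // LongAggregationsFacetRecorder through MultiFacetsRecorder.
  CountFacetRecorder counts = new CountFacetRecorder();
  LongAggregationsFacetRecorder aggs = new LongAggregationsFacetRecorder(
      new LongValuesSource[] {LongValuesSource.fromLongField("Units")},
      new Reducer[] {Reducer.SUM});
  MultiFacetsRecorder multi = new MultiFacetsRecorder(counts, aggs);

  TaxonomyFacetsCutter cutter =
      new TaxonomyFacetsCutter(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, config, taxoReader);
  searcher.search(query, new FacetFieldCollectorManager<>(cutter, multi));

  // After the search, each sub-recorder is queried independently.
  int ord = new TaxonomyOrdLabelBiMap(taxoReader).getOrd(new FacetLabel("Author", "Lisa"));
  int count = counts.getCount(ord);
  long unitsSum = aggs.getRecordedValue(ord, 0); // aggregation index 0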
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.facet; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.LongValueFacetCounts; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.sandbox.facet.cutters.LongValueFacetCutter; +import org.apache.lucene.sandbox.facet.iterators.ComparableSupplier; +import org.apache.lucene.sandbox.facet.iterators.OrdinalIterator; +import org.apache.lucene.sandbox.facet.iterators.TopnOrdinalIterator; +import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; + +/** Tests long value facets, based on TestLongValueFacetCounts. */ +public class TestLongValueFacet extends SandboxFacetTestCase { + + public void testBasic() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + for (long l = 0; l < 100; l++) { + Document doc = new Document(); + doc.add(new NumericDocValuesField("field", l % 5)); + w.addDocument(doc); + } + + // Also add Long.MAX_VALUE + Document doc = new Document(); + doc.add(new NumericDocValuesField("field", Long.MAX_VALUE)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + LongValueFacetCutter longValuesFacetCutter = new LongValueFacetCutter("field"); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longValuesFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + + FacetResult result = getAllChildrenSortByValue("field", longValuesFacetCutter, countRecorder); + assertEquals( + "dim=field path=[] value=-2147483648 childCount=6\n 0 (20)\n 1 (20)\n 2 (20)\n 3 (20)\n " + + "4 (20)\n 9223372036854775807 (1)\n", + result.toString()); + + FacetResult topChildrenResult = + getTopChildren(2, "field", longValuesFacetCutter, countRecorder); + assertEquals( + "dim=field path=[] value=-2147483648 childCount=2\n 0 (20)\n 1 (20)\n", + topChildrenResult.toString()); + + assertFacetResult( + getAllChildren("field", longValuesFacetCutter, countRecorder), + "field", + new String[0], + 6, + -2147483648, + new LabelAndValue("0", 20), + new LabelAndValue("1", 20), + new LabelAndValue("2", 20), + new LabelAndValue("3", 20), + new LabelAndValue("4", 20), + new LabelAndValue("9223372036854775807", 1)); + + r.close(); + d.close(); + } + 
+ public void testOnlyBigLongs() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + for (long l = 0; l < 3; l++) { + Document doc = new Document(); + doc.add(new NumericDocValuesField("field", Long.MAX_VALUE - l)); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + LongValueFacetCutter longValuesFacetCutter = new LongValueFacetCutter("field"); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longValuesFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + + FacetResult result = getAllChildrenSortByValue("field", longValuesFacetCutter, countRecorder); + + assertFacetResult( + getAllChildren("field", longValuesFacetCutter, countRecorder), + "field", + new String[0], + 3, + -2147483648, + new LabelAndValue("9223372036854775805", 1), + new LabelAndValue("9223372036854775806", 1), + new LabelAndValue("9223372036854775807", 1)); + + // since we have no insight into the value order in the hashMap, we sort labels by value and + // count in + // ascending order in order to compare with expected results + Arrays.sort( + result.labelValues, + Comparator.comparing((LabelAndValue a) -> a.label) + .thenComparingLong(a -> a.value.longValue())); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=3\n 9223372036854775805 (1)\n " + + "9223372036854775806 (1)\n 9223372036854775807 (1)\n", + result.toString()); + r.close(); + d.close(); + } + + public void testRandomSingleValued() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + + int docCount = atLeast(1000); + double missingChance = random().nextDouble(); + long maxValue; + if (random().nextBoolean()) { + maxValue = random().nextLong() & Long.MAX_VALUE; + } else { + maxValue = random().nextInt(1000); + } + if (VERBOSE) { + System.out.println( + "TEST: valueCount=" + + docCount + + " valueRange=-" + + maxValue + + "-" + + maxValue + + " missingChance=" + + missingChance); + } + Long[] values = new Long[docCount]; + // int missingCount = 0; + for (int i = 0; i < docCount; i++) { + Document doc = new Document(); + doc.add(new IntPoint("id", i)); + if (random().nextDouble() > missingChance) { + long value = TestUtil.nextLong(random(), -maxValue, maxValue); + doc.add(new NumericDocValuesField("field", value)); + values[i] = value; + } else { + // missingCount++; + } + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + int iters = atLeast(100); + for (int iter = 0; iter < iters; iter++) { + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + System.out.println(" test all docs"); + } + + // all docs + Map expected = new HashMap<>(); + int expectedChildCount = 0; + for (int i = 0; i < docCount; i++) { + if (values[i] != null) { + Integer curCount = expected.get(values[i]); + if (curCount == null) { + curCount = 0; + expectedChildCount++; + } + expected.put(values[i], curCount + 1); + } + } + + List> expectedCounts = new ArrayList<>(expected.entrySet()); + + // sort by value + expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); + + LongValueFacetCutter longValuesFacetCutter = new LongValueFacetCutter("field"); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager collectorManager = + new 
FacetFieldCollectorManager<>(longValuesFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + /* TODO: uncomment and adjust when LongValueFacetCutter supports value sources + if (random().nextBoolean()) { + if (VERBOSE) { + System.out.println(" use value source"); + } + + if (random().nextBoolean()) { + facetCounts = + new LongValueFacetCounts("field", LongValuesSource.fromLongField("field"), fc); + } else if (random().nextBoolean()) { + facetCounts = + new LongValueFacetCounts("field", MultiLongValuesSource.fromLongField("field"), fc); + } else { + facetCounts = + new LongValueFacetCounts( + "field", + MultiLongValuesSource.fromSingleValued(LongValuesSource.fromLongField("field")), + fc); + } + } else { */ + if (VERBOSE) { + System.out.println(" use doc values"); + } + + FacetResult actual = getAllChildrenSortByValue("field", longValuesFacetCutter, countRecorder); + assertSame( + "all docs, sort facets by value", + expectedCounts, + expectedChildCount, + -2147483648, + // docCount - missingCount, + actual, + Integer.MAX_VALUE); + + // test getAllChildren + expectedCounts.sort( + Map.Entry.comparingByKey().thenComparingLong(Map.Entry::getValue)); + FacetResult allChildren = getAllChildren("field", longValuesFacetCutter, countRecorder); + // sort labels by value, count in ascending order + Arrays.sort( + allChildren.labelValues, + Comparator.comparing((LabelAndValue a) -> a.label) + .thenComparingLong(a -> a.value.longValue())); + assertSame( + "test getAllChildren", + expectedCounts, + expectedChildCount, + -2147483648, + // docCount - missingCount, + actual, + Integer.MAX_VALUE); + + // sort by count + expectedCounts.sort( + (a, b) -> { + int cmp = -Integer.compare(a.getValue(), b.getValue()); + if (cmp == 0) { + // tie break by value + cmp = Long.compare(a.getKey(), b.getKey()); + } + return cmp; + }); + int topN; + if (random().nextBoolean()) { + topN = docCount; + } else { + topN = random().nextInt(1, docCount); + } + if (VERBOSE) { + System.out.println(" topN=" + topN); + } + actual = getTopChildren(topN, "field", longValuesFacetCutter, countRecorder); + assertSame( + "all docs, sort facets by count", + expectedCounts, + Math.min(topN, expectedChildCount), + // expectedChildCount, + -2147483648, + // docCount - missingCount, + actual, + topN); + + // subset of docs + int minId = random().nextInt(docCount); + int maxId = random().nextInt(docCount); + if (minId > maxId) { + int tmp = minId; + minId = maxId; + maxId = tmp; + } + if (VERBOSE) { + System.out.println(" test id range " + minId + "-" + maxId); + } + + longValuesFacetCutter = new LongValueFacetCutter("field"); + countRecorder = new CountFacetRecorder(); + collectorManager = new FacetFieldCollectorManager<>(longValuesFacetCutter, countRecorder); + s.search(IntPoint.newRangeQuery("id", minId, maxId), collectorManager); + // TODO: uncomment and change longValuesFacetCutter when LongValueFacetCutter supports value + // sources + // if (random().nextBoolean()) { + if (VERBOSE) { + System.out.println(" use doc values"); + } + /*} else { + if (VERBOSE) { + System.out.println(" use value source"); + } + if (random().nextBoolean()) { + facetCounts = + new LongValueFacetCounts("field", LongValuesSource.fromLongField("field"), fc); + } else if (random().nextBoolean()) { + facetCounts = + new LongValueFacetCounts("field", MultiLongValuesSource.fromLongField("field"), fc); + } else { + facetCounts = + new LongValueFacetCounts( + "field", + 
MultiLongValuesSource.fromSingleValued(LongValuesSource.fromLongField("field")), + fc); + } + }*/ + + expected = new HashMap<>(); + expectedChildCount = 0; + // int totCount = 0; + for (int i = minId; i <= maxId; i++) { + if (values[i] != null) { + // totCount++; + Integer curCount = expected.get(values[i]); + if (curCount == null) { + expectedChildCount++; + curCount = 0; + } + expected.put(values[i], curCount + 1); + } + } + expectedCounts = new ArrayList<>(expected.entrySet()); + + // sort by value + expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); + actual = getAllChildrenSortByValue("field", longValuesFacetCutter, countRecorder); + assertSame( + "id " + minId + "-" + maxId + ", sort facets by value", + expectedCounts, + expectedChildCount, + -2147483648, + // totCount, + actual, + Integer.MAX_VALUE); + + // sort by count + expectedCounts.sort( + (a, b) -> { + int cmp = -Integer.compare(a.getValue(), b.getValue()); + if (cmp == 0) { + // tie break by value + cmp = Long.compare(a.getKey(), b.getKey()); + } + return cmp; + }); + if (random().nextBoolean()) { + topN = docCount; + } else { + topN = random().nextInt(1, docCount); + } + actual = getTopChildren(topN, "field", longValuesFacetCutter, countRecorder); + assertSame( + "id " + minId + "-" + maxId + ", sort facets by count", + expectedCounts, + Math.min(topN, expectedChildCount), + // expectedChildCount, + -2147483648, + // totCount, + actual, + topN); + } + r.close(); + dir.close(); + } + + public void testRandomMultiValued() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + + int docCount = atLeast(1000); + double missingChance = random().nextDouble(); + + // sometimes exercise codec optimizations when a claimed multi valued field is in fact single + // valued: + boolean allSingleValued = rarely(); + long maxValue; + + if (random().nextBoolean()) { + maxValue = random().nextLong() & Long.MAX_VALUE; + } else { + maxValue = random().nextInt(1000); + } + if (VERBOSE) { + System.out.println( + "TEST: valueCount=" + + docCount + + " valueRange=-" + + maxValue + + "-" + + maxValue + + " missingChance=" + + missingChance + + " allSingleValued=" + + allSingleValued); + } + + long[][] values = new long[docCount][]; + for (int i = 0; i < docCount; i++) { + Document doc = new Document(); + doc.add(new IntPoint("id", i)); + if (random().nextDouble() > missingChance) { + if (allSingleValued) { + values[i] = new long[1]; + } else { + values[i] = new long[TestUtil.nextInt(random(), 1, 5)]; + } + + for (int j = 0; j < values[i].length; j++) { + long value = TestUtil.nextLong(random(), -maxValue, maxValue); + values[i][j] = value; + doc.add(new SortedNumericDocValuesField("field", value)); + } + + if (VERBOSE) { + System.out.println(" doc=" + i + " values=" + Arrays.toString(values[i])); + } + + // sort values to enable duplicate detection by comparing with the previous value + Arrays.sort(values[i]); + } else { + if (VERBOSE) { + System.out.println(" doc=" + i + " missing values"); + } + } + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + int iters = atLeast(100); + for (int iter = 0; iter < iters; iter++) { + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + System.out.println(" test all docs"); + } + + // all docs + Map expected = new HashMap<>(); + // int expectedTotalCount = 0; + for (int i = 0; i < docCount; i++) { + if (values[i] != null && values[i].length > 0) { + // 
expectedTotalCount++; + setExpectedFrequencies(values[i], expected); + } + } + + List> expectedCounts = new ArrayList<>(expected.entrySet()); + int expectedChildCount = expected.size(); + + // sort by value + expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); + + LongValueFacetCutter longValuesFacetCutter = new LongValueFacetCutter("field"); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longValuesFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + if (VERBOSE) { + System.out.println(" use doc values"); + } + // TODO: uncomment and adjust when LongValueFacetCutter supports value sources + /*if (random().nextBoolean()) { + facetCounts = new LongValueFacetCounts("field", fc); + } else { + facetCounts = + new LongValueFacetCounts("field", MultiLongValuesSource.fromLongField("field"), fc); + }*/ + + FacetResult actual = getAllChildrenSortByValue("field", longValuesFacetCutter, countRecorder); + assertSame( + "all docs, sort facets by value", + expectedCounts, + expectedChildCount, + -2147483648, + // expectedTotalCount, + actual, + Integer.MAX_VALUE); + + // test getAllChildren + expectedCounts.sort( + Map.Entry.comparingByKey().thenComparingLong(Map.Entry::getValue)); + FacetResult allChildren = getAllChildren("field", longValuesFacetCutter, countRecorder); + // sort labels by value, count in ascending order + Arrays.sort( + allChildren.labelValues, + Comparator.comparing((LabelAndValue a) -> a.label) + .thenComparingLong(a -> a.value.longValue())); + assertSame( + "test getAllChildren", + expectedCounts, + expectedChildCount, + -2147483648, + // expectedTotalCount, + actual, + Integer.MAX_VALUE); + + // sort by count + expectedCounts.sort( + (a, b) -> { + int cmp = -Integer.compare(a.getValue(), b.getValue()); + if (cmp == 0) { + // tie break by value + cmp = Long.compare(a.getKey(), b.getKey()); + } + return cmp; + }); + int topN; + if (random().nextBoolean()) { + topN = docCount; + } else { + topN = random().nextInt(1, docCount); + } + if (VERBOSE) { + System.out.println(" topN=" + topN); + } + actual = getTopChildren(topN, "field", longValuesFacetCutter, countRecorder); + assertSame( + "all docs, sort facets by count", + expectedCounts, + Math.min(topN, expectedChildCount), + // expectedChildCount, + -2147483648, + // expectedTotalCount, + actual, + topN); + + // subset of docs + int minId = random().nextInt(docCount); + int maxId = random().nextInt(docCount); + if (minId > maxId) { + int tmp = minId; + minId = maxId; + maxId = tmp; + } + if (VERBOSE) { + System.out.println(" test id range " + minId + "-" + maxId); + } + + longValuesFacetCutter = new LongValueFacetCutter("field"); + countRecorder = new CountFacetRecorder(); + collectorManager = new FacetFieldCollectorManager<>(longValuesFacetCutter, countRecorder); + s.search(IntPoint.newRangeQuery("id", minId, maxId), collectorManager); + // TODO: uncomment and adjust when LongValueFacetCutter supports value sources + /*if (random().nextBoolean()) { + facetCounts = new LongValueFacetCounts("field", fc); + } else { + facetCounts = + new LongValueFacetCounts("field", MultiLongValuesSource.fromLongField("field"), fc); + }*/ + + expected = new HashMap<>(); + // expectedTotalCount = 0; + for (int i = minId; i <= maxId; i++) { + if (values[i] != null && values[i].length > 0) { + // expectedTotalCount++; + setExpectedFrequencies(values[i], expected); + } + } + expectedCounts = new 
ArrayList<>(expected.entrySet()); + expectedChildCount = expected.size(); + + // sort by value + expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); + actual = getAllChildrenSortByValue("field", longValuesFacetCutter, countRecorder); + assertSame( + "id " + minId + "-" + maxId + ", sort facets by value", + expectedCounts, + expectedChildCount, + -2147483648, + // expectedTotalCount, + actual, + Integer.MAX_VALUE); + + // sort by count + expectedCounts.sort( + (a, b) -> { + int cmp = -Integer.compare(a.getValue(), b.getValue()); + if (cmp == 0) { + // tie break by value + cmp = Long.compare(a.getKey(), b.getKey()); + } + return cmp; + }); + if (random().nextBoolean()) { + topN = docCount; + } else { + topN = random().nextInt(1, docCount); + } + actual = getTopChildren(topN, "field", longValuesFacetCutter, countRecorder); + assertSame( + "id " + minId + "-" + maxId + ", sort facets by count", + expectedCounts, + Math.min(expectedChildCount, topN), + // expectedChildCount, + -2147483648, + // expectedTotalCount, + actual, + topN); + } + r.close(); + dir.close(); + } + + private void setExpectedFrequencies(long[] values, Map expected) { + long previousValue = 0; + for (int j = 0; j < values.length; j++) { + if (j == 0 || previousValue != values[j]) { + Integer curCount = expected.getOrDefault(values[j], 0); + expected.put(values[j], curCount + 1); + } + previousValue = values[j]; + } + } + + private static void assertSame( + String desc, + List> expectedCounts, + int expectedChildCount, + int expectedTotalCount, + FacetResult actual, + int topN) { + int expectedTopN = Math.min(topN, expectedCounts.size()); + if (VERBOSE) { + System.out.println(" expected topN=" + expectedTopN); + for (int i = 0; i < expectedTopN; i++) { + System.out.println( + " " + + i + + ": value=" + + expectedCounts.get(i).getKey() + + " count=" + + expectedCounts.get(i).getValue()); + } + System.out.println(" actual topN=" + actual.labelValues.length); + for (int i = 0; i < actual.labelValues.length; i++) { + System.out.println( + " " + + i + + ": value=" + + actual.labelValues[i].label + + " count=" + + actual.labelValues[i].value); + } + } + assertEquals(desc + ": topN", expectedTopN, actual.labelValues.length); + assertEquals(desc + ": childCount", expectedChildCount, actual.childCount); + assertEquals(desc + ": totCount", expectedTotalCount, actual.value.intValue()); + assertTrue(actual.labelValues.length <= topN); + + for (int i = 0; i < expectedTopN; i++) { + assertEquals( + desc + ": label[" + i + "]", + Long.toString(expectedCounts.get(i).getKey()), + actual.labelValues[i].label); + assertEquals( + desc + ": counts[" + i + "]", + expectedCounts.get(i).getValue().intValue(), + actual.labelValues[i].value.intValue()); + } + } + + /** + * LUCENE-9964: Duplicate long values in a document field should only be counted once when using + * SortedNumericDocValuesFields + */ + public void testDuplicateLongValues() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + // these two values are not unique in a document + doc.add(new SortedNumericDocValuesField("field", 42)); + doc.add(new SortedNumericDocValuesField("field", 42)); + w.addDocument(doc); + doc = new Document(); + doc.add(new SortedNumericDocValuesField("field", 43)); + doc.add(new SortedNumericDocValuesField("field", 43)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + w.close(); + IndexSearcher s = newSearcher(r); + LongValueFacetCutter 
longValuesFacetCutter = new LongValueFacetCutter("field"); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longValuesFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + + FacetResult fr = getAllChildrenSortByValue("field", longValuesFacetCutter, countRecorder); + for (LabelAndValue labelAndValue : fr.labelValues) { + assert labelAndValue.value.equals(1); + } + + assertFacetResult( + getAllChildren("field", longValuesFacetCutter, countRecorder), + "field", + new String[0], + 2, + -2147483648, + new LabelAndValue("42", 1), + new LabelAndValue("43", 1)); + + r.close(); + dir.close(); + } + + /** + * Get all results sorted by value, similar to {@link + * LongValueFacetCounts#getAllChildrenSortByValue()} + */ + private FacetResult getAllChildrenSortByValue( + String fieldName, + LongValueFacetCutter longValuesFacetCutter, + CountFacetRecorder countRecorder) + throws IOException { + int[] resultOrdinals = countRecorder.recordedOrds().toArray(); + ComparableSupplier comparableSupplier = + ComparableUtils.byLongValue(longValuesFacetCutter); + + ComparableUtils.sort(resultOrdinals, comparableSupplier); + + FacetLabel[] labels = longValuesFacetCutter.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + int childCount = 0; + for (int i = 0; i < resultOrdinals.length; i++) { + int count = countRecorder.getCount(resultOrdinals[i]); + labelsAndValues.add(new LabelAndValue(labels[i].lastComponent(), count)); + childCount++; + } + // int value = countFacetRecorder.getCount(parentOrdinal); + return new FacetResult( + fieldName, + new String[0], + VALUE_CANT_BE_COMPUTED, + labelsAndValues.toArray(new LabelAndValue[0]), + childCount); + } + + /** + * Get top results sorted by count with tie-break by value, similar to {@link + * LongValueFacetCounts#getTopChildren(int, String, String...)} + */ + private FacetResult getTopChildren( + int topN, + String field, + LongValueFacetCutter longValuesFacetCutter, + CountFacetRecorder countRecorder) + throws IOException { + ComparableSupplier comparableSupplier = + ComparableUtils.byCount(countRecorder, longValuesFacetCutter); + + OrdinalIterator topByCountOrds = + new TopnOrdinalIterator<>(countRecorder.recordedOrds(), comparableSupplier, topN); + + int[] resultOrdinals = topByCountOrds.toArray(); + + FacetLabel[] labels = longValuesFacetCutter.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + int childCount = 0; + for (int i = 0; i < resultOrdinals.length; i++) { + int count = countRecorder.getCount(resultOrdinals[i]); + labelsAndValues.add(new LabelAndValue(labels[i].lastComponent(), count)); + childCount++; + } + // int value = countFacetRecorder.getCount(parentOrdinal); + return new FacetResult( + field, + new String[0], + VALUE_CANT_BE_COMPUTED, + labelsAndValues.toArray(new LabelAndValue[0]), + childCount); + } + + /** + * Get all results in no particular order, similar to {@link + * LongValueFacetCounts#getAllChildren(String, String...)} + */ + private FacetResult getAllChildren( + String field, LongValueFacetCutter longValuesFacetCutter, CountFacetRecorder countRecorder) + throws IOException { + int[] resultOrdinals = countRecorder.recordedOrds().toArray(); + + FacetLabel[] labels = longValuesFacetCutter.getLabels(resultOrdinals); + List labelsAndValues = new ArrayList<>(labels.length); + int childCount = 0; + for (int i = 0; i < resultOrdinals.length; i++) { + 
int count = countRecorder.getCount(resultOrdinals[i]); + labelsAndValues.add(new LabelAndValue(labels[i].lastComponent(), count)); + childCount++; + } + // int value = countFacetRecorder.getCount(parentOrdinal); + return new FacetResult( + field, + new String[0], + VALUE_CANT_BE_COMPUTED, + labelsAndValues.toArray(new LabelAndValue[0]), + childCount); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestRangeFacet.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestRangeFacet.java new file mode 100644 index 000000000000..94b3281847ed --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestRangeFacet.java @@ -0,0 +1,1638 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet; + +import static org.apache.lucene.facet.FacetsConfig.DEFAULT_INDEX_FIELD_NAME; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.List; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoubleDocValuesField; +import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.facet.DrillDownQuery; +import org.apache.lucene.facet.DrillSideways; +import org.apache.lucene.facet.FacetField; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.MultiDoubleValuesSource; +import org.apache.lucene.facet.MultiLongValuesSource; +import org.apache.lucene.facet.range.DoubleRange; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.facet.range.Range; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.sandbox.facet.cutters.TaxonomyFacetsCutter; +import org.apache.lucene.sandbox.facet.cutters.ranges.DoubleRangeFacetCutter; +import org.apache.lucene.sandbox.facet.cutters.ranges.LongRangeFacetCutter; +import org.apache.lucene.sandbox.facet.labels.OrdToLabel; +import org.apache.lucene.sandbox.facet.labels.RangeOrdToLabel; +import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; +import org.apache.lucene.search.DoubleValues; +import org.apache.lucene.search.DoubleValuesSource; +import org.apache.lucene.search.Explanation; +import 
org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LongValuesSource; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiCollectorManager; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.search.DummyTotalHitCountCollector; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.NumericUtils; + +/** + * Test sandbox facet ranges. Mostly test cases from LongRangeFacetCounts adopted for sandbox + * faceting. + */ +public class TestRangeFacet extends SandboxFacetTestCase { + + public void testBasicLong() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + NumericDocValuesField field = new NumericDocValuesField("field", 0L); + doc.add(field); + for (long l = 0; l < 100; l++) { + field.setLongValue(l); + w.addDocument(doc); + } + + // Also add Long.MAX_VALUE + field.setLongValue(Long.MAX_VALUE); + w.addDocument(doc); + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + LongRange[] inputRanges = + new LongRange[] { + new LongRange("less than 10", 0L, true, 10L, false), + new LongRange("less than or equal to 10", 0L, true, 10L, true), + new LongRange("over 90", 90L, false, 100L, false), + new LongRange("90 or above", 90L, true, 100L, false), + new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, true), + }; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (1)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + r.close(); + d.close(); + } + + private int[] getRangeOrdinals(Range[] inputRanges) { + // Naive method to get a list of facet ordinals for range facets, + // it is used to get all range ordinals, including the ones that didn't match any docs. 
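    // Side note, a minimal sketch (assumes range cutters assign ordinal i to inputRanges[i],
    // which is also what RangeOrdToLabel relies on when labels are resolved in the assertions):
    // if only the ranges that matched at least one document were wanted, the recorder's own
    // ordinals could be used instead, e.g.
    //   int[] matchedOnly = countRecorder.recordedOrds().toArray(); // skips empty ranges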
+ int[] result = new int[inputRanges.length]; + for (int i = 0; i < inputRanges.length; i++) { + result[i] = i; + } + return result; + } + + public void testBasicLongMultiValued() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + // just index the same value twice each time and make sure we don't double count + SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L); + SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L); + doc.add(field1); + doc.add(field2); + for (long l = 100; l < 200; l++) { + field1.setLongValue(l); + // Make second value sometimes smaller, sometimes bigger, and sometimes equal + if (l % 3 == 0) { + field2.setLongValue(l - 100); + } else if (l % 3 == 1) { + field2.setLongValue(l + 100); + } else { + field2.setLongValue(l); + } + w.addDocument(doc); + } + + // Also add Long.MAX_VALUE + field1.setLongValue(Long.MAX_VALUE); + field2.setLongValue(Long.MAX_VALUE); + w.addDocument(doc); + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + ////////// Not overlapping ranges + LongRange[] inputRanges = + new LongRange[] { + new LongRange("110-120", 110L, true, 120L, true), + new LongRange("121-130", 121L, true, 130L, true), + }; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=2\n" + + " 110-120 (11)\n" + + " 121-130 (10)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + ///////// Overlapping ranges + inputRanges = + new LongRange[] { + new LongRange("110-120", 110L, true, 120L, true), + new LongRange("115-125", 115L, true, 125L, true), + }; + + valuesSource = MultiLongValuesSource.fromLongField("field"); + longRangeFacetCutter = LongRangeFacetCutter.create(valuesSource, inputRanges); + countRecorder = new CountFacetRecorder(); + + collectorManager = new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=2\n" + + " 110-120 (11)\n" + + " 115-125 (11)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + ////////// Multiple ranges (similar to original test) + inputRanges = + new LongRange[] { + new LongRange("[100-110)", 100L, true, 110L, false), + new LongRange("[100-110]", 100L, true, 110L, true), + new LongRange("(190-200)", 190L, false, 200L, false), + new LongRange("[190-200]", 190L, true, 200L, false), + new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, true) + }; + + valuesSource = MultiLongValuesSource.fromLongField("field"); + longRangeFacetCutter = LongRangeFacetCutter.create(valuesSource, inputRanges); + countRecorder = new CountFacetRecorder(); + + collectorManager = new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), 
collectorManager); + ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n" + + " [100-110) (10)\n" + + " [100-110] (11)\n" + + " (190-200) (9)\n" + + " [190-200] (10)\n" + + " over 1000 (1)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + r.close(); + d.close(); + } + + public void testBasicLongMultiValuedMixedSegmentTypes() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L); + SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L); + // write docs as two segments (50 in each). the first segment will contain a mix of single- and + // multi-value cases, while the second segment will be all single values. + for (int l = 0; l < 100; l++) { + field1.setLongValue(l); + field2.setLongValue(l); + Document doc = new Document(); + doc.add(field1); + if (l == 0) { + doc.add(field2); + } else if (l < 50) { + if (random().nextBoolean()) { + doc.add(field2); + } + } + w.addDocument(doc); + if (l == 50) { + w.commit(); + } + } + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + LongRange[] inputRanges = + new LongRange[] { + new LongRange("less than 10", 0L, true, 10L, false), + new LongRange("less than or equal to 10", 0L, true, 10L, true), + new LongRange("over 90", 90L, false, 100L, false), + new LongRange("90 or above", 90L, true, 100L, false), + new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, true) + }; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + r.close(); + d.close(); + } + + public void testLongMinMax() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + NumericDocValuesField field = new NumericDocValuesField("field", 0L); + doc.add(field); + field.setLongValue(Long.MIN_VALUE); + w.addDocument(doc); + field.setLongValue(0); + w.addDocument(doc); + field.setLongValue(Long.MAX_VALUE); + w.addDocument(doc); + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + LongRange[] inputRanges = + new LongRange[] { + new LongRange("min", Long.MIN_VALUE, true, Long.MIN_VALUE, true), + new LongRange("max", Long.MAX_VALUE, true, Long.MAX_VALUE, true), + new LongRange("all0", Long.MIN_VALUE, true, Long.MAX_VALUE, true), + new LongRange("all1", Long.MIN_VALUE, false, Long.MAX_VALUE, true), + new LongRange("all2", Long.MIN_VALUE, true, Long.MAX_VALUE, false), + new LongRange("all3", Long.MIN_VALUE, false, Long.MAX_VALUE, false) + }; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + 
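    // Counts asserted below, spelled out (three indexed values: Long.MIN_VALUE, 0, Long.MAX_VALUE):
    // all0 is inclusive on both ends and matches all 3 docs; all1 excludes Long.MIN_VALUE (2);
    // all2 excludes Long.MAX_VALUE (2); all3 excludes both endpoints and matches only the 0 doc (1).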
LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=6\n min (1)\n max (1)\n all0 (3)\n all1 (2)\n all2 (2)\n all3 (1)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + r.close(); + d.close(); + } + + public void testOverlappedEndStart() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + NumericDocValuesField field = new NumericDocValuesField("field", 0L); + doc.add(field); + for (long l = 0; l < 100; l++) { + field.setLongValue(l); + w.addDocument(doc); + } + field.setLongValue(Long.MAX_VALUE); + w.addDocument(doc); + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + LongRange[] inputRanges = + new LongRange[] { + new LongRange("0-10", 0L, true, 10L, true), + new LongRange("10-20", 10L, true, 20L, true), + new LongRange("20-30", 20L, true, 30L, true), + new LongRange("30-40", 30L, true, 40L, true) + }; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=4\n 0-10 (11)\n 10-20 (11)\n 20-30 (11)\n 30-40 (11)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + r.close(); + d.close(); + } + + public void testEmptyRangesSingleValued() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + NumericDocValuesField field = new NumericDocValuesField("field", 0L); + doc.add(field); + for (long l = 0; l < 100; l++) { + field.setLongValue(l); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + LongRange[] inputRanges = new LongRange[0]; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=0\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + r.close(); + d.close(); + } + + public void testEmptyRangesMultiValued() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new 
Document(); + SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L); + SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L); + doc.add(field1); + doc.add(field2); + for (long l = 0; l < 100; l++) { + field1.setLongValue(l); + field2.setLongValue(l); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + LongRange[] inputRanges = new LongRange[0]; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=0\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + r.close(); + d.close(); + } + + /** + * Tests single request that mixes Range and non-Range faceting, with DrillSideways and taxonomy. + */ + public void testMixedRangeAndNonRangeTaxonomy() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Directory td = newDirectory(); + DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE); + + FacetsConfig config = new FacetsConfig(); + + for (long l = 0; l < 100; l++) { + Document doc = new Document(); + // For computing range facet counts: + doc.add(new NumericDocValuesField("field", l)); + // For drill down by numeric range: + doc.add(new LongPoint("field", l)); + + if ((l & 3) == 0) { + doc.add(new FacetField("dim", "a")); + } else { + doc.add(new FacetField("dim", "b")); + } + w.addDocument(config.build(tw, doc)); + } + + final IndexReader r = w.getReader(); + final TaxonomyReader tr = new DirectoryTaxonomyReader(tw); + + IndexSearcher s = newSearcher(r, false, false, Concurrency.INTER_SEGMENT); + // DrillSideways requires the entire range of docs to be scored at once, so it doesn't support + // timeouts whose implementation scores one window of doc IDs at a time. 
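    // Hedged illustration only: with any timeout in place, for example
    //   s.setTimeout(new QueryTimeoutImpl(1000)); // org.apache.lucene.index.QueryTimeoutImpl, ~1s budget
    // scoring happens in windows of doc IDs, which conflicts with the requirement above, so the
    // test clears whatever timeout the test framework may have assigned: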
+ s.setTimeout(null); + + if (VERBOSE) { + System.out.println("TEST: searcher=" + s); + } + + DrillSideways ds = + new DrillSideways(s, config, tr) { + @Override + protected boolean scoreSubDocsAtOnce() { + return random().nextBoolean(); + } + }; + + // Data for range facets + LongRange[] inputRanges = + new LongRange[] { + new LongRange("less than 10", 0L, true, 10L, false), + new LongRange("less than or equal to 10", 0L, true, 10L, true), + new LongRange("over 90", 90L, false, 100L, false), + new LongRange("90 or above", 90L, true, 100L, false), + new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, false) + }; + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter fieldCutter = LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder fieldCountRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager fieldCollectorManager = + new FacetFieldCollectorManager<>(fieldCutter, fieldCountRecorder); + OrdToLabel fieldOrdToLabel = new RangeOrdToLabel(inputRanges); + + // Data for taxonomy facets + TaxonomyFacetsCutter dimCutter = new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, tr); + CountFacetRecorder dimCountRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager dimCollectorManager = + new FacetFieldCollectorManager<>(dimCutter, dimCountRecorder); + + MultiCollectorManager collectorManager = + new MultiCollectorManager(fieldCollectorManager, dimCollectorManager); + + ////// First search, no drill-downs: + DrillDownQuery ddq = new DrillDownQuery(config); + ds.search(ddq, collectorManager, List.of()); + + // assertEquals(100, dsr.hits.totalHits.value()); + assertEquals( + "dim=dim path=[] value=-2147483648 childCount=2\n b (75)\n a (25)\n", + getTopChildrenByCount(dimCountRecorder, tr, 10, "dim").toString()); + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), fieldCountRecorder, "field", fieldOrdToLabel) + .toString()); + + ////// Second search, drill down on dim=b: + fieldCountRecorder = new CountFacetRecorder(); + fieldCollectorManager = new FacetFieldCollectorManager<>(fieldCutter, fieldCountRecorder); + dimCountRecorder = new CountFacetRecorder(); + dimCollectorManager = new FacetFieldCollectorManager<>(dimCutter, dimCountRecorder); + ddq = new DrillDownQuery(config); + ddq.add("dim", "b"); + ds.search(ddq, fieldCollectorManager, List.of(dimCollectorManager)); + + // assertEquals(75, dsr.hits.totalHits.value()); + assertEquals( + "dim=dim path=[] value=-2147483648 childCount=2\n b (75)\n a (25)\n", + getTopChildrenByCount(dimCountRecorder, tr, 10, "dim").toString()); + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (7)\n less than or equal to 10 (8)\n over 90 (7)\n 90 or above (8)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), fieldCountRecorder, "field", fieldOrdToLabel) + .toString()); + + ////// Third search, drill down on "less than or equal to 10": + fieldCountRecorder = new CountFacetRecorder(); + fieldCollectorManager = new FacetFieldCollectorManager<>(fieldCutter, fieldCountRecorder); + dimCountRecorder = new CountFacetRecorder(); + dimCollectorManager = new FacetFieldCollectorManager<>(dimCutter, dimCountRecorder); + ddq = new DrillDownQuery(config); + ddq.add("field", LongPoint.newRangeQuery("field", 0L, 10L)); + ds.search(ddq, dimCollectorManager, 
List.of(fieldCollectorManager)); + + // assertEquals(11, dsr.hits.totalHits.value()); + assertEquals( + "dim=dim path=[] value=-2147483648 childCount=2\n b (8)\n a (3)\n", + getTopChildrenByCount(dimCountRecorder, tr, 10, "dim").toString()); + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), fieldCountRecorder, "field", fieldOrdToLabel) + .toString()); + + w.close(); + IOUtils.close(tw, tr, td, r, d); + } + + public void testBasicDouble() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + DoubleDocValuesField field = new DoubleDocValuesField("field", 0.0); + doc.add(field); + for (int i = 0; i < 100; i++) { + field.setDoubleValue(i); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + + IndexSearcher s = newSearcher(r); + DoubleRange[] inputRanges = + new DoubleRange[] { + new DoubleRange("less than 10", 0.0, true, 10.0, false), + new DoubleRange("less than or equal to 10", 0.0, true, 10.0, true), + new DoubleRange("over 90", 90.0, false, 100.0, false), + new DoubleRange("90 or above", 90.0, true, 100.0, false), + new DoubleRange("over 1000", 1000.0, false, Double.POSITIVE_INFINITY, false) + }; + + MultiDoubleValuesSource valuesSource = MultiDoubleValuesSource.fromDoubleField("field"); + DoubleRangeFacetCutter doubleRangeFacetCutter = + new DoubleRangeFacetCutter(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + w.close(); + IOUtils.close(r, d); + } + + public void testBasicDoubleMultiValued() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + // index the same value twice and make sure we don't double count + SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0); + SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0); + doc.add(field1); + doc.add(field2); + for (int i = 0; i < 100; i++) { + field1.setLongValue(NumericUtils.doubleToSortableLong(i)); + field2.setLongValue(NumericUtils.doubleToSortableLong(i)); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + + IndexSearcher s = newSearcher(r); + DoubleRange[] inputRanges = + new DoubleRange[] { + new DoubleRange("less than 10", 0.0, true, 10.0, false), + new DoubleRange("less than or equal to 10", 0.0, true, 10.0, true), + new DoubleRange("over 90", 90.0, false, 100.0, false), + new DoubleRange("90 or above", 90.0, true, 100.0, false), + new DoubleRange("over 1000", 1000.0, false, Double.POSITIVE_INFINITY, false) + }; + + MultiDoubleValuesSource valuesSource = MultiDoubleValuesSource.fromDoubleField("field"); + DoubleRangeFacetCutter doubleRangeFacetCutter = + new DoubleRangeFacetCutter(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new 
CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + w.close(); + IOUtils.close(r, d); + } + + public void testBasicDoubleMultiValuedMixedSegmentTypes() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L); + SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L); + // write docs as two segments (50 in each). the first segment will contain a mix of single- and + // multi-value cases, while the second segment will be all single values. + for (int l = 0; l < 100; l++) { + field1.setLongValue(NumericUtils.doubleToSortableLong(l)); + field2.setLongValue(NumericUtils.doubleToSortableLong(l)); + Document doc = new Document(); + doc.add(field1); + if (l == 0) { + doc.add(field2); + } else if (l < 50) { + if (random().nextBoolean()) { + doc.add(field2); + } + } + w.addDocument(doc); + if (l == 50) { + w.commit(); + } + } + + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + DoubleRange[] inputRanges = + new DoubleRange[] { + new DoubleRange("less than 10", 0.0, true, 10.0, false), + new DoubleRange("less than or equal to 10", 0.0, true, 10.0, true), + new DoubleRange("over 90", 90.0, false, 100.0, false), + new DoubleRange("90 or above", 90.0, true, 100.0, false), + new DoubleRange("over 1000", 1000.0, false, Double.POSITIVE_INFINITY, false) + }; + + MultiDoubleValuesSource valuesSource = MultiDoubleValuesSource.fromDoubleField("field"); + DoubleRangeFacetCutter doubleRangeFacetCutter = + new DoubleRangeFacetCutter(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + r.close(); + d.close(); + } + + public void testRandomLongsSingleValued() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + + int numDocs = atLeast(1000); + if (VERBOSE) { + System.out.println("TEST: numDocs=" + numDocs); + } + long[] values = new long[numDocs]; + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + long v = random().nextLong(); + values[i] = v; + doc.add(new NumericDocValuesField("field", v)); + doc.add(new LongPoint("field", v)); + w.addDocument(doc); + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + } + IndexReader r = w.getReader(); + + IndexSearcher s = newSearcher(r, false); + FacetsConfig config = new 
FacetsConfig(); + + int numIters = atLeast(10); + for (int iter = 0; iter < numIters; iter++) { + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + int numRange = TestUtil.nextInt(random(), 1, 100); + LongRange[] ranges = new LongRange[numRange]; + int[] expectedCounts = new int[numRange]; + long minAcceptedValue = Long.MAX_VALUE; + long maxAcceptedValue = Long.MIN_VALUE; + for (int rangeID = 0; rangeID < numRange; rangeID++) { + long min; + if (rangeID > 0 && random().nextInt(10) == 7) { + // Use an existing boundary: + LongRange prevRange = ranges[random().nextInt(rangeID)]; + if (random().nextBoolean()) { + min = prevRange.min; + } else { + min = prevRange.max; + } + } else { + min = random().nextLong(); + } + long max; + if (rangeID > 0 && random().nextInt(10) == 7) { + // Use an existing boundary: + LongRange prevRange = ranges[random().nextInt(rangeID)]; + if (random().nextBoolean()) { + max = prevRange.min; + } else { + max = prevRange.max; + } + } else { + max = random().nextLong(); + } + + if (min > max) { + long x = min; + min = max; + max = x; + } + boolean minIncl; + boolean maxIncl; + + // NOTE: max - min >= 0 is here to handle the common overflow case! + if (max - min >= 0 && max - min < 2) { + // If max == min or max == min+1, we always do inclusive, else we might pass an empty + // range and hit exc from LongRange's ctor: + minIncl = true; + maxIncl = true; + } else { + minIncl = random().nextBoolean(); + maxIncl = random().nextBoolean(); + } + ranges[rangeID] = new LongRange("r" + rangeID, min, minIncl, max, maxIncl); + if (VERBOSE) { + System.out.println(" range " + rangeID + ": " + ranges[rangeID]); + } + + // Do "slow but hopefully correct" computation of + // expected count: + for (int i = 0; i < numDocs; i++) { + boolean accept = true; + if (minIncl) { + accept &= values[i] >= min; + } else { + accept &= values[i] > min; + } + if (maxIncl) { + accept &= values[i] <= max; + } else { + accept &= values[i] < max; + } + if (accept) { + expectedCounts[rangeID]++; + minAcceptedValue = Math.min(minAcceptedValue, values[i]); + maxAcceptedValue = Math.max(maxAcceptedValue, values[i]); + } + } + } + + // TODO: fastMatchQuery functionality is not implemented for sandbox faceting yet, do we need + // it? 
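      // Aside on the overflow guard in the range-construction loop above, worked through:
      // with min = Long.MIN_VALUE and max = Long.MAX_VALUE, max - min wraps around to -1, so the
      // "max - min >= 0" check fails and inclusiveness stays random; with min = 5 and max = 6,
      // max - min == 1 < 2, so both endpoints are forced inclusive and LongRange's ctor never
      // sees an empty range.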
+ /*Query fastMatchQuery; + if (random().nextBoolean()) { + if (random().nextBoolean()) { + fastMatchQuery = LongPoint.newRangeQuery("field", minValue, maxValue); + } else { + fastMatchQuery = LongPoint.newRangeQuery("field", minAcceptedValue, maxAcceptedValue); + } + } else { + fastMatchQuery = null; + }*/ + + final MultiLongValuesSource mvs; + if (random().nextBoolean()) { + LongValuesSource vs = LongValuesSource.fromLongField("field"); + mvs = MultiLongValuesSource.fromSingleValued(vs); + } else { + mvs = MultiLongValuesSource.fromLongField("field"); + } + + LongRangeFacetCutter longRangeFacetCutter = LongRangeFacetCutter.create(mvs, ranges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + + OrdToLabel ordToLabel = new RangeOrdToLabel(ranges); + FacetResult result = + getAllSortByOrd(getRangeOrdinals(ranges), countRecorder, "field", ordToLabel); + assertEquals(numRange, result.labelValues.length); + for (int rangeID = 0; rangeID < numRange; rangeID++) { + if (VERBOSE) { + System.out.println(" range " + rangeID + " expectedCount=" + expectedCounts[rangeID]); + } + LabelAndValue subNode = result.labelValues[rangeID]; + assertEquals("r" + rangeID, subNode.label); + assertEquals(expectedCounts[rangeID], subNode.value.intValue()); + + LongRange range = ranges[rangeID]; + + // Test drill-down: + DrillDownQuery ddq = new DrillDownQuery(config); + ddq.add("field", LongPoint.newRangeQuery("field", range.min, range.max)); + assertEquals(expectedCounts[rangeID], s.count(ddq)); + } + } + + w.close(); + IOUtils.close(r, dir); + } + + public void testRandomLongsMultiValued() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + + int numDocs = atLeast(1000); + if (VERBOSE) { + System.out.println("TEST: numDocs=" + numDocs); + } + long[][] values = new long[numDocs][]; + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + int numVals = RandomNumbers.randomIntBetween(random(), 1, 50); + if (random().nextInt(10) == 0) { + numVals = 1; // make sure we have ample testing of single-value cases + } + values[i] = new long[numVals]; + for (int j = 0; j < numVals; j++) { + long v = random().nextLong(); + values[i][j] = v; + doc.add(new SortedNumericDocValuesField("field", v)); + doc.add(new LongPoint("field", v)); + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + } + w.addDocument(doc); + } + IndexReader r = w.getReader(); + + IndexSearcher s = newSearcher(r, false); + FacetsConfig config = new FacetsConfig(); + + int numIters = atLeast(10); + for (int iter = 0; iter < numIters; iter++) { + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + int numRange = TestUtil.nextInt(random(), 1, 100); + LongRange[] ranges = new LongRange[numRange]; + int[] expectedCounts = new int[numRange]; + long minAcceptedValue = Long.MAX_VALUE; + long maxAcceptedValue = Long.MIN_VALUE; + for (int rangeID = 0; rangeID < numRange; rangeID++) { + long min; + if (rangeID > 0 && random().nextInt(10) == 7) { + // Use an existing boundary: + LongRange prevRange = ranges[random().nextInt(rangeID)]; + if (random().nextBoolean()) { + min = prevRange.min; + } else { + min = prevRange.max; + } + } else { + min = random().nextLong(); + } + long max; + if (rangeID > 0 && 
random().nextInt(10) == 7) { + // Use an existing boundary: + LongRange prevRange = ranges[random().nextInt(rangeID)]; + if (random().nextBoolean()) { + max = prevRange.min; + } else { + max = prevRange.max; + } + } else { + max = random().nextLong(); + } + + if (min > max) { + long x = min; + min = max; + max = x; + } + boolean minIncl; + boolean maxIncl; + + // NOTE: max - min >= 0 is here to handle the common overflow case! + if (max - min >= 0 && max - min < 2) { + // If max == min or max == min+1, we always do inclusive, else we might pass an empty + // range and hit exc from LongRange's ctor: + minIncl = true; + maxIncl = true; + } else { + minIncl = random().nextBoolean(); + maxIncl = random().nextBoolean(); + } + ranges[rangeID] = new LongRange("r" + rangeID, min, minIncl, max, maxIncl); + if (VERBOSE) { + System.out.println(" range " + rangeID + ": " + ranges[rangeID]); + } + + // Do "slow but hopefully correct" computation of + // expected count: + for (int i = 0; i < numDocs; i++) { + for (int j = 0; j < values[i].length; j++) { + boolean accept = true; + if (minIncl) { + accept &= values[i][j] >= min; + } else { + accept &= values[i][j] > min; + } + if (maxIncl) { + accept &= values[i][j] <= max; + } else { + accept &= values[i][j] < max; + } + if (accept) { + expectedCounts[rangeID]++; + minAcceptedValue = Math.min(minAcceptedValue, values[i][j]); + maxAcceptedValue = Math.max(maxAcceptedValue, values[i][j]); + break; // ensure each doc can contribute at most 1 count to each range + } + } + } + } + + // TODO: fastMatchQuery functionality is not implemented for sandbox faceting yet, do we need + // it? + /*Query fastMatchQuery; + if (random().nextBoolean()) { + if (random().nextBoolean()) { + fastMatchQuery = LongPoint.newRangeQuery("field", minValue, maxValue); + } else { + fastMatchQuery = LongPoint.newRangeQuery("field", minAcceptedValue, maxAcceptedValue); + } + } else { + fastMatchQuery = null; + }*/ + final MultiLongValuesSource mvs = MultiLongValuesSource.fromLongField("field"); + + LongRangeFacetCutter longRangeFacetCutter = LongRangeFacetCutter.create(mvs, ranges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + + OrdToLabel ordToLabel = new RangeOrdToLabel(ranges); + FacetResult result = + getAllSortByOrd(getRangeOrdinals(ranges), countRecorder, "field", ordToLabel); + assertEquals(numRange, result.labelValues.length); + for (int rangeID = 0; rangeID < numRange; rangeID++) { + if (VERBOSE) { + System.out.println(" range " + rangeID + " expectedCount=" + expectedCounts[rangeID]); + } + LabelAndValue subNode = result.labelValues[rangeID]; + assertEquals("r" + rangeID, subNode.label); + assertEquals(expectedCounts[rangeID], subNode.value.intValue()); + + LongRange range = ranges[rangeID]; + + // Test drill-down: + DrillDownQuery ddq = new DrillDownQuery(config); + if (random().nextBoolean()) { + ddq.add("field", LongPoint.newRangeQuery("field", range.min, range.max)); + } else { + ddq.add( + "field", + SortedNumericDocValuesField.newSlowRangeQuery("field", range.min, range.max)); + } + assertEquals(expectedCounts[rangeID], s.count(ddq)); + } + } + + w.close(); + IOUtils.close(r, dir); + } + + public void testRandomDoublesSingleValued() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + + int numDocs = 
atLeast(1000); + double[] values = new double[numDocs]; + double minValue = Double.POSITIVE_INFINITY; + double maxValue = Double.NEGATIVE_INFINITY; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + double v = random().nextDouble(); + values[i] = v; + doc.add(new DoubleDocValuesField("field", v)); + doc.add(new DoublePoint("field", v)); + w.addDocument(doc); + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + } + IndexReader r = w.getReader(); + + IndexSearcher s = newSearcher(r, false); + FacetsConfig config = new FacetsConfig(); + + int numIters = atLeast(10); + for (int iter = 0; iter < numIters; iter++) { + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + int numRange = TestUtil.nextInt(random(), 1, 5); + DoubleRange[] ranges = new DoubleRange[numRange]; + int[] expectedCounts = new int[numRange]; + double minAcceptedValue = Double.POSITIVE_INFINITY; + double maxAcceptedValue = Double.NEGATIVE_INFINITY; + for (int rangeID = 0; rangeID < numRange; rangeID++) { + double min; + if (rangeID > 0 && random().nextInt(10) == 7) { + // Use an existing boundary: + DoubleRange prevRange = ranges[random().nextInt(rangeID)]; + if (random().nextBoolean()) { + min = prevRange.min; + } else { + min = prevRange.max; + } + } else { + min = random().nextDouble(); + } + double max; + if (rangeID > 0 && random().nextInt(10) == 7) { + // Use an existing boundary: + DoubleRange prevRange = ranges[random().nextInt(rangeID)]; + if (random().nextBoolean()) { + max = prevRange.min; + } else { + max = prevRange.max; + } + } else { + max = random().nextDouble(); + } + + if (min > max) { + double x = min; + min = max; + max = x; + } + + boolean minIncl; + boolean maxIncl; + + long minAsLong = NumericUtils.doubleToSortableLong(min); + long maxAsLong = NumericUtils.doubleToSortableLong(max); + // NOTE: maxAsLong - minAsLong >= 0 is here to handle the common overflow case! + if (maxAsLong - minAsLong >= 0 && maxAsLong - minAsLong < 2) { + minIncl = true; + maxIncl = true; + } else { + minIncl = random().nextBoolean(); + maxIncl = random().nextBoolean(); + } + ranges[rangeID] = new DoubleRange("r" + rangeID, min, minIncl, max, maxIncl); + + // Do "slow but hopefully correct" computation of + // expected count: + for (int i = 0; i < numDocs; i++) { + boolean accept = true; + if (minIncl) { + accept &= values[i] >= min; + } else { + accept &= values[i] > min; + } + if (maxIncl) { + accept &= values[i] <= max; + } else { + accept &= values[i] < max; + } + if (accept) { + expectedCounts[rangeID]++; + minAcceptedValue = Math.min(minAcceptedValue, values[i]); + maxAcceptedValue = Math.max(maxAcceptedValue, values[i]); + } + } + } + + // TODO: fastMatchQuery functionality is not implemented for sandbox faceting yet, do we need + // it? 
+ /*Query fastMatchFilter; + if (random().nextBoolean()) { + if (random().nextBoolean()) { + fastMatchFilter = DoublePoint.newRangeQuery("field", minValue, maxValue); + } else { + fastMatchFilter = DoublePoint.newRangeQuery("field", minAcceptedValue, maxAcceptedValue); + } + } else { + fastMatchFilter = null; + }*/ + + final MultiDoubleValuesSource mvs; + if (random().nextBoolean()) { + DoubleValuesSource vs = DoubleValuesSource.fromDoubleField("field"); + mvs = MultiDoubleValuesSource.fromSingleValued(vs); + } else { + mvs = MultiDoubleValuesSource.fromDoubleField("field"); + } + + DoubleRangeFacetCutter doubleRangeFacetCutter = new DoubleRangeFacetCutter(mvs, ranges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + + OrdToLabel ordToLabel = new RangeOrdToLabel(ranges); + FacetResult result = + getAllSortByOrd(getRangeOrdinals(ranges), countRecorder, "field", ordToLabel); + assertEquals(numRange, result.labelValues.length); + for (int rangeID = 0; rangeID < numRange; rangeID++) { + if (VERBOSE) { + System.out.println(" range " + rangeID + " expectedCount=" + expectedCounts[rangeID]); + } + LabelAndValue subNode = result.labelValues[rangeID]; + assertEquals("r" + rangeID, subNode.label); + assertEquals(expectedCounts[rangeID], subNode.value.intValue()); + + DoubleRange range = ranges[rangeID]; + + // Test drill-down: + DrillDownQuery ddq = new DrillDownQuery(config); + ddq.add("field", DoublePoint.newRangeQuery("field", range.min, range.max)); + + assertEquals(expectedCounts[rangeID], s.count(ddq)); + } + } + + w.close(); + IOUtils.close(r, dir); + } + + public void testRandomDoublesMultiValued() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + + int numDocs = atLeast(1000); + double[][] values = new double[numDocs][]; + double minValue = Double.POSITIVE_INFINITY; + double maxValue = Double.NEGATIVE_INFINITY; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + int numVals = RandomNumbers.randomIntBetween(random(), 1, 50); + if (random().nextInt(10) == 0) { + numVals = 1; // make sure we have ample testing of single-value cases + } + values[i] = new double[numVals]; + for (int j = 0; j < numVals; j++) { + double v = random().nextDouble(); + values[i][j] = v; + doc.add(new SortedNumericDocValuesField("field", Double.doubleToLongBits(v))); + doc.add(new DoublePoint("field", v)); + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + } + w.addDocument(doc); + } + IndexReader r = w.getReader(); + + IndexSearcher s = newSearcher(r, false); + FacetsConfig config = new FacetsConfig(); + + int numIters = atLeast(10); + for (int iter = 0; iter < numIters; iter++) { + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + int numRange = TestUtil.nextInt(random(), 1, 5); + DoubleRange[] ranges = new DoubleRange[numRange]; + int[] expectedCounts = new int[numRange]; + double minAcceptedValue = Double.POSITIVE_INFINITY; + double maxAcceptedValue = Double.NEGATIVE_INFINITY; + for (int rangeID = 0; rangeID < numRange; rangeID++) { + double min; + if (rangeID > 0 && random().nextInt(10) == 7) { + // Use an existing boundary: + DoubleRange prevRange = ranges[random().nextInt(rangeID)]; + if (random().nextBoolean()) { + min = prevRange.min; + } else { + min = prevRange.max; + } + } else { + min = 
random().nextDouble(); + } + double max; + if (rangeID > 0 && random().nextInt(10) == 7) { + // Use an existing boundary: + DoubleRange prevRange = ranges[random().nextInt(rangeID)]; + if (random().nextBoolean()) { + max = prevRange.min; + } else { + max = prevRange.max; + } + } else { + max = random().nextDouble(); + } + + if (min > max) { + double x = min; + min = max; + max = x; + } + + boolean minIncl; + boolean maxIncl; + + long minAsLong = NumericUtils.doubleToSortableLong(min); + long maxAsLong = NumericUtils.doubleToSortableLong(max); + // NOTE: maxAsLong - minAsLong >= 0 is here to handle the common overflow case! + if (maxAsLong - minAsLong >= 0 && maxAsLong - minAsLong < 2) { + minIncl = true; + maxIncl = true; + } else { + minIncl = random().nextBoolean(); + maxIncl = random().nextBoolean(); + } + ranges[rangeID] = new DoubleRange("r" + rangeID, min, minIncl, max, maxIncl); + + // Do "slow but hopefully correct" computation of + // expected count: + for (int i = 0; i < numDocs; i++) { + for (int j = 0; j < values[i].length; j++) { + boolean accept = true; + if (minIncl) { + accept &= values[i][j] >= min; + } else { + accept &= values[i][j] > min; + } + if (maxIncl) { + accept &= values[i][j] <= max; + } else { + accept &= values[i][j] < max; + } + if (accept) { + expectedCounts[rangeID]++; + minAcceptedValue = Math.min(minAcceptedValue, values[i][j]); + maxAcceptedValue = Math.max(maxAcceptedValue, values[i][j]); + break; // ensure each doc can contribute at most 1 count to each range + } + } + } + } + // TODO: fastMatchQuery functionality is not implemented for sandbox faceting yet, do we need + // it? + /*Query fastMatchFilter; + if (random().nextBoolean()) { + if (random().nextBoolean()) { + fastMatchFilter = DoublePoint.newRangeQuery("field", minValue, maxValue); + } else { + fastMatchFilter = DoublePoint.newRangeQuery("field", minAcceptedValue, maxAcceptedValue); + } + } else { + fastMatchFilter = null; + }*/ + final MultiDoubleValuesSource mvs = MultiDoubleValuesSource.fromDoubleField("field"); + DoubleRangeFacetCutter doubleRangeFacetCutter = new DoubleRangeFacetCutter(mvs, ranges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + + OrdToLabel ordToLabel = new RangeOrdToLabel(ranges); + FacetResult result = + getAllSortByOrd(getRangeOrdinals(ranges), countRecorder, "field", ordToLabel); + assertEquals(numRange, result.labelValues.length); + for (int rangeID = 0; rangeID < numRange; rangeID++) { + if (VERBOSE) { + System.out.println(" range " + rangeID + " expectedCount=" + expectedCounts[rangeID]); + } + LabelAndValue subNode = result.labelValues[rangeID]; + assertEquals("r" + rangeID, subNode.label); + assertEquals(expectedCounts[rangeID], subNode.value.intValue()); + + DoubleRange range = ranges[rangeID]; + + // Test drill-down: + DrillDownQuery ddq = new DrillDownQuery(config); + if (random().nextBoolean()) { + ddq.add("field", DoublePoint.newRangeQuery("field", range.min, range.max)); + } else { + ddq.add( + "field", + SortedNumericDocValuesField.newSlowRangeQuery( + "field", Double.doubleToLongBits(range.min), Double.doubleToLongBits(range.max))); + } + + assertEquals(expectedCounts[rangeID], s.count(ddq)); + } + } + + w.close(); + IOUtils.close(r, dir); + } + + // LUCENE-5178 + public void testMissingValues() throws Exception { + Directory d = newDirectory(); + 
RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + NumericDocValuesField field = new NumericDocValuesField("field", 0L); + doc.add(field); + for (long l = 0; l < 100; l++) { + if (l % 5 == 0) { + // Every 5th doc is missing the value: + w.addDocument(new Document()); + continue; + } + field.setLongValue(l); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + + IndexSearcher s = newSearcher(r); + LongRange[] inputRanges = + new LongRange[] { + new LongRange("less than 10", 0L, true, 10L, false), + new LongRange("less than or equal to 10", 0L, true, 10L, true), + new LongRange("over 90", 90L, false, 100L, false), + new LongRange("90 or above", 90L, true, 100L, false), + new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, false) + }; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (8)\n less than or equal to 10 (8)\n over 90 (8)\n 90 or above (8)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + w.close(); + IOUtils.close(r, d); + } + + public void testMissingValuesMultiValued() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + // index the same field twice to test multi-valued logic + SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L); + SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L); + doc.add(field1); + doc.add(field2); + for (long l = 0; l < 100; l++) { + if (l % 5 == 0) { + // Every 5th doc is missing the value: + w.addDocument(new Document()); + continue; + } + field1.setLongValue(l); + field2.setLongValue(l); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + + IndexSearcher s = newSearcher(r); + LongRange[] inputRanges = + new LongRange[] { + new LongRange("less than 10", 0L, true, 10L, false), + new LongRange("less than or equal to 10", 0L, true, 10L, true), + new LongRange("over 90", 90L, false, 100L, false), + new LongRange("90 or above", 90L, true, 100L, false), + new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, false) + }; + + MultiLongValuesSource valuesSource = MultiLongValuesSource.fromLongField("field"); + LongRangeFacetCutter longRangeFacetCutter = + LongRangeFacetCutter.create(valuesSource, inputRanges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(longRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(inputRanges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=5\n less than 10 (8)\n less than or equal to 10 (8)\n over 90 (8)\n 90 or above (8)\n over 1000 (0)\n", + getAllSortByOrd(getRangeOrdinals(inputRanges), countRecorder, "field", ordToLabel) + .toString()); + + w.close(); + IOUtils.close(r, d); + } + + private static class 
PlusOneValuesSource extends DoubleValuesSource { + + @Override + public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException { + return new DoubleValues() { + int doc = -1; + + @Override + public double doubleValue() throws IOException { + return doc + 1; + } + + @Override + public boolean advanceExact(int doc) throws IOException { + this.doc = doc; + return true; + } + }; + } + + @Override + public boolean needsScores() { + return false; + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return false; + } + + @Override + public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) + throws IOException { + return Explanation.match(docId + 1, ""); + } + + @Override + public DoubleValuesSource rewrite(IndexSearcher searcher) throws IOException { + return this; + } + + @Override + public int hashCode() { + return 0; + } + + @Override + public boolean equals(Object obj) { + return obj.getClass() == PlusOneValuesSource.class; + } + + @Override + public String toString() { + return null; + } + } + + public void testCustomDoubleValuesSource() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + writer.addDocument(doc); + writer.addDocument(doc); + writer.addDocument(doc); + + // Test wants 3 docs in one segment: + writer.forceMerge(1); + + final DoubleValuesSource vs = new PlusOneValuesSource(); + + FacetsConfig config = new FacetsConfig(); + + IndexReader r = writer.getReader(); + + IndexSearcher s = newSearcher(r, false, false, Concurrency.INTER_SEGMENT); + // DrillSideways requires the entire range of docs to be scored at once, so it doesn't support + // timeouts whose implementation scores one window of doc IDs at a time. + s.setTimeout(null); + + final DoubleRange[] ranges = + new DoubleRange[] { + new DoubleRange("< 1", 0.0, true, 1.0, false), + new DoubleRange("< 2", 0.0, true, 2.0, false), + new DoubleRange("< 5", 0.0, true, 5.0, false), + new DoubleRange("< 10", 0.0, true, 10.0, false), + new DoubleRange("< 20", 0.0, true, 20.0, false), + new DoubleRange("< 50", 0.0, true, 50.0, false) + }; + + // TODO: fastMatchQuery functionality is not implemented for sandbox faceting yet, do we need + // it? 
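For context on the DoubleValues contract that PlusOneValuesSource implements above: a consumer must call advanceExact(doc) and see it return true before reading doubleValue(). A minimal per-segment usage sketch, assuming an already-open IndexReader and the usual org.apache.lucene.index / org.apache.lucene.search imports (the helper name is illustrative, not part of the patch):

```java
// Illustrative usage (not in the patch) of the DoubleValues contract.
static double sumValues(IndexReader reader) throws IOException {
  DoubleValuesSource vs = new PlusOneValuesSource();
  double sum = 0;
  for (LeafReaderContext ctx : reader.leaves()) {
    DoubleValues values = vs.getValues(ctx, null); // scores unused: needsScores() is false
    for (int doc = 0; doc < ctx.reader().maxDoc(); doc++) {
      if (values.advanceExact(doc)) { // position on the doc before reading
        sum += values.doubleValue();  // PlusOneValuesSource yields doc + 1
      }
    }
  }
  return sum;
}
```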
+ /*final Query fastMatchFilter; + final AtomicBoolean filterWasUsed = new AtomicBoolean(); + if (random().nextBoolean()) { + // Sort of silly: + final Query in = new MatchAllDocsQuery(); + fastMatchFilter = new UsedQuery(in, filterWasUsed); + } else { + fastMatchFilter = null; + } + + if (VERBOSE) { + System.out.println("TEST: fastMatchFilter=" + fastMatchFilter); + }*/ + + MultiDoubleValuesSource valuesSource = MultiDoubleValuesSource.fromSingleValued(vs); + DoubleRangeFacetCutter doubleRangeFacetCutter = + new DoubleRangeFacetCutter(valuesSource, ranges); + CountFacetRecorder countRecorder = new CountFacetRecorder(); + + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder); + s.search(new MatchAllDocsQuery(), collectorManager); + OrdToLabel ordToLabel = new RangeOrdToLabel(ranges); + + assertEquals( + "dim=field path=[] value=-2147483648 childCount=6\n < 1 (0)\n < 2 (1)\n < 5 (3)\n < 10 (3)\n < 20 (3)\n < 50 (3)\n", + getAllSortByOrd(getRangeOrdinals(ranges), countRecorder, "field", ordToLabel).toString()); + // assertTrue(fastMatchFilter == null || filterWasUsed.get()); + + DrillDownQuery ddq = new DrillDownQuery(config); + if (random().nextBoolean()) { + ddq.add("field", ranges[1].getQuery(null, vs)); + } else { + ddq.add("field", ranges[1].getQuery(null, MultiDoubleValuesSource.fromSingleValued(vs))); + } + + // Test simple drill-down: + assertEquals(1, s.search(ddq, 10).totalHits.value()); + + // Test drill-sideways after drill-down + DrillSideways ds = + new DrillSideways(s, config, (TaxonomyReader) null) { + @Override + protected boolean scoreSubDocsAtOnce() { + return random().nextBoolean(); + } + }; + + countRecorder = new CountFacetRecorder(); + + DrillSideways.Result result = + ds.search( + ddq, + DummyTotalHitCountCollector.createManager(), + List.of(new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder))); + assertEquals(1, result.drillDownResult().intValue()); + assertEquals( + "dim=field path=[] value=-2147483648 childCount=6\n < 1 (0)\n < 2 (1)\n < 5 (3)\n < 10 (3)\n < 20 (3)\n < 50 (3)\n", + getAllSortByOrd(getRangeOrdinals(ranges), countRecorder, "field", ordToLabel).toString()); + + writer.close(); + IOUtils.close(r, dir); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestTaxonomyFacet.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestTaxonomyFacet.java new file mode 100644 index 000000000000..fda474974981 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestTaxonomyFacet.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.facet; + +import static org.apache.lucene.facet.FacetsConfig.DEFAULT_INDEX_FIELD_NAME; + +import org.apache.lucene.document.Document; +import org.apache.lucene.facet.DrillDownQuery; +import org.apache.lucene.facet.FacetField; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.sandbox.facet.cutters.TaxonomyFacetsCutter; +import org.apache.lucene.sandbox.facet.labels.TaxonomyOrdLabelBiMap; +import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.util.IOUtils; + +/** Test for associations */ +public class TestTaxonomyFacet extends SandboxFacetTestCase { + + public void testConstants() { + // It is essential for TaxonomyOrdLabelBiMap that invalid ordinal is the same as for + // TaxonomyReader + assertEquals(TaxonomyOrdLabelBiMap.INVALID_ORD, TaxonomyReader.INVALID_ORDINAL); + } + + public void testBasic() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the + // main index: + DirectoryTaxonomyWriter taxoWriter = + new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("Publish Date", true); + + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + doc.add(new FacetField("Author", "Bob")); + doc.add(new FacetField("Publish Date", "2010", "10", "15")); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2010", "10", "20")); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2012", "1", "1")); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Susan")); + doc.add(new FacetField("Publish Date", "2012", "1", "7")); + writer.addDocument(config.build(taxoWriter, doc)); + + doc = new Document(); + doc.add(new FacetField("Author", "Frank")); + doc.add(new FacetField("Publish Date", "1999", "5", "5")); + writer.addDocument(config.build(taxoWriter, doc)); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + + Query query = new MatchAllDocsQuery(); + + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader); + final CountFacetRecorder countRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, countRecorder); + searcher.search(query, collectorManager); + + expectThrows( + IllegalArgumentException.class, + () -> { + getTopChildrenByCount(countRecorder, 
taxoReader, 0, "Author"); + }); + + // Retrieve & verify results: + assertEquals( + "dim=Publish Date path=[] value=-2147483648 childCount=3\n 2010 (2)\n 2012 (2)\n 1999 (1)\n", + getTopChildrenByCount(countRecorder, taxoReader, 10, "Publish Date").toString()); + assertEquals( + "dim=Author path=[] value=-2147483648 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n", + getTopChildrenByCount(countRecorder, taxoReader, 10, "Author").toString()); + + assertFacetResult( + getAllChildren(countRecorder, taxoReader, "Publish Date"), + "Publish Date", + new String[0], + 3, + VALUE_CANT_BE_COMPUTED, + new LabelAndValue[] { + new LabelAndValue("1999", 1), new LabelAndValue("2010", 2), new LabelAndValue("2012", 2), + }); + + assertFacetResult( + getAllChildren(countRecorder, taxoReader, "Author"), + "Author", + new String[0], + 4, + VALUE_CANT_BE_COMPUTED, + new LabelAndValue[] { + new LabelAndValue("Bob", 1), + new LabelAndValue("Frank", 1), + new LabelAndValue("Lisa", 2), + new LabelAndValue("Susan", 1), + }); + + // Now user drills down on Publish Date/2010: + DrillDownQuery q2 = new DrillDownQuery(config); + q2.add("Publish Date", "2010"); + final CountFacetRecorder countRecorder2 = new CountFacetRecorder(); + collectorManager = new FacetFieldCollectorManager<>(defaultTaxoCutter, countRecorder2); + searcher.search(q2, collectorManager); + + assertEquals( + "dim=Author path=[] value=-2147483648 childCount=2\n Bob (1)\n Lisa (1)\n", + getTopChildrenByCount(countRecorder2, taxoReader, 10, "Author").toString()); + + assertEquals(1, getSpecificValue(countRecorder2, taxoReader, "Author", "Lisa")); + + assertArrayEquals( + new int[] {1, 1}, + getCountsForRecordedCandidates( + countRecorder2, + taxoReader, + new FacetLabel[] { + new FacetLabel("Author", "Lisa"), + new FacetLabel("Author", "Susan"), // 0 count, filtered out + new FacetLabel("Author", "DoesNotExist"), // Doesn't exist in the index, filtered out + new FacetLabel("Author", "Bob"), + })); + + expectThrows( + AssertionError.class, + () -> { + getTopChildrenByCount(countRecorder2, taxoReader, 10, "Non exitent dim"); + }); + + writer.close(); + IOUtils.close(taxoWriter, searcher.getIndexReader(), taxoReader, taxoDir, dir); + } + + public void testTaxonomyCutterExpertModeDisableRollup() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + DirectoryTaxonomyWriter taxoWriter = + new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("Publish Date", true); + + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + doc.add(new FacetField("Publish Date", "2010", "10", "15")); + writer.addDocument(config.build(taxoWriter, doc)); + + IndexSearcher searcher = newSearcher(writer.getReader()); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + Query query = new MatchAllDocsQuery(); + + TaxonomyFacetsCutter defaultTaxoCutter = + new TaxonomyFacetsCutter(DEFAULT_INDEX_FIELD_NAME, config, taxoReader, true); + final CountFacetRecorder countRecorder = new CountFacetRecorder(); + FacetFieldCollectorManager collectorManager = + new FacetFieldCollectorManager<>(defaultTaxoCutter, countRecorder); + searcher.search(query, collectorManager); + + assertEquals( + "Only leaf value should have been counted when rollup is disabled", + 1, + countRecorder.recordedOrds().toArray().length); + + writer.close(); + IOUtils.close(taxoWriter, searcher.getIndexReader(), 
taxoReader, taxoDir, dir); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index c2e6eefd5337..4b0f17dc59c0 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -504,7 +504,7 @@ private void checkExpectedHits( new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE); TopDocs firstTopDocs = searcher.search(firstQuery, collectorManager); - assertEquals(numHits, firstTopDocs.totalHits.value); + assertEquals(numHits, firstTopDocs.totalHits.value()); collectorManager = new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE); TopDocs secondTopDocs = searcher.search(secondQuery, collectorManager); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCoveringQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCoveringQuery.java index 3edc18b16a0a..d7088b3f6da1 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCoveringQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCoveringQuery.java @@ -242,7 +242,7 @@ private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boole final int maxDoc = searcher.getIndexReader().maxDoc(); final TopDocs td1 = searcher.search(q1, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); final TopDocs td2 = searcher.search(q2, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); - assertEquals(td1.totalHits.value, td2.totalHits.value); + assertEquals(td1.totalHits.value(), td2.totalHits.value()); for (int i = 0; i < td1.scoreDocs.length; ++i) { assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); if (scores) { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestLargeNumHitsTopDocsCollector.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestLargeNumHitsTopDocsCollector.java index 42cd7e71b6b7..05686015f5d2 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestLargeNumHitsTopDocsCollector.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestLargeNumHitsTopDocsCollector.java @@ -89,7 +89,7 @@ public void testIllegalArguments() throws IOException { searcher.search(testQuery, largeCollector); TopDocs topDocs = searcher.search(testQuery, regularCollectorManager); - assertEquals(largeCollector.totalHits, topDocs.totalHits.value); + assertEquals(largeCollector.totalHits, topDocs.totalHits.value()); IllegalArgumentException expected = expectThrows( @@ -105,12 +105,12 @@ public void testNoPQBuild() throws IOException { IndexSearcher searcher = newSearcher(reader); LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(250_000); TopScoreDocCollectorManager regularCollectorManager = - new TopScoreDocCollectorManager(250_000, Integer.MAX_VALUE); + new TopScoreDocCollectorManager(reader.numDocs(), Integer.MAX_VALUE); searcher.search(testQuery, largeCollector); TopDocs topDocs = searcher.search(testQuery, regularCollectorManager); - assertEquals(largeCollector.totalHits, topDocs.totalHits.value); + assertEquals(largeCollector.totalHits, topDocs.totalHits.value()); assertNull(largeCollector.pq); assertNull(largeCollector.pqTop); @@ -125,7 +125,7 @@ public void testPQBuild() throws IOException { searcher.search(testQuery, largeCollector); TopDocs topDocs = searcher.search(testQuery, 
regularCollectorManager); - assertEquals(largeCollector.totalHits, topDocs.totalHits.value); + assertEquals(largeCollector.totalHits, topDocs.totalHits.value()); assertNotNull(largeCollector.pq); assertNotNull(largeCollector.pqTop); @@ -135,12 +135,12 @@ public void testNoPQHitsOrder() throws IOException { IndexSearcher searcher = newSearcher(reader); LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(250_000); TopScoreDocCollectorManager regularCollectorManager = - new TopScoreDocCollectorManager(250_000, Integer.MAX_VALUE); + new TopScoreDocCollectorManager(reader.numDocs(), Integer.MAX_VALUE); searcher.search(testQuery, largeCollector); TopDocs topDocs = searcher.search(testQuery, regularCollectorManager); - assertEquals(largeCollector.totalHits, topDocs.totalHits.value); + assertEquals(largeCollector.totalHits, topDocs.totalHits.value()); assertNull(largeCollector.pq); assertNull(largeCollector.pqTop); @@ -167,7 +167,7 @@ private void runNumHits(int numHits) throws IOException { TopDocs firstTopDocs = largeCollector.topDocs(); TopDocs secondTopDocs = searcher.search(testQuery, regularCollectorManager); - assertEquals(largeCollector.totalHits, secondTopDocs.totalHits.value); + assertEquals(largeCollector.totalHits, secondTopDocs.totalHits.value()); assertEquals(firstTopDocs.scoreDocs.length, secondTopDocs.scoreDocs.length); CheckHits.checkEqual(testQuery, firstTopDocs.scoreDocs, secondTopDocs.scoreDocs); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestProfilerCollector.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestProfilerCollector.java index 85c41ae0b970..4bc3667130b0 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestProfilerCollector.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestProfilerCollector.java @@ -44,7 +44,7 @@ public class TestProfilerCollector extends LuceneTestCase { public void testCollector() throws IOException { Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir); - final int numDocs = TestUtil.nextInt(random(), 1, 20); + final int numDocs = TestUtil.nextInt(random(), 1, 100); for (int i = 0; i < numDocs; ++i) { Document doc = new Document(); doc.add(new StringField("foo", "bar", Store.NO)); @@ -53,19 +53,21 @@ public void testCollector() throws IOException { IndexReader reader = w.getReader(); w.close(); - ProfilerCollector collector = - new ProfilerCollector(new TotalHitCountCollector(), "total_hits", List.of()); - IndexSearcher searcher = new IndexSearcher(reader); + IndexSearcher searcher = newSearcher(reader); + + ProfilerCollectorManager profilerCollectorManager = + new ProfilerCollectorManager("total_hits") { + @Override + protected Collector createCollector() { + return new TotalHitCountCollector(); + } + }; Query query = new TermQuery(new Term("foo", "bar")); - searcher.search(query, collector); + ProfilerCollectorResult profileResult = searcher.search(query, profilerCollectorManager); - MatcherAssert.assertThat(collector.getReason(), equalTo("total_hits")); - MatcherAssert.assertThat(collector.getName(), equalTo("TotalHitCountCollector")); - ProfilerCollectorResult profileResult = collector.getProfileResult(); MatcherAssert.assertThat(profileResult.getReason(), equalTo("total_hits")); MatcherAssert.assertThat(profileResult.getName(), equalTo("TotalHitCountCollector")); MatcherAssert.assertThat(profileResult.getTime(), greaterThan(0L)); - MatcherAssert.assertThat(profileResult.getTime(), 
equalTo(collector.getTime())); reader.close(); dir.close(); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java index 937bad87b433..80bebc00452d 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java @@ -102,7 +102,7 @@ public void testBasic1() throws Exception { q.addTransition(s2, s3, "sun"); q.finish(); - assertEquals(1, s.search(q, 1).totalHits.value); + assertEquals(1, s.search(q, 1).totalHits.value()); w.close(); r.close(); @@ -135,7 +135,7 @@ public void testBasicSynonym() throws Exception { q.addTransition(s2, s3, "moon"); q.finish(); - assertEquals(2, s.search(q, 1).totalHits.value); + assertEquals(2, s.search(q, 1).totalHits.value()); w.close(); r.close(); @@ -172,7 +172,7 @@ public void testBasicSlop() throws Exception { q.addTransition(s2, s3, "sun"); q.finish(); - assertEquals(2, s.search(q, 1).totalHits.value); + assertEquals(2, s.search(q, 1).totalHits.value()); w.close(); r.close(); @@ -228,7 +228,7 @@ public void testPosLengthAtQueryTimeMock() throws Exception { // System.out.println("DOT:\n" + q.toDot()); - assertEquals(4, s.search(q, 1).totalHits.value); + assertEquals(4, s.search(q, 1).totalHits.value()); w.close(); r.close(); @@ -273,7 +273,7 @@ public void testPosLengthAtQueryTimeTrueish() throws Exception { TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts); // System.out.println("DOT: " + q.toDot()); - assertEquals(4, s.search(q, 1).totalHits.value); + assertEquals(4, s.search(q, 1).totalHits.value()); w.close(); r.close(); @@ -306,7 +306,7 @@ public void testSegsMissingTerms() throws Exception { q.addTransition(s2, s3, "moon"); q.finish(); - assertEquals(2, s.search(q, 1).totalHits.value); + assertEquals(2, s.search(q, 1).totalHits.value()); w.close(); r.close(); dir.close(); @@ -367,7 +367,7 @@ public void testAnyFromTokenStream() throws Exception { TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts); // System.out.println("DOT: " + q.toDot()); - assertEquals(3, s.search(q, 1).totalHits.value); + assertEquals(3, s.search(q, 1).totalHits.value()); w.close(); r.close(); @@ -552,7 +552,7 @@ public TokenStreamComponents createComponents(String fieldName) { Set hits2Docs = toDocIDs(s, hits2); try { - assertEquals(hits2.totalHits.value, hits1.totalHits.value); + assertEquals(hits2.totalHits.value(), hits1.totalHits.value()); assertEquals(hits2Docs, hits1Docs); } catch (AssertionError ae) { System.out.println("FAILED:"); @@ -668,7 +668,7 @@ public void testWithCycles1() throws Exception { q.setAccept(s1, true); q.finish(); - assertEquals(1, s.search(q, 1).totalHits.value); + assertEquals(1, s.search(q, 1).totalHits.value()); w.close(); r.close(); dir.close(); @@ -703,7 +703,7 @@ public void testWithCycles2() throws Exception { q.setAccept(s4, true); q.finish(); - assertEquals(1, s.search(q, 1).totalHits.value); + assertEquals(1, s.search(q, 1).totalHits.value()); w.close(); r.close(); dir.close(); @@ -723,7 +723,7 @@ public void testTermDoesNotExist() throws Exception { TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts); // System.out.println("DOT: " + q.toDot()); - assertEquals(0, s.search(q, 1).totalHits.value); + assertEquals(0, s.search(q, 1).totalHits.value()); w.close(); r.close(); @@ -744,7 +744,7 @@ public void 
testOneTermDoesNotExist() throws Exception { TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts); // System.out.println("DOT: " + q.toDot()); - assertEquals(0, s.search(q, 1).totalHits.value); + assertEquals(0, s.search(q, 1).totalHits.value()); IOUtils.close(w, r, dir); } @@ -860,7 +860,7 @@ public void testExplainNoMatchingDocument() throws Exception { assertTrue(rewrittenQuery instanceof TermAutomatonQuery); TopDocs topDocs = searcher.search(rewrittenQuery, 10); - assertEquals(0, topDocs.totalHits.value); + assertEquals(0, topDocs.totalHits.value()); Explanation explanation = searcher.explain(rewrittenQuery, 0); assertFalse("Explanation should indicate no match", explanation.isMatch()); @@ -903,7 +903,7 @@ public void testExplainMatchingDocuments() throws Exception { "Rewritten query should be an instance of TermAutomatonQuery", rewrittenQuery instanceof TermAutomatonQuery); TopDocs topDocs = searcher.search(q, 10); - assertEquals(2, topDocs.totalHits.value); + assertEquals(2, topDocs.totalHits.value()); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Explanation explanation = searcher.explain(q, scoreDoc.doc); @@ -1038,7 +1038,7 @@ public void testOneTermMissing() throws Exception { q.addTransition(s2, s3, "moon"); q.finish(); - assertEquals(1, s.search(q, 1).totalHits.value); + assertEquals(1, s.search(q, 1).totalHits.value()); w.close(); r.close(); @@ -1068,7 +1068,7 @@ public void testFieldMissing() throws Exception { q.addTransition(s2, s3, "moon"); q.finish(); - assertEquals(0, s.search(q, 1).totalHits.value); + assertEquals(0, s.search(q, 1).totalHits.value()); w.close(); r.close(); diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/SpatialTestCase.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/SpatialTestCase.java index cf8fc8b57d76..74a01262868f 100644 --- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/SpatialTestCase.java +++ b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/SpatialTestCase.java @@ -113,7 +113,7 @@ protected SearchResults executeQuery(Query query, int numDocs) { for (ScoreDoc scoreDoc : topDocs.scoreDocs) { results.add(new SearchResult(scoreDoc.score, storedFields.document(scoreDoc.doc))); } - return new SearchResults(topDocs.totalHits.value, results); + return new SearchResults(topDocs.totalHits.value(), results); } catch (IOException ioe) { throw new RuntimeException("IOException thrown while executing query", ioe); } diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/TestSpatialExample.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/TestSpatialExample.java index fd0a81105c5a..aadaff32ee03 100644 --- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/TestSpatialExample.java +++ b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/TestSpatialExample.java @@ -197,8 +197,8 @@ private void search() throws Exception { private void assertDocMatchedIds(IndexSearcher indexSearcher, TopDocs docs, int... 
ids) throws IOException { - assert docs.totalHits.relation == Relation.EQUAL_TO; - int[] gotIds = new int[Math.toIntExact(docs.totalHits.value)]; + assert docs.totalHits.relation() == Relation.EQUAL_TO; + int[] gotIds = new int[Math.toIntExact(docs.totalHits.value())]; for (int i = 0; i < gotIds.length; i++) { gotIds[i] = indexSearcher diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/TestJtsPolygon.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/TestJtsPolygon.java index ea0bfa2fb904..6298277fcb42 100644 --- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/TestJtsPolygon.java +++ b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/TestJtsPolygon.java @@ -119,6 +119,6 @@ public void testBadPrefixTreePrune() throws Exception { System.out.println(indexSearcher.storedFields().document(scoreDoc.doc)); } - assertEquals(1, search.totalHits.value); + assertEquals(1, search.totalHits.value()); } } diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/TestRandomSpatialOpFuzzyPrefixTree.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/TestRandomSpatialOpFuzzyPrefixTree.java index 988c2c960655..3dc3dc46e759 100644 --- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/TestRandomSpatialOpFuzzyPrefixTree.java +++ b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/TestRandomSpatialOpFuzzyPrefixTree.java @@ -346,16 +346,16 @@ private void doTest(final SpatialOperation operation) throws IOException { case 0: queryShape = randomPoint(); break; - // LUCENE-5549 - // TODO debug: -Dtests.method=testWithin -Dtests.multiplier=3 - // -Dtests.seed=5F5294CE2E075A3E:AAD2F0F79288CA64 - // case 1:case 2:case 3: - // if (!indexedAtLeastOneShapePair) { - // // avoids ShapePair.relate(ShapePair), which isn't reliable - // queryShape = randomShapePairRect(!biasContains); - // // invert biasContains for query side - // break; - // } + // LUCENE-5549 + // TODO debug: -Dtests.method=testWithin -Dtests.multiplier=3 + // -Dtests.seed=5F5294CE2E075A3E:AAD2F0F79288CA64 + // case 1:case 2:case 3: + // if (!indexedAtLeastOneShapePair) { + // // avoids ShapePair.relate(ShapePair), which isn't reliable + // queryShape = randomShapePairRect(!biasContains); + // // invert biasContains for query side + // break; + // } case 4: // choose an existing indexed shape @@ -366,7 +366,7 @@ private void doTest(final SpatialOperation operation) throws IOException { break; } } - // fall-through + // fall-through default: queryShape = randomRectangle(); diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/tree/TestSpatialPrefixTree.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/tree/TestSpatialPrefixTree.java index cf939cf54ff7..9fcd5df0b977 100644 --- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/tree/TestSpatialPrefixTree.java +++ b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/tree/TestSpatialPrefixTree.java @@ -110,6 +110,6 @@ public void testBadPrefixTreePrune() throws Exception { System.out.println(indexSearcher.storedFields().document(scoreDoc.doc)); } - assertEquals(1, search.totalHits.value); + assertEquals(1, search.totalHits.value()); } } diff --git a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/geom/GeoPolygonFactory.java b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/geom/GeoPolygonFactory.java index 29bb08895779..6a3b0ce6bc58 100755 --- 
a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/geom/GeoPolygonFactory.java +++ b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/geom/GeoPolygonFactory.java @@ -1717,20 +1717,16 @@ private static int getLegalIndex(int index, int size) { return index; } - /** Class representing a single (unused) edge. */ - private static class Edge { - /** Plane */ - public final SidedPlane plane; - - /** Start point */ - public final GeoPoint startPoint; - - /** End point */ - public final GeoPoint endPoint; - - /** Internal edge flag */ - public final boolean isInternal; - + /** + * Class representing a single (unused) edge. + * + * @param plane Plane + * @param startPoint Start point + * @param endPoint End point + * @param isInternal Internal edge flag + */ + private record Edge( + GeoPoint startPoint, GeoPoint endPoint, SidedPlane plane, boolean isInternal) { /** * Constructor. * @@ -1739,16 +1735,7 @@ private static class Edge { * @param plane the edge plane * @param isInternal true if internal edge */ - public Edge( - final GeoPoint startPoint, - final GeoPoint endPoint, - final SidedPlane plane, - final boolean isInternal) { - this.startPoint = startPoint; - this.endPoint = endPoint; - this.plane = plane; - this.isInternal = isInternal; - } + private Edge {} @Override public int hashCode() { diff --git a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/geom/GeoStandardPath.java b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/geom/GeoStandardPath.java index 7c3de452e246..60bb7cd0c202 100755 --- a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/geom/GeoStandardPath.java +++ b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/geom/GeoStandardPath.java @@ -430,14 +430,7 @@ public String toString() { + "}}"; } - private static class DistancePair { - public final double pathCenterDistance; - public final double distanceAlongPath; - - public DistancePair(final double pathCenterDistance, final double distanceAlongPath) { - this.pathCenterDistance = pathCenterDistance; - this.distanceAlongPath = distanceAlongPath; - } + private record DistancePair(double pathCenterDistance, double distanceAlongPath) { @Override public String toString() { diff --git a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java index 4453af9d1cc5..796979195aac 100644 --- a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java +++ b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java @@ -143,7 +143,7 @@ public void testBasic() throws Exception { planetModel, toRadians(50), toRadians(-97), Math.PI / 180.)), 1) .totalHits - .value); + .value()); w.close(); r.close(); dir.close(); diff --git a/lucene/suggest/src/java/module-info.java b/lucene/suggest/src/java/module-info.java index c3f5b7c178f3..20542ac5281d 100644 --- a/lucene/suggest/src/java/module-info.java +++ b/lucene/suggest/src/java/module-info.java @@ -32,7 +32,8 @@ org.apache.lucene.search.suggest.document.Completion84PostingsFormat, org.apache.lucene.search.suggest.document.Completion90PostingsFormat, org.apache.lucene.search.suggest.document.Completion99PostingsFormat, - org.apache.lucene.search.suggest.document.Completion912PostingsFormat; + org.apache.lucene.search.suggest.document.Completion912PostingsFormat, + org.apache.lucene.search.suggest.document.Completion101PostingsFormat; provides org.apache.lucene.analysis.TokenFilterFactory with 
org.apache.lucene.search.suggest.analyzing.SuggestStopFilterFactory; } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java index 72cefcf5b736..8d96cfda6dbf 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java @@ -16,20 +16,11 @@ */ package org.apache.lucene.search.spell; -/** A suggestion generated by combining one or more original query terms */ -public class CombineSuggestion { - /** The indexes from the passed-in array of terms used to make this word combination */ - public final int[] originalTermIndexes; - - /** The word combination suggestion */ - public final SuggestWord suggestion; - - /** - * Creates a new CombineSuggestion from a suggestion and an array of term ids - * (referencing the indexes to the original terms that form this combined suggestion) - */ - public CombineSuggestion(SuggestWord suggestion, int[] originalTermIndexes) { - this.suggestion = suggestion; - this.originalTermIndexes = originalTermIndexes; - } -} +/** + * A suggestion generated by combining one or more original query terms + * + * @param originalTermIndexes The indexes from the passed-in array of terms used to make this word + * combination + * @param suggestion The word combination suggestion + */ +public record CombineSuggestion(SuggestWord suggestion, int[] originalTermIndexes) {} diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java index b3a84e9aaf82..58d68a66bd87 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java @@ -130,7 +130,7 @@ public SuggestWord[][] suggestWordBreaks( * returned {@link CombineSuggestion} contains both a {@link SuggestWord} and also an array * detailing which passed-in terms were involved in creating this combination. The scores returned * are equal to the number of word combinations needed, also one less than the length of the array - * {@link CombineSuggestion#originalTermIndexes}. Generally, a suggestion with a lower score is + * {@link CombineSuggestion#originalTermIndexes()}. Generally, a suggestion with a lower score is * preferred over a higher score. * *

    To prevent two adjacent terms from being combined (for instance, if one is mandatory and the @@ -491,8 +491,8 @@ public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) { if (o1.numCombinations != o2.numCombinations) { return o2.numCombinations - o1.numCombinations; } - if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) { - return o1.combineSuggestion.suggestion.freq - o2.combineSuggestion.suggestion.freq; + if (o1.combineSuggestion.suggestion().freq != o2.combineSuggestion.suggestion().freq) { + return o1.combineSuggestion.suggestion().freq - o2.combineSuggestion.suggestion().freq; } return 0; } @@ -516,13 +516,6 @@ private static class SuggestWordArrayWrapper { } } - private static class CombineSuggestionWrapper { - final CombineSuggestion combineSuggestion; - final int numCombinations; - - CombineSuggestionWrapper(CombineSuggestion combineSuggestion, int numCombinations) { - this.combineSuggestion = combineSuggestion; - this.numCombinations = numCombinations; - } - } + private record CombineSuggestionWrapper( + CombineSuggestion combineSuggestion, int numCombinations) {} } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index 426c02999f7d..c5f1fdc899af 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -739,8 +739,7 @@ public List lookup( searcherMgrReadLock.unlock(); } try { - TopFieldCollectorManager c = - new TopFieldCollectorManager(SORT, num, null, 1, searcher.getSlices().length > 1); + TopFieldCollectorManager c = new TopFieldCollectorManager(SORT, num, null, 1); // System.out.println("got searcher=" + searcher); TopFieldDocs hits = searcher.search(finalQuery, c); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index a995b6725e90..b55634e9b492 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -717,7 +717,7 @@ public List lookup( int count = 0; for (FSTUtil.Path> path : prefixPaths) { - if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) { + if (fst.findTargetArc(END_BYTE, path.fstNode(), scratchArc, bytesReader) != null) { // This node has END_BYTE arc leaving, meaning it's an // "exact" match: count++; @@ -740,11 +740,14 @@ public List lookup( // pruned our exact match from one of these nodes // ...: for (FSTUtil.Path> path : prefixPaths) { - if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) { + if (fst.findTargetArc(END_BYTE, path.fstNode(), scratchArc, bytesReader) != null) { // This node has END_BYTE arc leaving, meaning it's an // "exact" match: searcher.addStartPaths( - scratchArc, fst.outputs.add(path.output, scratchArc.output()), false, path.input); + scratchArc, + fst.outputs.add(path.output(), scratchArc.output()), + false, + path.input()); } } @@ -764,9 +767,9 @@ public List lookup( // nodes we have and the // maxSurfaceFormsPerAnalyzedForm: for (Result> completion : completions) { - BytesRef output2 = completion.output.output2; + BytesRef 
output2 = completion.output().output2; if (sameSurfaceForm(utf8Key, output2)) { - results.add(getLookupResult(completion.output.output1, output2, spare)); + results.add(getLookupResult(completion.output().output1, output2, spare)); break; } } @@ -814,7 +817,7 @@ protected boolean acceptResult(IntsRef input, Pair output) { prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst); for (FSTUtil.Path> path : prefixPaths) { - searcher.addStartPaths(path.fstNode, path.output, true, path.input); + searcher.addStartPaths(path.fstNode(), path.output(), true, path.input()); } TopResults> completions = searcher.search(); @@ -823,7 +826,7 @@ protected boolean acceptResult(IntsRef input, Pair output) { for (Result> completion : completions) { LookupResult result = - getLookupResult(completion.output.output1, completion.output.output2, spare); + getLookupResult(completion.output().output1, completion.output().output2, spare); // TODO: for fuzzy case would be nice to return // how many edits were required diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java index 1c5a3bf23db0..4ff6d3451ddd 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java @@ -35,29 +35,15 @@ public class FSTUtil { private FSTUtil() {} - /** Holds a pair (automaton, fst) of states and accumulated output in the intersected machine. */ - public static final class Path { - - /** Node in the automaton where path ends: */ - public final int state; - - /** Node in the FST where path ends: */ - public final FST.Arc fstNode; - - /** Output of the path so far: */ - public final T output; - - /** Input of the path so far: */ - public final IntsRefBuilder input; - - /** Sole constructor. */ - public Path(int state, FST.Arc fstNode, T output, IntsRefBuilder input) { - this.state = state; - this.fstNode = fstNode; - this.output = output; - this.input = input; - } - } + /** + * Holds a pair (automaton, fst) of states and accumulated output in the intersected machine. 
+ * + * @param state Node in the automaton where path ends: + * @param fstNode Node in the FST where path ends: + * @param output Output of the path so far: + * @param input Input of the path so far: + */ + public record Path(int state, FST.Arc fstNode, T output, IntsRefBuilder input) {} /** * Enumerates all minimal prefix paths in the automaton that also intersect the FST, accumulating diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java index 87346e9482eb..ae40aeb7d5cb 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java @@ -615,7 +615,7 @@ public List lookup(final CharSequence key, Set contexts, // Must do num+seen.size() for queue depth because we may // reject up to seen.size() paths in acceptResult(): Util.TopNSearcher searcher = - new Util.TopNSearcher(fst, num, num + seen.size(), weightComparator) { + new Util.TopNSearcher<>(fst, num, num + seen.size(), Comparator.naturalOrder()) { BytesRefBuilder scratchBytes = new BytesRefBuilder(); @@ -666,7 +666,7 @@ protected boolean acceptResult(IntsRef input, Long output) { for (Result completion : completions) { token.setLength(prefixLength); // append suffix - Util.toBytesRef(completion.input, suffix); + Util.toBytesRef(completion.input(), suffix); token.append(suffix); // System.out.println(" completion " + token.utf8ToString()); @@ -693,7 +693,7 @@ protected boolean acceptResult(IntsRef input, Long output) { (long) (Long.MAX_VALUE * backoff - * ((double) decodeWeight(completion.output)) + * ((double) decodeWeight(completion.output())) / contextCount)); results.add(result); assert results.size() == seen.size(); @@ -702,19 +702,15 @@ protected boolean acceptResult(IntsRef input, Long output) { backoff *= ALPHA; } - Collections.sort( - results, - new Comparator() { - @Override - public int compare(LookupResult a, LookupResult b) { - if (a.value > b.value) { - return -1; - } else if (a.value < b.value) { - return 1; - } else { - // Tie break by UTF16 sort order: - return ((String) a.key).compareTo((String) b.key); - } + results.sort( + (a, b) -> { + if (a.value > b.value) { + return -1; + } else if (a.value < b.value) { + return 1; + } else { + // Tie break by UTF16 sort order: + return ((String) a.key).compareTo((String) b.key); } }); @@ -761,14 +757,6 @@ private Long lookupPrefix( return output; } - static final Comparator weightComparator = - new Comparator() { - @Override - public int compare(Long left, Long right) { - return left.compareTo(right); - } - }; - /** Returns the weight associated with an input string, or null if it does not exist. */ public Object get(CharSequence key) { throw new UnsupportedOperationException(); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion101PostingsFormat.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion101PostingsFormat.java new file mode 100644 index 000000000000..2a51f01fd7ae --- /dev/null +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion101PostingsFormat.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.suggest.document; + +import org.apache.lucene.codecs.PostingsFormat; + +/** + * {@link CompletionPostingsFormat} for {@link + * org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat} + * + * @lucene.experimental + */ +public class Completion101PostingsFormat extends CompletionPostingsFormat { + /** Creates a {@link Completion101PostingsFormat} that will load the completion FST on-heap. */ + public Completion101PostingsFormat() { + this(FSTLoadMode.ON_HEAP); + } + + /** + * Creates a {@link Completion101PostingsFormat} that will use the provided fstLoadMode + * to determine if the completion FST should be loaded on or off heap. + */ + public Completion101PostingsFormat(FSTLoadMode fstLoadMode) { + super("Completion101", fstLoadMode); + } + + @Override + protected PostingsFormat delegatePostingsFormat() { + return PostingsFormat.forName("Lucene101"); + } +} diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion912PostingsFormat.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion912PostingsFormat.java index 341b034e8e8b..77a62443f312 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion912PostingsFormat.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion912PostingsFormat.java @@ -19,8 +19,10 @@ import org.apache.lucene.codecs.PostingsFormat; /** - * {@link CompletionPostingsFormat} for {@link - * org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat} + * {@link org.apache.lucene.search.suggest.document.CompletionPostingsFormat} for {@code + * org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat}. This format is only used + * for backward-compatibility of the index format and cannot be used to write data, use {@link + * Completion101PostingsFormat} on new indices. 
* * @lucene.experimental */ diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java index 8c6a0856f05b..73518308b363 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java @@ -163,19 +163,7 @@ public void close() throws IOException { } } - private static class CompletionMetaData { - private final long filePointer; - private final long minWeight; - private final long maxWeight; - private final byte type; - - private CompletionMetaData(long filePointer, long minWeight, long maxWeight, byte type) { - this.filePointer = filePointer; - this.minWeight = minWeight; - this.maxWeight = maxWeight; - this.type = type; - } - } + private record CompletionMetaData(long filePointer, long minWeight, long maxWeight, byte type) {} // builds an FST based on the terms written private static class CompletionTermWriter { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java index c727b1e621c7..bbcfc7feb439 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java @@ -249,23 +249,14 @@ private static Automaton toContextAutomaton( } } - /** Holder for context value meta data */ - private static class ContextMetaData { - - /** Boost associated with a context value */ - private final float boost; - - /** - * flag to indicate whether the context value should be treated as an exact value or a context - * prefix - */ - private final boolean exact; - - private ContextMetaData(float boost, boolean exact) { - this.boost = boost; - this.exact = exact; - } - } + /** + * Holder for context value meta data + * + * @param boost Boost associated with a context value + * @param exact flag to indicate whether the context value should be treated as an exact value or + * a context prefix + */ + private record ContextMetaData(float boost, boolean exact) {} private static class ContextCompletionWeight extends CompletionWeight { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java index a56365fed9d6..4e25e2ac608a 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java @@ -146,10 +146,13 @@ public void lookup( final CharsRefBuilder spare = new CharsRefBuilder(); - Comparator> comparator = getComparator(); Util.TopNSearcher> searcher = - new Util.TopNSearcher>( - fst, topN, queueSize, comparator, new ScoringPathComparator(scorer)) { + new Util.TopNSearcher<>( + fst, + topN, + queueSize, + (o1, o2) -> Long.compare(o1.output1, o2.output1), + new ScoringPathComparator(scorer)) { private final ByteArrayDataInput scratchInput = new ByteArrayDataInput(); @@ -226,8 +229,8 @@ protected boolean acceptResult(Util.FSTPath> path) { }; for (FSTUtil.Path> path : prefixPaths) { - scorer.weight.setNextMatch(path.input.get()); - BytesRef output = path.output.output2; + scorer.weight.setNextMatch(path.input().get()); + BytesRef output = 
path.output().output2; int payload = -1; if (collector.doSkipDuplicates()) { for (int j = 0; j < output.length; j++) { @@ -241,10 +244,10 @@ protected boolean acceptResult(Util.FSTPath> path) { } searcher.addStartPaths( - path.fstNode, - path.output, + path.fstNode(), + path.output(), false, - path.input, + path.input(), scorer.weight.boost(), scorer.weight.context(), payload); @@ -261,13 +264,8 @@ protected boolean acceptResult(Util.FSTPath> path) { * Compares partial completion paths using {@link CompletionScorer#score(float, float)}, breaks * ties comparing path inputs */ - private static class ScoringPathComparator + private record ScoringPathComparator(CompletionScorer scorer) implements Comparator>> { - private final CompletionScorer scorer; - - public ScoringPathComparator(CompletionScorer scorer) { - this.scorer = scorer; - } @Override public int compare( @@ -280,15 +278,6 @@ public int compare( } } - private static Comparator> getComparator() { - return new Comparator>() { - @Override - public int compare(Pair o1, Pair o2) { - return Long.compare(o1.output1, o2.output1); - } - }; - } - /** * Simple heuristics to try to avoid over-pruning potential suggestions by the TopNSearcher. Since * suggestion entries can be rejected if they belong to a deleted document, the length of the diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java index 02d9f84d3d4a..7aec1568ad77 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java @@ -136,14 +136,7 @@ private static int maxNumArcsForDedupByte(int currentNumDedupBytes) { return (int) Math.min(maxArcs, 255); } - private static final class Entry implements Comparable { - final BytesRef payload; - final long weight; - - public Entry(BytesRef payload, long weight) { - this.payload = payload; - this.weight = weight; - } + private record Entry(BytesRef payload, long weight) implements Comparable { @Override public int compareTo(Entry o) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java index f89efeffb067..e1f26836204e 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java @@ -21,6 +21,7 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.CollectionTerminatedException; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.Weight; @@ -71,7 +72,8 @@ public void suggest(CompletionQuery query, TopSuggestDocsCollector collector) th LeafCollector leafCollector = null; try { leafCollector = collector.getLeafCollector(context); - scorer.score(leafCollector, context.reader().getLiveDocs()); + scorer.score( + leafCollector, context.reader().getLiveDocs(), 0, DocIdSetIterator.NO_MORE_DOCS); } catch ( @SuppressWarnings("unused") CollectionTerminatedException e) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java 
b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java index a3c0fdd137f9..3710ff6b476b 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java @@ -194,7 +194,8 @@ public List lookup( // complete top-N TopResults completions = null; try { - completions = Util.shortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst); + completions = + Util.shortestPaths(fst, arc, prefixOutput, Comparator.naturalOrder(), num, !exactFirst); assert completions.isComplete; } catch (IOException bogus) { throw new RuntimeException(bogus); @@ -204,10 +205,10 @@ public List lookup( for (Result completion : completions) { scratch.setLength(prefixLength); // append suffix - Util.toBytesRef(completion.input, suffix); + Util.toBytesRef(completion.input(), suffix); scratch.append(suffix); spare.copyUTF8Bytes(scratch.get()); - results.add(new LookupResult(spare.toString(), decodeWeight(completion.output))); + results.add(new LookupResult(spare.toString(), decodeWeight(completion.output()))); } return results; } @@ -301,14 +302,6 @@ protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) { } } - static final Comparator weightComparator = - new Comparator() { - @Override - public int compare(Long left, Long right) { - return left.compareTo(right); - } - }; - /** Returns byte size of the underlying FST. */ @Override public long ramBytesUsed() { diff --git a/lucene/suggest/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/suggest/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 8544a97b88fa..5c0365616cde 100644 --- a/lucene/suggest/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/suggest/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -35,3 +35,4 @@ org.apache.lucene.search.suggest.document.Completion84PostingsFormat org.apache.lucene.search.suggest.document.Completion90PostingsFormat org.apache.lucene.search.suggest.document.Completion99PostingsFormat org.apache.lucene.search.suggest.document.Completion912PostingsFormat +org.apache.lucene.search.suggest.document.Completion101PostingsFormat diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java index 38121ced691e..12a1a5782dfd 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java @@ -183,17 +183,17 @@ public void testCombiningWords() throws Exception { assertSuggestionEquals(cs[2], "yeight", 1.0f, 4, 5); for (int i = 3; i < 5; i++) { - assertEquals(3, cs[i].originalTermIndexes.length); - assertEquals(2, cs[i].suggestion.score, 0); + assertEquals(3, cs[i].originalTermIndexes().length); + assertEquals(2, cs[i].suggestion().score, 0); assertTrue( - (cs[i].originalTermIndexes[0] == 1 - && cs[i].originalTermIndexes[1] == 2 - && cs[i].originalTermIndexes[2] == 3 - && cs[i].suggestion.string.equals("hundredeight")) - || (cs[i].originalTermIndexes[0] == 3 - && cs[i].originalTermIndexes[1] == 4 - && cs[i].originalTermIndexes[2] == 5 - && cs[i].suggestion.string.equals("eightyeight"))); + (cs[i].originalTermIndexes()[0] == 1 + && cs[i].originalTermIndexes()[1] == 2 + && cs[i].originalTermIndexes()[2] == 
3 + && cs[i].suggestion().string.equals("hundredeight")) + || (cs[i].originalTermIndexes()[0] == 3 + && cs[i].originalTermIndexes()[1] == 4 + && cs[i].originalTermIndexes()[2] == 5 + && cs[i].suggestion().string.equals("eightyeight"))); } cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); @@ -428,8 +428,8 @@ public void testRandom() throws Exception { wbsp.suggestWordCombinations(terms, originals.size(), ir, SuggestMode.SUGGEST_ALWAYS); boolean failed = true; for (CombineSuggestion cs1 : cs) { - assertEquals(2, cs1.originalTermIndexes.length); - if (cs1.suggestion.string.equals(left + right)) { + assertEquals(2, cs1.originalTermIndexes().length); + if (cs1.suggestion().string.equals(left + right)) { failed = false; } } @@ -448,9 +448,9 @@ public void testRandom() throws Exception { private static void assertSuggestionEquals( CombineSuggestion cs, String word, float score, int... termIndexes) { - assertEquals(word, cs.suggestion.string); - assertEquals(score, cs.suggestion.score, 0); - assertArrayEquals(termIndexes, cs.originalTermIndexes); + assertEquals(word, cs.suggestion().string); + assertEquals(score, cs.suggestion().score, 0); + assertArrayEquals(termIndexes, cs.originalTermIndexes()); } private static void assertSuggestionEquals(SuggestWord sw, String word, float score) { diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/Average.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/Average.java index 2c2c53343220..737449cfd6b7 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/Average.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/Average.java @@ -19,19 +19,15 @@ import java.util.List; import java.util.Locale; -/** Average with standard deviation. */ -final class Average { - /** Average (in milliseconds). */ - public final double avg; - - /** Standard deviation (in milliseconds). */ - public final double stddev; - +/** + * Average with standard deviation. + * + * @param avg Average (in milliseconds). + * @param stddev Standard deviation (in milliseconds). + */ +record Average(double avg, double stddev) { /** */ - Average(double avg, double stddev) { - this.avg = avg; - this.stddev = stddev; - } + Average {} @Override public String toString() { diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/SuggestRebuildTestUtil.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/SuggestRebuildTestUtil.java index afad8f6e387b..bde129463ab0 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/SuggestRebuildTestUtil.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/SuggestRebuildTestUtil.java @@ -108,19 +108,13 @@ public interface ExceptionalCallback { * An InputArrayIterator wrapper whose {@link InputIterator#next} method releases on a Semaphore, * and then acquires from a differnet Semaphore. 
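Editor's note: the hunks above and below repeat one refactoring pattern: small immutable holder classes (CompletionMetaData, ContextMetaData, Entry, Average, DelayedInputIterator, ...) become Java records, field javadoc moves to @param tags, and call sites switch from direct field reads to the generated accessors (result.average.avg() instead of result.average.avg). A minimal before/after sketch of that pattern, using a hypothetical Stats holder:

    // Before the change, such holders were written by hand, e.g.:
    //
    //   final class Stats {
    //     final double avg;
    //     final double stddev;
    //     Stats(double avg, double stddev) { this.avg = avg; this.stddev = stddev; }
    //   }
    //
    // After the change, the same holder is a record: components become final fields
    // plus generated accessors, so call sites move from s.avg to s.avg(), and any
    // argument checks move into a compact constructor (as DelayedInputIterator does
    // with its null asserts).
    record Stats(double avg, double stddev) {
      Stats {
        assert stddev >= 0 : "standard deviation must be non-negative"; // illustrative invariant only
      }

      @Override
      public String toString() {
        // Records may still override toString, as Average does in this change.
        return String.format(java.util.Locale.ROOT, "%.1f +/- %.1f ms", avg, stddev);
      }
    }

The behavior is unchanged; the records merely remove constructor and field boilerplate while keeping the holders immutable.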
*/ - private static final class DelayedInputIterator implements InputIterator { - final Semaphore releaseOnNext; - final Semaphore acquireOnNext; - final InputIterator inner; - - public DelayedInputIterator( - final Semaphore releaseOnNext, final Semaphore acquireOnNext, final InputIterator inner) { + private record DelayedInputIterator( + Semaphore releaseOnNext, Semaphore acquireOnNext, InputIterator inner) + implements InputIterator { + private DelayedInputIterator { assert null != releaseOnNext; assert null != acquireOnNext; assert null != inner; - this.releaseOnNext = releaseOnNext; - this.acquireOnNext = acquireOnNext; - this.inner = inner; } @Override diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestLookupBenchmark.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestLookupBenchmark.java index 80e6add070db..55c4f8c8dc09 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestLookupBenchmark.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestLookupBenchmark.java @@ -235,7 +235,7 @@ public void runPerformanceTest( lookup.getClass().getSimpleName(), input.size(), result.average, - input.size() / result.average.avg); + input.size() / result.average.avg()); } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingInfixSuggester.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingInfixSuggester.java index 6983c7eba595..3ea17ee146b5 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingInfixSuggester.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingInfixSuggester.java @@ -227,19 +227,15 @@ public void testLookupsDuringReBuild() throws Exception { a.close(); } - /** Used to return highlighted result; see {@link LookupResult#highlightKey} */ - private static final class LookupHighlightFragment { - /** Portion of text for this fragment. */ - public final String text; - - /** True if this text matched a part of the user's query. */ - public final boolean isHit; - + /** + * Used to return highlighted result; see {@link LookupResult#highlightKey} + * + * @param text Portion of text for this fragment. + * @param isHit True if this text matched a part of the user's query. + */ + private record LookupHighlightFragment(String text, boolean isHit) { /** Sole constructor. 
*/ - public LookupHighlightFragment(String text, boolean isHit) { - this.text = text; - this.isHit = isHit; - } + private LookupHighlightFragment {} @Override public String toString() { diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java index 75003c9293a6..e61935d9395a 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java @@ -616,18 +616,8 @@ public void testNonExactFirst() throws Exception { } // Holds surface form separately: - private static class TermFreq2 implements Comparable { - public final String surfaceForm; - public final String analyzedForm; - public final long weight; - public final BytesRef payload; - - public TermFreq2(String surfaceForm, String analyzedForm, long weight, BytesRef payload) { - this.surfaceForm = surfaceForm; - this.analyzedForm = analyzedForm; - this.weight = weight; - this.payload = payload; - } + private record TermFreq2(String surfaceForm, String analyzedForm, long weight, BytesRef payload) + implements Comparable { @Override public int compareTo(TermFreq2 other) { diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFuzzySuggester.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFuzzySuggester.java index 14e6ba392d33..68807710129f 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFuzzySuggester.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFuzzySuggester.java @@ -503,16 +503,8 @@ public void testNonExactFirst() throws Exception { } // Holds surface form separately: - private static class TermFreqPayload2 implements Comparable { - public final String surfaceForm; - public final String analyzedForm; - public final long weight; - - public TermFreqPayload2(String surfaceForm, String analyzedForm, long weight) { - this.surfaceForm = surfaceForm; - this.analyzedForm = analyzedForm; - this.weight = weight; - } + private record TermFreqPayload2(String surfaceForm, String analyzedForm, long weight) + implements Comparable { @Override public int compareTo(TermFreqPayload2 other) { @@ -948,7 +940,7 @@ private static String addRandomEdit(String string, int prefixLength) { } return builder.toString(); } - // NOTE: fall through to delete: + // NOTE: fall through to delete: case 2: // Delete input[i] for (int j = i + 1; j < input.length; j++) { diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java index cb2ffc4b608b..6670a0b78538 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java @@ -193,7 +193,7 @@ public void testMostlyFilteredOutDocuments() throws Exception { // if at most half of the top scoring documents have been filtered out // the search should be admissible for a single segment TopSuggestDocs suggest = indexSearcher.suggest(query, num, false); - assertTrue(suggest.totalHits.value >= 1); + assertTrue(suggest.totalHits.value() >= 1); assertEquals("abc_" + topScore, suggest.scoreLookupDocs()[0].key.toString()); 
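Editor's note: the test hunks above and below all make the same mechanical change from suggest.totalHits.value to suggest.totalHits.value(), reflecting that TotalHits is now accessed through record-style methods. A short sketch of the updated call pattern; the helper class, method name, and the -1 sentinel are invented for illustration:

    import java.io.IOException;
    import org.apache.lucene.search.TotalHits;
    import org.apache.lucene.search.suggest.document.PrefixCompletionQuery;
    import org.apache.lucene.search.suggest.document.SuggestIndexSearcher;
    import org.apache.lucene.search.suggest.document.TopSuggestDocs;

    class TotalHitsAccessorExample {
      static long suggestHitCount(SuggestIndexSearcher searcher, PrefixCompletionQuery query)
          throws IOException {
        TopSuggestDocs suggest = searcher.suggest(query, 3, false);
        // Read the count and relation through accessor methods rather than fields.
        if (suggest.totalHits.relation() == TotalHits.Relation.EQUAL_TO) {
          return suggest.totalHits.value();   // was: suggest.totalHits.value
        }
        return -1; // count is only a lower bound
      }
    }

The assertions in these tests are otherwise unchanged; only the field-to-accessor spelling differs.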
assertEquals((float) topScore, suggest.scoreLookupDocs()[0].score, 0); @@ -481,7 +481,7 @@ public void testGhostField() throws Exception { PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "app")); - assertEquals(0, indexSearcher.suggest(query, 3, false).totalHits.value); + assertEquals(0, indexSearcher.suggest(query, 3, false).totalHits.value()); query = new PrefixCompletionQuery(analyzer, new Term("suggest_field2", "app")); assertSuggestions(indexSearcher.suggest(query, 3, false), new Entry("apples", 3)); diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index 2d6e9471c984..4a59d09d35ce 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -138,7 +138,7 @@ public void testEmpty() throws Exception { PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "ab")); TopSuggestDocs lookupDocs = suggestIndexSearcher.suggest(query, 3, false); - assertEquals(0L, lookupDocs.totalHits.value); + assertEquals(0L, lookupDocs.totalHits.value()); reader.close(); iw.close(); } @@ -510,7 +510,7 @@ public Bits getBits(LeafReaderContext context) throws IOException { PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "abc_"), filter); TopSuggestDocs suggest = indexSearcher.suggest(query, num, false); - assertEquals(0L, suggest.totalHits.value); + assertEquals(0L, suggest.totalHits.value()); reader.close(); iw.close(); } @@ -539,7 +539,7 @@ public void testSuggestOnAllDeletedDocuments() throws Exception { PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "abc_")); TopSuggestDocs suggest = indexSearcher.suggest(query, num, false); - assertEquals(0L, suggest.totalHits.value); + assertEquals(0L, suggest.totalHits.value()); reader.close(); iw.close(); @@ -702,7 +702,7 @@ public void testReturnedDocID() throws Exception { PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "abc_")); TopSuggestDocs suggest = indexSearcher.suggest(query, num, false); - assertEquals(num, suggest.totalHits.value); + assertEquals(num, suggest.totalHits.value()); StoredFields storedFields = reader.storedFields(); for (SuggestScoreDoc suggestScoreDoc : suggest.scoreLookupDocs()) { String key = suggestScoreDoc.key.toString(); @@ -745,7 +745,7 @@ public void testScoring() throws Exception { PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", prefix)); TopSuggestDocs suggest = indexSearcher.suggest(query, num, false); - assertTrue(suggest.totalHits.value > 0); + assertTrue(suggest.totalHits.value() > 0); float topScore = -1; for (SuggestScoreDoc scoreDoc : suggest.scoreLookupDocs()) { if (topScore != -1) { @@ -798,7 +798,7 @@ public void testRealisticKeys() throws Exception { PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", title)); TopSuggestDocs suggest = indexSearcher.suggest(query, mappings.size(), false); - assertTrue(suggest.totalHits.value > 0); + assertTrue(suggest.totalHits.value() > 0); boolean matched = false; for (ScoreDoc scoreDoc : suggest.scoreDocs) { matched = Float.compare(scoreDoc.score, (float) entry.getValue()) == 0; @@ -951,7 +951,7 @@ static IndexWriterConfig 
iwcWithSuggestField(Analyzer analyzer, final Set= 0; assert floatValues.dimension() > 0; return floatValues; @@ -139,7 +139,7 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { && fi.getVectorEncoding() == VectorEncoding.BYTE; ByteVectorValues values = delegate.getByteVectorValues(field); assert values != null; - assert values.docID() == -1; + assert values.iterator().docID() == -1; assert values.size() >= 0; assert values.dimension() > 0; return values; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneFixedGap.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneFixedGap.java index 9688c6c0e35f..7c5a0bcf2545 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneFixedGap.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneFixedGap.java @@ -28,9 +28,9 @@ import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter; import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase; import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -38,7 +38,7 @@ // any PostingsFormat and make it ord-able... /** - * Customized version of {@link Lucene912PostingsFormat} that uses {@link FixedGapTermsIndexWriter}. + * Customized version of {@link Lucene101PostingsFormat} that uses {@link FixedGapTermsIndexWriter}. */ public final class LuceneFixedGap extends PostingsFormat { final int termIndexInterval; @@ -54,7 +54,7 @@ public LuceneFixedGap(int termIndexInterval) { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase docs = new Lucene912PostingsWriter(state); + PostingsWriterBase docs = new Lucene101PostingsWriter(state); // TODO: should we make the terms index more easily // pluggable? 
Ie so that this codec would record which @@ -91,7 +91,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postings = new Lucene912PostingsReader(state); + PostingsReaderBase postings = new Lucene101PostingsReader(state); TermsIndexReaderBase indexReader; boolean success = false; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapDocFreqInterval.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapDocFreqInterval.java index 1fafc8f448c7..ff5e0fc07dda 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapDocFreqInterval.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapDocFreqInterval.java @@ -29,9 +29,9 @@ import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -39,7 +39,7 @@ // any PostingsFormat and make it ord-able... /** - * Customized version of {@link Lucene912PostingsFormat} that uses {@link + * Customized version of {@link Lucene101PostingsFormat} that uses {@link * VariableGapTermsIndexWriter} with a fixed interval, but forcing high docfreq terms to be indexed * terms. */ @@ -59,7 +59,7 @@ public LuceneVarGapDocFreqInterval(int docFreqThreshold, int termIndexInterval) @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase docs = new Lucene912PostingsWriter(state); + PostingsWriterBase docs = new Lucene101PostingsWriter(state); // TODO: should we make the terms index more easily // pluggable? 
Ie so that this codec would record which @@ -100,7 +100,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postings = new Lucene912PostingsReader(state); + PostingsReaderBase postings = new Lucene101PostingsReader(state); TermsIndexReaderBase indexReader; boolean success = false; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapFixedInterval.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapFixedInterval.java index bfaf0914651a..7899f3d54363 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapFixedInterval.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapFixedInterval.java @@ -29,9 +29,9 @@ import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -39,7 +39,7 @@ // any PostingsFormat and make it ord-able... /** - * Customized version of {@link Lucene912PostingsFormat} that uses {@link + * Customized version of {@link Lucene101PostingsFormat} that uses {@link * VariableGapTermsIndexWriter} with a fixed interval. */ public final class LuceneVarGapFixedInterval extends PostingsFormat { @@ -56,7 +56,7 @@ public LuceneVarGapFixedInterval(int termIndexInterval) { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase docs = new Lucene912PostingsWriter(state); + PostingsWriterBase docs = new Lucene101PostingsWriter(state); // TODO: should we make the terms index more easily // pluggable? 
Ie so that this codec would record which @@ -95,7 +95,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postings = new Lucene912PostingsReader(state); + PostingsReaderBase postings = new Lucene101PostingsReader(state); TermsIndexReaderBase indexReader; boolean success = false; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyCompoundFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyCompoundFormat.java index bced58d2a6f3..4e0ac271859d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyCompoundFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyCompoundFormat.java @@ -34,9 +34,8 @@ class CrankyCompoundFormat extends CompoundFormat { } @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { - return delegate.getCompoundReader(dir, si, context); + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { + return delegate.getCompoundReader(dir, si); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/mockrandom/MockRandomPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/mockrandom/MockRandomPostingsFormat.java index 6b10e7b19f49..8770e0d81fa2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/mockrandom/MockRandomPostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/mockrandom/MockRandomPostingsFormat.java @@ -35,10 +35,10 @@ import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter; import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsReader; import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsWriter; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; import org.apache.lucene.codecs.memory.FSTTermsReader; import org.apache.lucene.codecs.memory.FSTTermsWriter; import org.apache.lucene.index.FieldInfo; @@ -121,7 +121,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException random.nextInt(); // consume a random for buffersize - PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); final FieldsConsumer fields; final int t1 = random.nextInt(4); @@ -289,7 +289,7 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException System.out.println("MockRandomCodec: readBufferSize=" + readBufferSize); } - PostingsReaderBase postingsReader = new Lucene912PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene101PostingsReader(state); final FieldsProducer fields; final int t1 = random.nextInt(4); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/ramonly/RAMOnlyPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/ramonly/RAMOnlyPostingsFormat.java index 
abc0299db2d2..be547fa29ef8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/ramonly/RAMOnlyPostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/ramonly/RAMOnlyPostingsFormat.java @@ -327,9 +327,9 @@ public RAMPostingsWriterImpl startTerm(BytesRef text) { } public void finishTerm(BytesRef text, TermStats stats) { - assert stats.docFreq > 0; - assert stats.docFreq == current.docs.size(); - current.totalTermFreq = stats.totalTermFreq; + assert stats.docFreq() > 0; + assert stats.docFreq() == current.docs.size(); + current.totalTermFreq = stats.totalTermFreq(); field.termToDocs.put(current.term, current); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java index 4b13531327b1..1961ecc746e2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java @@ -23,8 +23,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; import org.apache.lucene.codecs.uniformsplit.BlockDecoder; import org.apache.lucene.codecs.uniformsplit.BlockEncoder; import org.apache.lucene.codecs.uniformsplit.IndexDictionary; @@ -67,7 +67,7 @@ public static void resetEncodingFlags() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState segmentWriteState) throws IOException { - PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(segmentWriteState); + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(segmentWriteState); boolean success = false; try { FieldsConsumer fieldsConsumer = createFieldsConsumer(segmentWriteState, postingsWriter); @@ -145,7 +145,7 @@ public void writeTo(DataOutput dataOutput) throws IOException { @Override public FieldsProducer fieldsProducer(SegmentReadState segmentReadState) throws IOException { - PostingsReaderBase postingsReader = new Lucene912PostingsReader(segmentReadState); + PostingsReaderBase postingsReader = new Lucene101PostingsReader(segmentReadState); boolean success = false; try { FieldsProducer fieldsProducer = createFieldsProducer(segmentReadState, postingsReader); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseGeoPointTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseGeoPointTestCase.java index d371292324d5..6861188f6994 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseGeoPointTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseGeoPointTestCase.java @@ -39,6 +39,7 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.geo.Circle; import org.apache.lucene.geo.Component2D; +import org.apache.lucene.geo.GeoEncodingUtils; import org.apache.lucene.geo.GeoUtils; import org.apache.lucene.geo.LatLonGeometry; import org.apache.lucene.geo.Polygon; @@ -1642,18 +1643,18 @@ private TopDocs searchSmallSet(Query query, int size) throws 
Exception { public void testSmallSetRect() throws Exception { TopDocs td = searchSmallSet(newRectQuery("point", 32.778, 32.779, -96.778, -96.777), 5); - assertEquals(4, td.totalHits.value); + assertEquals(4, td.totalHits.value()); } public void testSmallSetDateline() throws Exception { TopDocs td = searchSmallSet(newRectQuery("point", -45.0, -44.0, 179.0, -179.0), 20); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); } public void testSmallSetMultiValued() throws Exception { TopDocs td = searchSmallSet(newRectQuery("point", 32.755, 32.776, -96.454, -96.770), 20); // 3 single valued docs + 2 multi-valued docs - assertEquals(5, td.totalHits.value); + assertEquals(5, td.totalHits.value()); } public void testSmallSetWholeMap() throws Exception { @@ -1666,7 +1667,7 @@ public void testSmallSetWholeMap() throws Exception { GeoUtils.MIN_LON_INCL, GeoUtils.MAX_LON_INCL), 20); - assertEquals(24, td.totalHits.value); + assertEquals(24, td.totalHits.value()); } public void testSmallSetPoly() throws Exception { @@ -1690,7 +1691,7 @@ public void testSmallSetPoly() throws Exception { -96.6041564, -96.7449188, -96.76826477, -96.7682647 })), 5); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); } public void testSmallSetPolyWholeMap() throws Exception { @@ -1714,18 +1715,18 @@ public void testSmallSetPolyWholeMap() throws Exception { GeoUtils.MIN_LON_INCL })), 20); - assertEquals("testWholeMap failed", 24, td.totalHits.value); + assertEquals("testWholeMap failed", 24, td.totalHits.value()); } public void testSmallSetDistance() throws Exception { TopDocs td = searchSmallSet(newDistanceQuery("point", 32.94823588839368, -96.4538113027811, 6000), 20); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); } public void testSmallSetTinyDistance() throws Exception { TopDocs td = searchSmallSet(newDistanceQuery("point", 40.720611, -73.998776, 1), 20); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); } /** see https://issues.apache.org/jira/browse/LUCENE-6905 */ @@ -1734,7 +1735,7 @@ public void testSmallSetDistanceNotEmpty() throws Exception { searchSmallSet( newDistanceQuery("point", -88.56029371730983, -177.23537676036358, 7757.999232959935), 20); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); } /** Explicitly large */ @@ -1742,13 +1743,50 @@ public void testSmallSetHugeDistance() throws Exception { TopDocs td = searchSmallSet( newDistanceQuery("point", 32.94823588839368, -96.4538113027811, 6000000), 20); - assertEquals(16, td.totalHits.value); + assertEquals(16, td.totalHits.value()); } public void testSmallSetDistanceDateline() throws Exception { TopDocs td = searchSmallSet( newDistanceQuery("point", 32.94823588839368, -179.9538113027811, 120000), 20); - assertEquals(3, td.totalHits.value); + assertEquals(3, td.totalHits.value()); + } + + public void testNarrowPolygonCloseToNorthPole() throws Exception { + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setMergeScheduler(new SerialMergeScheduler()); + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, iwc); + + // index point closes to Lat 90 + Document doc = new Document(); + final int base = Integer.MAX_VALUE; + addPointToDoc( + FIELD_NAME, + doc, + GeoEncodingUtils.decodeLatitude(base - 2), + GeoEncodingUtils.decodeLongitude(base - 2)); + w.addDocument(doc); + w.flush(); + + // query testing + final IndexReader reader = DirectoryReader.open(w); + final IndexSearcher s = 
newSearcher(reader); + + double minLat = GeoEncodingUtils.decodeLatitude(base - 3); + double maxLat = GeoEncodingUtils.decodeLatitude(base); + double minLon = GeoEncodingUtils.decodeLongitude(base - 3); + double maxLon = GeoEncodingUtils.decodeLongitude(base); + + Query query = + newPolygonQuery( + FIELD_NAME, + new Polygon( + new double[] {minLat, minLat, maxLat, maxLat, minLat}, + new double[] {minLon, maxLon, maxLon, minLon, minLon})); + + assertEquals(1, s.count(query)); + IOUtils.close(w, reader, dir); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseXYPointTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseXYPointTestCase.java index 3a733b5462f2..d9deb9fbeede 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseXYPointTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseXYPointTestCase.java @@ -1475,18 +1475,18 @@ private TopDocs searchSmallSet(Query query, int size) throws Exception { public void testSmallSetRect() throws Exception { TopDocs td = searchSmallSet(newRectQuery("point", 32.778f, 32.779f, -96.778f, -96.777f), 5); - assertEquals(4, td.totalHits.value); + assertEquals(4, td.totalHits.value()); } public void testSmallSetRect2() throws Exception { TopDocs td = searchSmallSet(newRectQuery("point", -45.0f, -44.0f, -180.0f, 180.0f), 20); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); } public void testSmallSetMultiValued() throws Exception { TopDocs td = searchSmallSet(newRectQuery("point", 32.755f, 32.776f, -180f, 180.770f), 20); // 3 single valued docs + 2 multi-valued docs - assertEquals(5, td.totalHits.value); + assertEquals(5, td.totalHits.value()); } public void testSmallSetWholeSpace() throws Exception { @@ -1495,7 +1495,7 @@ public void testSmallSetWholeSpace() throws Exception { newRectQuery( "point", -Float.MAX_VALUE, Float.MAX_VALUE, -Float.MAX_VALUE, Float.MAX_VALUE), 20); - assertEquals(24, td.totalHits.value); + assertEquals(24, td.totalHits.value()); } public void testSmallSetPoly() throws Exception { @@ -1525,7 +1525,7 @@ public void testSmallSetPoly() throws Exception { -96.7682647f })), 5); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); } public void testSmallSetPolyWholeSpace() throws Exception { @@ -1549,18 +1549,18 @@ public void testSmallSetPolyWholeSpace() throws Exception { -Float.MAX_VALUE })), 20); - assertEquals("testWholeMap failed", 24, td.totalHits.value); + assertEquals("testWholeMap failed", 24, td.totalHits.value()); } public void testSmallSetDistance() throws Exception { TopDocs td = searchSmallSet(newDistanceQuery("point", 32.94823588839368f, -96.4538113027811f, 6.0f), 20); - assertEquals(11, td.totalHits.value); + assertEquals(11, td.totalHits.value()); } public void testSmallSetTinyDistance() throws Exception { TopDocs td = searchSmallSet(newDistanceQuery("point", 40.720611f, -73.998776f, 0.1f), 20); - assertEquals(2, td.totalHits.value); + assertEquals(2, td.totalHits.value()); } /** Explicitly large */ @@ -1568,6 +1568,6 @@ public void testSmallSetHugeDistance() throws Exception { TopDocs td = searchSmallSet( newDistanceQuery("point", 32.94823588839368f, -96.4538113027811f, Float.MAX_VALUE), 20); - assertEquals(24, td.totalHits.value); + assertEquals(24, td.totalHits.value()); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java 
index 3151754a075b..ae6e88138906 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java @@ -21,8 +21,10 @@ import java.util.Iterator; import java.util.List; import java.util.Objects; +import java.util.RandomAccess; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; @@ -153,6 +155,12 @@ public AssertingTermVectors(TermVectors in) { this.in = in; } + @Override + public void prefetch(int docID) throws IOException { + assertThread("TermVectors", creationThread); + in.prefetch(docID); + } + @Override public Fields get(int doc) throws IOException { assertThread("TermVectors", creationThread); @@ -693,7 +701,10 @@ public int getDocIdUpTo(int level) { public List getImpacts(int level) { assert validFor == Math.max(impactsEnum.docID(), impactsEnum.lastShallowTarget) : "Cannot reuse impacts after advancing the iterator"; - return in.getImpacts(level); + List impacts = in.getImpacts(level); + assert impacts.size() <= 1 || impacts instanceof RandomAccess + : "impact lists longer than 1 should implement RandomAccess but saw impacts = " + impacts; + return impacts; } } @@ -1619,10 +1630,10 @@ public DocValuesSkipper getDocValuesSkipper(String field) throws IOException { DocValuesSkipper skipper = super.getDocValuesSkipper(field); FieldInfo fi = getFieldInfos().fieldInfo(field); if (skipper != null) { - assert fi.hasDocValuesSkipIndex(); + assert fi.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE; return new AssertingDocValuesSkipper(skipper); } else { - assert fi == null || fi.hasDocValuesSkipIndex() == false; + assert fi == null || fi.docValuesSkipIndexType() == DocValuesSkipIndexType.NONE; return null; } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java index b0e30ef2272e..0fe563d75508 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java @@ -64,7 +64,7 @@ public void testEmpty() throws IOException { SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptySet()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); assertEquals(0, cfs.listAll().length); cfs.close(); dir.close(); @@ -84,7 +84,7 @@ public void testSingleFile() throws IOException { si.setFiles(Collections.singleton(testfile)); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); IndexInput expected = dir.openInput(testfile, newIOContext(random())); IndexInput actual = cfs.openInput(testfile, newIOContext(random())); @@ -107,7 +107,7 @@ public void testTwoFiles() throws IOException { si.setFiles(Arrays.asList(files)); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - 
Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); for (String file : files) { IndexInput expected = dir.openInput(file, newIOContext(random())); @@ -136,7 +136,7 @@ public void testDoubleClose() throws IOException { si.setFiles(Collections.singleton(testfile)); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); assertEquals(1, cfs.listAll().length); cfs.close(); cfs.close(); // second close should not throw exception @@ -215,10 +215,7 @@ public void testListAll() throws Exception { for (SegmentCommitInfo si : infos) { if (si.info.getUseCompoundFile()) { try (Directory cfsDir = - si.info - .getCodec() - .compoundFormat() - .getCompoundReader(dir, si.info, newIOContext(random()))) { + si.info.getCodec().compoundFormat().getCompoundReader(dir, si.info)) { for (String cfsFile : cfsDir.listAll()) { try (IndexInput cfsIn = cfsDir.openInput(cfsFile, IOContext.DEFAULT)) { assert cfsIn != null; @@ -237,7 +234,7 @@ public void testCreateOutputDisabled() throws IOException { SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -260,7 +257,7 @@ public void testDeleteFileDisabled() throws IOException { SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -283,7 +280,7 @@ public void testRenameFileDisabled() throws IOException { SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -306,7 +303,7 @@ public void testSyncDisabled() throws IOException { SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -329,7 +326,7 @@ public void testMakeLockDisabled() throws IOException { SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -374,7 +371,7 @@ public void testRandomFiles() 
throws IOException { si.setFiles(files); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); for (String file : files) { IndexInput check = dir.openInput(file, newIOContext(random())); @@ -411,7 +408,7 @@ public void testManySubFiles() throws IOException { si.setFiles(files); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); final IndexInput[] ins = new IndexInput[FILE_COUNT]; for (int fileIdx = 0; fileIdx < FILE_COUNT; fileIdx++) { @@ -793,7 +790,7 @@ protected static Directory createLargeCFS(Directory dir) throws IOException { si.setFiles(files); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); return cfs; } @@ -817,7 +814,7 @@ public void testResourceNameInsideCompoundFile() throws Exception { si.setFiles(Collections.singletonList(subFile)); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); IndexInput in = cfs.openInput(subFile, IOContext.DEFAULT); String desc = in.toString(); assertTrue( @@ -899,7 +896,7 @@ public void testCheckIntegrity() throws IOException { ReadBytesDirectoryWrapper readTrackingDir = new ReadBytesDirectoryWrapper(dir); CompoundDirectory compoundDir = - si.getCodec().compoundFormat().getCompoundReader(readTrackingDir, si, IOContext.DEFAULT); + si.getCodec().compoundFormat().getCompoundReader(readTrackingDir, si); compoundDir.checkIntegrity(); Map readBytes = readTrackingDir.getReadBytes(); assertEquals(createdFiles, readBytes.keySet()); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java index a312b42a910a..9b99aeecba64 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java @@ -24,6 +24,7 @@ import java.io.PrintStream; import java.util.function.Supplier; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericDocValuesField; @@ -31,22 +32,26 @@ import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; +import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import 
org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.TestUtil; @@ -832,4 +837,74 @@ private interface DocValuesWrapper { int docID(); } + + public void testMismatchedFields() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new BinaryDocValuesField("binary", new BytesRef("lucene"))); + doc.add(new NumericDocValuesField("numeric", 0L)); + doc.add(new SortedDocValuesField("sorted", new BytesRef("search"))); + doc.add(new SortedNumericDocValuesField("sorted_numeric", 1L)); + doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef("engine"))); + w1.addDocument(doc); + + Directory dir2 = newDirectory(); + IndexWriter w2 = + new IndexWriter(dir2, newIndexWriterConfig().setMergeScheduler(new SerialMergeScheduler())); + w2.addDocument(doc); + w2.commit(); + + DirectoryReader reader = DirectoryReader.open(w1); + w1.close(); + w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random())); + reader.close(); + w2.forceMerge(1); + reader = DirectoryReader.open(w2); + w2.close(); + + LeafReader leafReader = getOnlyLeafReader(reader); + + BinaryDocValues bdv = leafReader.getBinaryDocValues("binary"); + assertNotNull(bdv); + assertEquals(0, bdv.nextDoc()); + assertEquals(new BytesRef("lucene"), bdv.binaryValue()); + assertEquals(1, bdv.nextDoc()); + assertEquals(new BytesRef("lucene"), bdv.binaryValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, bdv.nextDoc()); + + NumericDocValues ndv = leafReader.getNumericDocValues("numeric"); + assertNotNull(ndv); + assertEquals(0, ndv.nextDoc()); + assertEquals(0, ndv.longValue()); + assertEquals(1, ndv.nextDoc()); + assertEquals(0, ndv.longValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, ndv.nextDoc()); + + SortedDocValues sdv = leafReader.getSortedDocValues("sorted"); + assertNotNull(sdv); + assertEquals(0, sdv.nextDoc()); + assertEquals(new BytesRef("search"), sdv.lookupOrd(sdv.ordValue())); + assertEquals(1, sdv.nextDoc()); + assertEquals(new BytesRef("search"), sdv.lookupOrd(sdv.ordValue())); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, sdv.nextDoc()); + + SortedNumericDocValues sndv = leafReader.getSortedNumericDocValues("sorted_numeric"); + assertNotNull(sndv); + assertEquals(0, sndv.nextDoc()); + assertEquals(1, sndv.nextValue()); + assertEquals(1, sndv.nextDoc()); + assertEquals(1, sndv.nextValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, sndv.nextDoc()); + + SortedSetDocValues ssdv = leafReader.getSortedSetDocValues("sorted_set"); + assertNotNull(ssdv); + assertEquals(0, ssdv.nextDoc()); + assertEquals(new BytesRef("engine"), ssdv.lookupOrd(ssdv.nextOrd())); + assertEquals(1, ssdv.nextDoc()); + assertEquals(new BytesRef("engine"), ssdv.lookupOrd(ssdv.nextOrd())); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, ssdv.nextDoc()); + + IOUtils.close(reader, w2, dir1, dir2); + } } diff --git 
a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java index 7d4f2839c6d6..ba2c6f362a3c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java @@ -30,6 +30,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -87,7 +88,7 @@ public void testOneField() throws Exception { assertFalse(infos2.fieldInfo("field").getDocValuesType() != DocValuesType.NONE); assertFalse(infos2.fieldInfo("field").omitsNorms()); assertFalse(infos2.fieldInfo("field").hasPayloads()); - assertFalse(infos2.fieldInfo("field").hasVectors()); + assertFalse(infos2.fieldInfo("field").hasTermVectors()); assertEquals(0, infos2.fieldInfo("field").getPointDimensionCount()); assertEquals(0, infos2.fieldInfo("field").getVectorDimension()); assertFalse(infos2.fieldInfo("field").isSoftDeletesField()); @@ -303,14 +304,14 @@ public void testRandom() throws Exception { storePayloads = random().nextBoolean(); } } - boolean hasDocValuesSkipIndex = false; + DocValuesSkipIndexType docValuesSkipIndexType = DocValuesSkipIndexType.NONE; if (EnumSet.of( DocValuesType.NUMERIC, DocValuesType.SORTED, DocValuesType.SORTED_NUMERIC, DocValuesType.SORTED_SET) .contains(fieldType.docValuesType())) { - hasDocValuesSkipIndex = fieldType.hasDocValuesSkipIndex(); + docValuesSkipIndexType = fieldType.docValuesSkipIndexType(); } FieldInfo fi = new FieldInfo( @@ -321,7 +322,7 @@ public void testRandom() throws Exception { storePayloads, fieldType.indexOptions(), fieldType.docValuesType(), - hasDocValuesSkipIndex, + docValuesSkipIndexType, -1, new HashMap<>(), fieldType.pointDimensionCount(), @@ -374,7 +375,10 @@ private IndexableFieldType randomFieldType(Random r, String fieldName) { || current == DocValuesType.SORTED_NUMERIC || current == DocValuesType.SORTED || current == DocValuesType.SORTED_SET) { - type.setDocValuesSkipIndex(supportDocValuesSkipIndex() && random().nextBoolean()); + type.setDocValuesSkipIndexType( + supportDocValuesSkipIndex() + ? 
DocValuesSkipIndexType.RANGE + : DocValuesSkipIndexType.NONE); } } @@ -414,11 +418,11 @@ protected void assertEquals(FieldInfo expected, FieldInfo actual) { assertEquals(expected.number, actual.number); assertEquals(expected.name, actual.name); assertEquals(expected.getDocValuesType(), actual.getDocValuesType()); - assertEquals(expected.hasDocValuesSkipIndex(), actual.hasDocValuesSkipIndex()); + assertEquals(expected.docValuesSkipIndexType(), actual.docValuesSkipIndexType()); assertEquals(expected.getIndexOptions(), actual.getIndexOptions()); assertEquals(expected.hasNorms(), actual.hasNorms()); assertEquals(expected.hasPayloads(), actual.hasPayloads()); - assertEquals(expected.hasVectors(), actual.hasVectors()); + assertEquals(expected.hasTermVectors(), actual.hasTermVectors()); assertEquals(expected.omitsNorms(), actual.omitsNorms()); assertEquals(expected.getDocValuesGen(), actual.getDocValuesGen()); } @@ -455,7 +459,7 @@ private FieldInfo createFieldInfo() { false, TextField.TYPE_STORED.indexOptions(), DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java index 6f2bc38ff643..297c1b777f53 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java @@ -352,12 +352,12 @@ public void testMultiClose() throws IOException { new FieldInfo( proto.name, proto.number, - proto.hasVectors(), + proto.hasTermVectors(), proto.omitsNorms(), proto.hasPayloads(), proto.getIndexOptions(), proto.getDocValuesType(), - proto.hasDocValuesSkipIndex(), + proto.docValuesSkipIndexType(), proto.getDocValuesGen(), new HashMap<>(), proto.getPointDimensionCount(), diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index a10d26423494..752f21ea5d7a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -16,22 +16,31 @@ */ package org.apache.lucene.tests.index; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.atomic.AtomicReference; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import 
org.apache.lucene.document.Field; import org.apache.lucene.document.KnnByteVectorField; @@ -43,6 +52,7 @@ import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -51,16 +61,25 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.MergeScheduler; +import org.apache.lucene.index.MergeTrigger; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; @@ -71,6 +90,7 @@ import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.VectorUtil; @@ -99,8 +119,8 @@ public void init() { protected void addRandomFields(Document doc) { switch (vectorEncoding) { case BYTE -> doc.add(new KnnByteVectorField("v2", randomVector8(30), similarityFunction)); - case FLOAT32 -> doc.add( - new KnnFloatVectorField("v2", randomNormalizedVector(30), similarityFunction)); + case FLOAT32 -> + doc.add(new KnnFloatVectorField("v2", randomNormalizedVector(30), similarityFunction)); } } @@ -230,6 +250,106 @@ public void testIllegalDimChangeTwoWriters() throws Exception { } } + public void testMergingWithDifferentKnnFields() throws Exception { + try (var dir = newDirectory()) { + IndexWriterConfig iwc = new IndexWriterConfig(); + Codec codec = getCodec(); + if (codec.knnVectorsFormat() instanceof PerFieldKnnVectorsFormat perFieldKnnVectorsFormat) { + final KnnVectorsFormat format = + perFieldKnnVectorsFormat.getKnnVectorsFormatForField("field"); + iwc.setCodec( + new FilterCodec(codec.getName(), codec) { + @Override + public KnnVectorsFormat knnVectorsFormat() { + return format; + } + }); + } + TestMergeScheduler mergeScheduler = new TestMergeScheduler(); + iwc.setMergeScheduler(mergeScheduler); + iwc.setMergePolicy(new ForceMergePolicy(iwc.getMergePolicy())); + try (var writer = new IndexWriter(dir, iwc)) { + for (int i = 0; i < 10; i++) { + var doc = new Document(); + doc.add(new KnnFloatVectorField("field", new float[] {i, i + 1, i + 2, i + 3})); + writer.addDocument(doc); + } + writer.commit(); + for (int i = 0; i < 10; i++) { + var doc = new Document(); + doc.add(new KnnFloatVectorField("otherVector", new float[] {i, i, i, i})); + writer.addDocument(doc); + } + 
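
The FilterCodec construction in the test above (and in its byte-vector twin that follows) pins one concrete KnnVectorsFormat for every field: when the codec under test routes vectors through PerFieldKnnVectorsFormat, the test resolves whatever format the dispatcher would pick for a single field and serves it unconditionally, so the concrete format itself sees segments whose vector fields differ. A minimal standalone sketch of that pattern, with the field name "vec" purely illustrative:

    Codec base = Codec.getDefault();
    KnnVectorsFormat perField = base.knnVectorsFormat();
    // Resolve the format the per-field dispatcher would pick for "vec",
    // then serve it for every field through a delegating FilterCodec.
    KnnVectorsFormat pinned =
        (perField instanceof PerFieldKnnVectorsFormat pf)
            ? pf.getKnnVectorsFormatForField("vec")
            : perField;
    IndexWriterConfig iwc =
        new IndexWriterConfig()
            .setCodec(
                new FilterCodec(base.getName(), base) {
                  @Override
                  public KnnVectorsFormat knnVectorsFormat() {
                    return pinned;
                  }
                });
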
writer.commit(); + writer.forceMerge(1); + assertNull(mergeScheduler.ex.get()); + } + } + } + + public void testMergingWithDifferentByteKnnFields() throws Exception { + try (var dir = newDirectory()) { + IndexWriterConfig iwc = new IndexWriterConfig(); + Codec codec = getCodec(); + if (codec.knnVectorsFormat() instanceof PerFieldKnnVectorsFormat perFieldKnnVectorsFormat) { + final KnnVectorsFormat format = + perFieldKnnVectorsFormat.getKnnVectorsFormatForField("field"); + iwc.setCodec( + new FilterCodec(codec.getName(), codec) { + @Override + public KnnVectorsFormat knnVectorsFormat() { + return format; + } + }); + } + TestMergeScheduler mergeScheduler = new TestMergeScheduler(); + iwc.setMergeScheduler(mergeScheduler); + iwc.setMergePolicy(new ForceMergePolicy(iwc.getMergePolicy())); + try (var writer = new IndexWriter(dir, iwc)) { + for (int i = 0; i < 10; i++) { + var doc = new Document(); + doc.add( + new KnnByteVectorField("field", new byte[] {(byte) i, (byte) i, (byte) i, (byte) i})); + writer.addDocument(doc); + } + writer.commit(); + for (int i = 0; i < 10; i++) { + var doc = new Document(); + doc.add( + new KnnByteVectorField( + "otherVector", new byte[] {(byte) i, (byte) i, (byte) i, (byte) i})); + writer.addDocument(doc); + } + writer.commit(); + writer.forceMerge(1); + assertNull(mergeScheduler.ex.get()); + } + } + } + + private static final class TestMergeScheduler extends MergeScheduler { + AtomicReference ex = new AtomicReference<>(); + + @Override + public void merge(MergeSource mergeSource, MergeTrigger trigger) throws IOException { + while (true) { + MergePolicy.OneMerge merge = mergeSource.getNextMerge(); + if (merge == null) { + break; + } + try { + mergeSource.merge(merge); + } catch (IllegalStateException | IllegalArgumentException e) { + ex.set(e); + break; + } + } + } + + @Override + public void close() {} + } + @SuppressWarnings("unchecked") public void testWriterRamEstimate() throws Exception { final FieldInfos fieldInfos = new FieldInfos(new FieldInfo[0]); @@ -271,7 +391,7 @@ public void testWriterRamEstimate() throws Exception { false, IndexOptions.NONE, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, Map.of(), 0, @@ -328,9 +448,10 @@ public void testAddIndexesDirectory0() throws Exception { try (IndexReader reader = DirectoryReader.open(w2)) { LeafReader r = getOnlyLeafReader(reader); FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); - assertEquals(0, vectorValues.nextDoc()); - assertEquals(0, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertEquals(0, iterator.nextDoc()); + assertEquals(0, vectorValues.vectorValue(0)[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } } @@ -353,9 +474,10 @@ public void testAddIndexesDirectory1() throws Exception { try (IndexReader reader = DirectoryReader.open(w2)) { LeafReader r = getOnlyLeafReader(reader); FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); - assertNotEquals(NO_MORE_DOCS, vectorValues.nextDoc()); - assertEquals(0, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertNotEquals(NO_MORE_DOCS, iterator.nextDoc()); + assertEquals(0, vectorValues.vectorValue(iterator.index())[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } } @@ -380,12 +502,13 @@ public void testAddIndexesDirectory01() throws Exception { try 
(IndexReader reader = DirectoryReader.open(w2)) { LeafReader r = getOnlyLeafReader(reader); FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); - assertEquals(0, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertEquals(0, iterator.nextDoc()); // The merge order is randomized, we might get 0 first, or 1 - float value = vectorValues.vectorValue()[0]; + float value = vectorValues.vectorValue(0)[0]; assertTrue(value == 0 || value == 1); - assertEquals(1, vectorValues.nextDoc()); - value += vectorValues.vectorValue()[0]; + assertEquals(1, iterator.nextDoc()); + value += vectorValues.vectorValue(1)[0]; assertEquals(1, value, 0); } } @@ -770,8 +893,10 @@ public void testSparseVectors() throws Exception { ByteVectorValues byteVectorValues = ctx.reader().getByteVectorValues(fieldName); if (byteVectorValues != null) { docCount += byteVectorValues.size(); - while (byteVectorValues.nextDoc() != NO_MORE_DOCS) { - checksum += byteVectorValues.vectorValue()[0]; + KnnVectorValues.DocIndexIterator iterator = byteVectorValues.iterator(); + while (true) { + if (!(iterator.nextDoc() != NO_MORE_DOCS)) break; + checksum += byteVectorValues.vectorValue(iterator.index())[0]; } } } @@ -781,8 +906,10 @@ public void testSparseVectors() throws Exception { FloatVectorValues vectorValues = ctx.reader().getFloatVectorValues(fieldName); if (vectorValues != null) { docCount += vectorValues.size(); - while (vectorValues.nextDoc() != NO_MORE_DOCS) { - checksum += vectorValues.vectorValue()[0]; + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + while (true) { + if (!(iterator.nextDoc() != NO_MORE_DOCS)) break; + checksum += vectorValues.vectorValue(iterator.index())[0]; } } } @@ -841,10 +968,12 @@ public void testFloatVectorScorerIteration() throws Exception { assertSame(iterator, scorer.iterator()); assertNotSame(iterator, scorer); // verify scorer iteration scores are valid & iteration with vectorValues is consistent - while (iterator.nextDoc() != NO_MORE_DOCS && vectorValues.nextDoc() != NO_MORE_DOCS) { + KnnVectorValues.DocIndexIterator valuesIterator = vectorValues.iterator(); + while (iterator.nextDoc() != NO_MORE_DOCS) { + if (!(valuesIterator.nextDoc() != NO_MORE_DOCS)) break; float score = scorer.score(); assertTrue(score >= 0f); - assertEquals(iterator.docID(), vectorValues.docID()); + assertEquals(iterator.docID(), valuesIterator.docID()); } // verify that a new scorer can be obtained after iteration VectorScorer newScorer = vectorValues.scorer(vectorToScore); @@ -900,10 +1029,12 @@ public void testByteVectorScorerIteration() throws Exception { assertSame(iterator, scorer.iterator()); assertNotSame(iterator, scorer); // verify scorer iteration scores are valid & iteration with vectorValues is consistent - while (iterator.nextDoc() != NO_MORE_DOCS && vectorValues.nextDoc() != NO_MORE_DOCS) { + KnnVectorValues.DocIndexIterator valuesIterator = vectorValues.iterator(); + while (iterator.nextDoc() != NO_MORE_DOCS) { + if (!(valuesIterator.nextDoc() != NO_MORE_DOCS)) break; float score = scorer.score(); assertTrue(score >= 0f); - assertEquals(iterator.docID(), vectorValues.docID()); + assertEquals(iterator.docID(), valuesIterator.docID()); } // verify that a new scorer can be obtained after iteration VectorScorer newScorer = vectorValues.scorer(vectorToScore); @@ -1009,12 +1140,16 @@ public void testIndexedValueNotAliased() throws Exception { LeafReader r = getOnlyLeafReader(reader); FloatVectorValues vectorValues = 
r.getFloatVectorValues(fieldName); assertEquals(3, vectorValues.size()); - vectorValues.nextDoc(); - assertEquals(1, vectorValues.vectorValue()[0], 0); - vectorValues.nextDoc(); - assertEquals(1, vectorValues.vectorValue()[0], 0); - vectorValues.nextDoc(); - assertEquals(2, vectorValues.vectorValue()[0], 0); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + iterator.nextDoc(); + assertEquals(0, iterator.index()); + assertEquals(1, vectorValues.vectorValue(0)[0], 0); + iterator.nextDoc(); + assertEquals(1, iterator.index()); + assertEquals(1, vectorValues.vectorValue(1)[0], 0); + iterator.nextDoc(); + assertEquals(2, iterator.index()); + assertEquals(2, vectorValues.vectorValue(2)[0], 0); } } } @@ -1037,13 +1172,14 @@ public void testSortedIndex() throws Exception { FloatVectorValues vectorValues = leaf.getFloatVectorValues(fieldName); assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); - assertEquals("1", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(-1f, vectorValues.vectorValue()[0], 0); - assertEquals("2", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(1, vectorValues.vectorValue()[0], 0); - assertEquals("4", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(0, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertEquals("1", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(-1f, vectorValues.vectorValue(0)[0], 0); + assertEquals("2", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(1, vectorValues.vectorValue(1)[0], 0); + assertEquals("4", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(0, vectorValues.vectorValue(2)[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } } @@ -1066,13 +1202,13 @@ public void testSortedIndexBytes() throws Exception { ByteVectorValues vectorValues = leaf.getByteVectorValues(fieldName); assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); - assertEquals("1", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(-1, vectorValues.vectorValue()[0], 0); - assertEquals("2", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(1, vectorValues.vectorValue()[0], 0); - assertEquals("4", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(0, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + assertEquals("1", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); + assertEquals(-1, vectorValues.vectorValue(0)[0], 0); + assertEquals("2", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); + assertEquals(1, vectorValues.vectorValue(1)[0], 0); + assertEquals("4", storedFields.document(vectorValues.iterator().nextDoc()).get("id")); + assertEquals(0, vectorValues.vectorValue(2)[0], 0); + assertEquals(NO_MORE_DOCS, vectorValues.iterator().nextDoc()); } } } @@ -1102,27 +1238,30 @@ public void testIndexMultipleKnnVectorFields() throws Exception { FloatVectorValues vectorValues = leaf.getFloatVectorValues("field1"); assertEquals(2, vectorValues.dimension()); assertEquals(2, vectorValues.size()); - vectorValues.nextDoc(); - assertEquals(1f, vectorValues.vectorValue()[0], 0); - vectorValues.nextDoc(); - assertEquals(2f, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + 
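
The added lines that follow, like the hunks above, all apply the same migration: FloatVectorValues and ByteVectorValues are no longer advanced directly; callers obtain a KnnVectorValues.DocIndexIterator for doc IDs and fetch vectors by ordinal. A minimal sketch of the new-style loop, assuming a leaf reader and a field name:

    FloatVectorValues values = leafReader.getFloatVectorValues("field");
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      // it.index() is the ordinal of the current vector; vectors are addressed by ordinal.
      float[] vector = values.vectorValue(it.index());
    }
    // Ordinal-order access also works: values.ordToDoc(ord) maps an ordinal back to its doc ID.
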
KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + iterator.nextDoc(); + assertEquals(1f, vectorValues.vectorValue(0)[0], 0); + iterator.nextDoc(); + assertEquals(2f, vectorValues.vectorValue(1)[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); FloatVectorValues vectorValues2 = leaf.getFloatVectorValues("field2"); + KnnVectorValues.DocIndexIterator it2 = vectorValues2.iterator(); assertEquals(4, vectorValues2.dimension()); assertEquals(2, vectorValues2.size()); - vectorValues2.nextDoc(); - assertEquals(2f, vectorValues2.vectorValue()[1], 0); - vectorValues2.nextDoc(); - assertEquals(2f, vectorValues2.vectorValue()[1], 0); - assertEquals(NO_MORE_DOCS, vectorValues2.nextDoc()); + it2.nextDoc(); + assertEquals(2f, vectorValues2.vectorValue(0)[1], 0); + it2.nextDoc(); + assertEquals(2f, vectorValues2.vectorValue(1)[1], 0); + assertEquals(NO_MORE_DOCS, it2.nextDoc()); FloatVectorValues vectorValues3 = leaf.getFloatVectorValues("field3"); assertEquals(4, vectorValues3.dimension()); assertEquals(1, vectorValues3.size()); - vectorValues3.nextDoc(); - assertEquals(1f, vectorValues3.vectorValue()[0], 0.1); - assertEquals(NO_MORE_DOCS, vectorValues3.nextDoc()); + KnnVectorValues.DocIndexIterator it3 = vectorValues3.iterator(); + it3.nextDoc(); + assertEquals(1f, vectorValues3.vectorValue(0)[0], 0.1); + assertEquals(NO_MORE_DOCS, it3.nextDoc()); } } } @@ -1186,13 +1325,15 @@ public void testRandom() throws Exception { totalSize += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); int docId; - while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) { - float[] v = vectorValues.vectorValue(); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + while (true) { + if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; + float[] v = vectorValues.vectorValue(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); if (ctx.reader().getLiveDocs() == null || ctx.reader().getLiveDocs().get(docId)) { - assertArrayEquals(idString, values[id], v, 0); + assertArrayEquals(idString + " " + docId, values[id], v, 0); ++valueCount; } else { ++numDeletes; @@ -1266,8 +1407,10 @@ public void testRandomBytes() throws Exception { totalSize += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); int docId; - while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) { - byte[] v = vectorValues.vectorValue(); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + while (true) { + if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; + byte[] v = vectorValues.vectorValue(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1331,8 +1474,8 @@ public void testSearchWithVisitedLimit() throws Exception { ctx.reader() .searchNearestVectors( fieldName, randomNormalizedVector(dimension), k, liveDocs, visitedLimit); - assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, results.totalHits.relation); - assertEquals(visitedLimit, results.totalHits.value); + assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, results.totalHits.relation()); + assertEquals(visitedLimit, results.totalHits.value()); // check the limit is not hit when it clearly exceeds the number of vectors k = vectorValues.size(); @@ -1341,8 +1484,8 @@ public void testSearchWithVisitedLimit() throws Exception { 
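
The totalHits changes in this hunk (and in the stored-fields and legacy doc-values tests further down) reflect that the hit count and relation are now read through accessor methods rather than public fields; in sketch form:

    TopDocs hits = searcher.search(query, 10);
    long count = hits.totalHits.value();                // was hits.totalHits.value
    TotalHits.Relation rel = hits.totalHits.relation(); // was hits.totalHits.relation
    boolean exact = (rel == TotalHits.Relation.EQUAL_TO);
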
ctx.reader() .searchNearestVectors( fieldName, randomNormalizedVector(dimension), k, liveDocs, visitedLimit); - assertEquals(TotalHits.Relation.EQUAL_TO, results.totalHits.relation); - assertTrue(results.totalHits.value <= visitedLimit); + assertEquals(TotalHits.Relation.EQUAL_TO, results.totalHits.relation()); + assertTrue(results.totalHits.value() <= visitedLimit); } } } @@ -1386,8 +1529,10 @@ public void testRandomWithUpdatesAndGraph() throws Exception { StoredFields storedFields = ctx.reader().storedFields(); int docId; int numLiveDocsWithVectors = 0; - while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) { - float[] v = vectorValues.vectorValue(); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + while (true) { + if (!((docId = iterator.nextDoc()) != NO_MORE_DOCS)) break; + float[] v = vectorValues.vectorValue(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1594,25 +1739,27 @@ public void testAdvance() throws Exception { FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); int[] vectorDocs = new int[vectorValues.size() + 1]; int cur = -1; + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); while (++cur < vectorValues.size() + 1) { - vectorDocs[cur] = vectorValues.nextDoc(); + vectorDocs[cur] = iterator.nextDoc(); if (cur != 0) { assertTrue(vectorDocs[cur] > vectorDocs[cur - 1]); } } vectorValues = r.getFloatVectorValues(fieldName); + DocIdSetIterator iter = vectorValues.iterator(); cur = -1; for (int i = 0; i < numdocs; i++) { // randomly advance to i if (random().nextInt(4) == 3) { while (vectorDocs[++cur] < i) {} - assertEquals(vectorDocs[cur], vectorValues.advance(i)); - assertEquals(vectorDocs[cur], vectorValues.docID()); - if (vectorValues.docID() == NO_MORE_DOCS) { + assertEquals(vectorDocs[cur], iter.advance(i)); + assertEquals(vectorDocs[cur], iter.docID()); + if (iter.docID() == NO_MORE_DOCS) { break; } // make i equal to docid so that it is greater than docId in the next loop iteration - i = vectorValues.docID(); + i = iter.docID(); } } } @@ -1663,6 +1810,7 @@ public void testVectorValuesReportCorrectDocs() throws Exception { double checksum = 0; int docCount = 0; long sumDocIds = 0; + long sumOrdToDocIds = 0; switch (vectorEncoding) { case BYTE -> { for (LeafReaderContext ctx : r.leaves()) { @@ -1670,11 +1818,18 @@ public void testVectorValuesReportCorrectDocs() throws Exception { if (byteVectorValues != null) { docCount += byteVectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); - while (byteVectorValues.nextDoc() != NO_MORE_DOCS) { - checksum += byteVectorValues.vectorValue()[0]; - Document doc = storedFields.document(byteVectorValues.docID(), Set.of("id")); + KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + for (iter.nextDoc(); iter.docID() != NO_MORE_DOCS; iter.nextDoc()) { + int ord = iter.index(); + checksum += byteVectorValues.vectorValue(ord)[0]; + Document doc = storedFields.document(iter.docID(), Set.of("id")); sumDocIds += Integer.parseInt(doc.get("id")); } + for (int ord = 0; ord < byteVectorValues.size(); ord++) { + Document doc = + storedFields.document(byteVectorValues.ordToDoc(ord), Set.of("id")); + sumOrdToDocIds += Integer.parseInt(doc.get("id")); + } } } } @@ -1684,11 +1839,17 @@ public void testVectorValuesReportCorrectDocs() throws Exception { if (vectorValues != null) { docCount += vectorValues.size(); StoredFields 
storedFields = ctx.reader().storedFields(); - while (vectorValues.nextDoc() != NO_MORE_DOCS) { - checksum += vectorValues.vectorValue()[0]; - Document doc = storedFields.document(vectorValues.docID(), Set.of("id")); + KnnVectorValues.DocIndexIterator iter = vectorValues.iterator(); + for (iter.nextDoc(); iter.docID() != NO_MORE_DOCS; iter.nextDoc()) { + int ord = iter.index(); + checksum += vectorValues.vectorValue(ord)[0]; + Document doc = storedFields.document(iter.docID(), Set.of("id")); sumDocIds += Integer.parseInt(doc.get("id")); } + for (int ord = 0; ord < vectorValues.size(); ord++) { + Document doc = storedFields.document(vectorValues.ordToDoc(ord), Set.of("id")); + sumOrdToDocIds += Integer.parseInt(doc.get("id")); + } } } } @@ -1700,7 +1861,223 @@ public void testVectorValuesReportCorrectDocs() throws Exception { vectorEncoding == VectorEncoding.BYTE ? numDocs * 0.2 : 1e-5); assertEquals(fieldDocCount, docCount); assertEquals(fieldSumDocIDs, sumDocIds); + assertEquals(fieldSumDocIDs, sumOrdToDocIds); + } + } + } + + public void testMismatchedFields() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new KnnFloatVectorField("float", new float[] {1f, 2f})); + doc.add(new KnnByteVectorField("byte", new byte[] {42})); + w1.addDocument(doc); + + Directory dir2 = newDirectory(); + IndexWriter w2 = + new IndexWriter(dir2, newIndexWriterConfig().setMergeScheduler(new SerialMergeScheduler())); + w2.addDocument(doc); + w2.commit(); + + DirectoryReader reader = DirectoryReader.open(w1); + w1.close(); + w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random())); + reader.close(); + w2.forceMerge(1); + reader = DirectoryReader.open(w2); + w2.close(); + + LeafReader leafReader = getOnlyLeafReader(reader); + + ByteVectorValues byteVectors = leafReader.getByteVectorValues("byte"); + assertNotNull(byteVectors); + KnnVectorValues.DocIndexIterator iter = byteVectors.iterator(); + assertEquals(0, iter.nextDoc()); + assertArrayEquals(new byte[] {42}, byteVectors.vectorValue(0)); + assertEquals(1, iter.nextDoc()); + assertArrayEquals(new byte[] {42}, byteVectors.vectorValue(1)); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, iter.nextDoc()); + + FloatVectorValues floatVectors = leafReader.getFloatVectorValues("float"); + assertNotNull(floatVectors); + iter = floatVectors.iterator(); + assertEquals(0, iter.nextDoc()); + float[] vector = floatVectors.vectorValue(0); + assertEquals(2, vector.length); + assertEquals(1f, vector[0], 0f); + assertEquals(2f, vector[1], 0f); + assertEquals(1, iter.nextDoc()); + vector = floatVectors.vectorValue(1); + assertEquals(2, vector.length); + assertEquals(1f, vector[0], 0f); + assertEquals(2f, vector[1], 0f); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, iter.nextDoc()); + + IOUtils.close(reader, w2, dir1, dir2); + } + + /** + * Test that the query is a viable approximation to exact search. This test is designed to uncover + * gross failures only, not to represent the true expected recall. 
+ */ + public void testRecall() throws IOException { + VectorSimilarityFunction[] functions = { + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT + }; + for (VectorSimilarityFunction similarity : functions) { + assertRecall(similarity, 0.5, 1.0); + } + } + + protected void assertRecall(VectorSimilarityFunction similarity, double min, double max) + throws IOException { + int dim = 16; + int recalled = 0; + try (Directory indexStore = getKnownIndexStore("field", dim, similarity); + IndexReader reader = DirectoryReader.open(indexStore)) { + IndexSearcher searcher = newSearcher(reader); + float[] queryEmbedding = new float[dim]; + // indexed 421 lines from LICENSE.txt + // indexed 157 lines from NOTICE.txt + int topK = 10; + int numQueries = 578; + String[] testQueries = { + "Apache Lucene", + "Apache License", + "TERMS AND CONDITIONS", + "Copyright 2001", + "Permission is hereby", + "Copyright © 2003", + "The dictionary comes from Morfologik project", + "The levenshtein automata tables" + }; + for (String queryString : testQueries) { + computeLineEmbedding(queryString, queryEmbedding); + + // pass match-all "filter" to force full traversal, bypassing graph + KnnFloatVectorQuery exactQuery = + new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); + assertEquals(numQueries, searcher.count(exactQuery)); // Same for exact search + + KnnFloatVectorQuery query = new KnnFloatVectorQuery("field", queryEmbedding, topK); + assertEquals(10, searcher.count(query)); // Expect some results without timeout + TopDocs results = searcher.search(query, topK); + Set resultDocs = new HashSet<>(); + int i = 0; + for (ScoreDoc scoreDoc : results.scoreDocs) { + if (VERBOSE) { + System.out.println( + "result " + + i++ + + ": " + + reader.storedFields().document(scoreDoc.doc) + + " " + + scoreDoc); + } + resultDocs.add(scoreDoc.doc); + } + TopDocs expected = searcher.search(exactQuery, topK); + i = 0; + for (ScoreDoc scoreDoc : expected.scoreDocs) { + if (VERBOSE) { + System.out.println( + "expected " + + i++ + + ": " + + reader.storedFields().document(scoreDoc.doc) + + " " + + scoreDoc); + } + if (resultDocs.contains(scoreDoc.doc)) { + ++recalled; + } + } + } + int totalResults = testQueries.length * topK; + assertTrue( + "Average recall for " + + similarity + + " should be at least " + + (totalResults * min) + + " / " + + totalResults + + ", got " + + recalled, + recalled >= (int) (totalResults * min)); + assertTrue( + "Average recall for " + + similarity + + " should be no more than " + + (totalResults * max) + + " / " + + totalResults + + ", got " + + recalled, + recalled <= (int) (totalResults * max)); + } + } + + /** Creates a new directory and adds documents with the given vectors as kNN vector fields */ + Directory getKnownIndexStore( + String field, int dimension, VectorSimilarityFunction vectorSimilarityFunction) + throws IOException { + Directory indexStore = newDirectory(random()); + IndexWriter writer = new IndexWriter(indexStore, newIndexWriterConfig()); + float[] scratch = new float[dimension]; + for (String file : List.of("LICENSE.txt", "NOTICE.txt")) { + try (InputStream in = BaseKnnVectorsFormatTestCase.class.getResourceAsStream(file); + BufferedReader reader = new BufferedReader(new InputStreamReader(in, UTF_8))) { + String line; + int lineNo = -1; + while ((line = reader.readLine()) != null) { + line = line.strip(); + if (line.isEmpty()) { + continue; + } + 
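
To make the intent of assertRecall above concrete: the exact top-k baseline is the same KnnFloatVectorQuery issued with an oversized k and a MatchAllDocsQuery filter (with k larger than the number of vector documents, this bypasses the graph), and recall is the overlap between the approximate and exact result sets. A compressed sketch, assuming a searcher, field name, query vector and topK:

    KnnFloatVectorQuery approx = new KnnFloatVectorQuery("field", queryVector, topK);
    KnnFloatVectorQuery exact =
        new KnnFloatVectorQuery("field", queryVector, 1000, new MatchAllDocsQuery());
    Set<Integer> approxDocs = new HashSet<>();
    for (ScoreDoc sd : searcher.search(approx, topK).scoreDocs) {
      approxDocs.add(sd.doc);
    }
    int overlap = 0;
    for (ScoreDoc sd : searcher.search(exact, topK).scoreDocs) {
      if (approxDocs.contains(sd.doc)) {
        overlap++;
      }
    }
    double recall = (double) overlap / topK; // asserted to fall within [min, max]
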
++lineNo; + Document doc = new Document(); + doc.add( + new KnnFloatVectorField( + field, computeLineEmbedding(line, scratch), vectorSimilarityFunction)); + doc.add(new StoredField("text", line)); + doc.add(new StringField("id", file + "." + lineNo, Field.Store.YES)); + writer.addDocument(doc); + if (random().nextBoolean()) { + // Add some documents without a vector + addDocuments(writer, "id" + lineNo + ".", randomIntBetween(1, 5)); + } + } + // System.out.println("indexed " + (lineNo + 1) + " lines from " + file); + } + } + // Add some documents without a vector nor an id + addDocuments(writer, null, 5); + writer.close(); + return indexStore; + } + + private float[] computeLineEmbedding(String line, float[] vector) { + Arrays.fill(vector, 0); + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + vector[i % vector.length] += c / ((float) (i + 1) / vector.length); + } + VectorUtil.l2normalize(vector, false); + return vector; + } + + private void addDocuments(IndexWriter writer, String idBase, int count) throws IOException { + for (int i = 0; i < count; i++) { + Document doc = new Document(); + doc.add(new StringField("other", "value", Field.Store.NO)); + if (idBase != null) { + doc.add(new StringField("id", idBase + i, Field.Store.YES)); } + writer.addDocument(doc); } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePointsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePointsFormatTestCase.java index 265e3f073be2..a15dd07a79ef 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePointsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePointsFormatTestCase.java @@ -30,6 +30,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.CodecReader; @@ -46,6 +47,7 @@ import org.apache.lucene.index.PointValues; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Term; import org.apache.lucene.internal.tests.ConcurrentMergeSchedulerAccess; import org.apache.lucene.internal.tests.TestSecrets; @@ -1408,4 +1410,80 @@ public int getDocCount() { } }; } + + public void testMismatchedFields() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new LongPoint("f", 1L)); + doc.add(new LongPoint("g", 42L, 43L)); + w1.addDocument(doc); + + Directory dir2 = newDirectory(); + IndexWriter w2 = + new IndexWriter(dir2, newIndexWriterConfig().setMergeScheduler(new SerialMergeScheduler())); + w2.addDocument(doc); + w2.commit(); + + DirectoryReader reader = DirectoryReader.open(w1); + w1.close(); + w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random())); + reader.close(); + w2.forceMerge(1); + reader = DirectoryReader.open(w2); + w2.close(); + + LeafReader leafReader = getOnlyLeafReader(reader); + assertEquals(2, leafReader.maxDoc()); + + PointValues fPoints = leafReader.getPointValues("f"); + assertEquals(2, fPoints.size()); + fPoints.intersect( + new IntersectVisitor() { + + int expectedDoc = 0; + + @Override + public 
void visit(int docID, byte[] packedValue) throws IOException { + assertEquals(LongPoint.pack(1L), new BytesRef(packedValue)); + assertEquals(expectedDoc++, docID); + } + + @Override + public void visit(int docID) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + PointValues gPoints = leafReader.getPointValues("g"); + assertEquals(2, fPoints.size()); + gPoints.intersect( + new IntersectVisitor() { + + int expectedDoc = 0; + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + assertEquals(LongPoint.pack(42L, 43L), new BytesRef(packedValue)); + assertEquals(expectedDoc++, docID); + } + + @Override + public void visit(int docID) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + IOUtils.close(reader, w2, dir1, dir2); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java index 8f8233ee680e..8e0292b3f8db 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java @@ -42,6 +42,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexOptions; @@ -54,6 +55,7 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -71,6 +73,7 @@ import org.apache.lucene.tests.util.RamUsageTester; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -1728,4 +1731,41 @@ public void testLineFileDocs() throws IOException { TestUtil.checkIndex(dir); } } + + public void testMismatchedFields() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new StringField("f", "a", Store.NO)); + doc.add(new StringField("g", "b", Store.NO)); + w1.addDocument(doc); + + Directory dir2 = newDirectory(); + IndexWriter w2 = + new IndexWriter(dir2, newIndexWriterConfig().setMergeScheduler(new SerialMergeScheduler())); + w2.addDocument(doc); + w2.commit(); + + DirectoryReader reader = DirectoryReader.open(w1); + w1.close(); + w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random())); + reader.close(); + w2.forceMerge(1); + reader = DirectoryReader.open(w2); + w2.close(); + + LeafReader leafReader = getOnlyLeafReader(reader); + + TermsEnum te = leafReader.terms("f").iterator(); + assertEquals("a", te.next().utf8ToString()); + assertEquals(2, te.docFreq()); + assertNull(te.next()); + + te = leafReader.terms("g").iterator(); 
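
For orientation, every new testMismatchedFields in this change (doc values, points, postings, vectors) follows the same shape: index one document into two directories, route the first directory's only segment through MismatchedCodecReader so its field numbers no longer line up with the destination, addIndexes it into the second writer, force-merge, and assert that both documents come back with identical values. The core of that shape, as a sketch:

    DirectoryReader reader = DirectoryReader.open(w1);  // w1 holds the "foreign" segment
    w1.close();
    w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random()));
    reader.close();
    w2.forceMerge(1);  // the merge must resolve fields by name, not by number
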
+ assertEquals("b", te.next().utf8ToString()); + assertEquals(2, te.docFreq()); + assertNull(te.next()); + + IOUtils.close(reader, w2, dir1, dir2); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseStoredFieldsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseStoredFieldsFormatTestCase.java index 63aa50bf713c..84fdcde81732 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseStoredFieldsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseStoredFieldsFormatTestCase.java @@ -177,7 +177,7 @@ public void testRandomStoredFields() throws IOException { System.out.println("TEST: test id=" + testID); } TopDocs hits = s.search(new TermQuery(new Term("id", testID)), 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); Document doc = storedFields.document(hits.scoreDocs[0].doc); Document docExp = docs.get(testID); for (int i = 0; i < fieldCount; i++) { @@ -471,9 +471,9 @@ public void run() { try { StoredFields storedFields = rd.storedFields(); final TopDocs topDocs = searcher.search(query, 1); - if (topDocs.totalHits.value != 1) { + if (topDocs.totalHits.value() != 1) { throw new IllegalStateException( - "Expected 1 hit, got " + topDocs.totalHits.value); + "Expected 1 hit, got " + topDocs.totalHits.value()); } final Document sdoc = storedFields.document(topDocs.scoreDocs[0].doc); if (sdoc == null || sdoc.get("fld") == null) { @@ -783,7 +783,7 @@ public void testBigDocuments() throws IOException { for (int i = 0; i < numDocs; ++i) { final Query query = new TermQuery(new Term("id", "" + i)); final TopDocs topDocs = searcher.search(query, 1); - assertEquals("" + i, 1, topDocs.totalHits.value); + assertEquals("" + i, 1, topDocs.totalHits.value()); final Document doc = storedFields.document(topDocs.scoreDocs[0].doc); assertNotNull(doc); final IndexableField[] fieldValues = doc.getFields("fld"); @@ -943,7 +943,7 @@ public void testRandomStoredFieldsWithIndexSort() throws Exception { System.out.println("TEST: test id=" + testID); } TopDocs hits = searcher.search(new TermQuery(new Term("id", testID)), 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); List expectedFields = docs.get(testID).getFields().stream() .filter(f -> f.fieldType().stored()) diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/LegacyBaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/LegacyBaseDocValuesFormatTestCase.java index 2e797c2ced43..a77050a981c5 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/LegacyBaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/LegacyBaseDocValuesFormatTestCase.java @@ -145,7 +145,7 @@ public void testOneNumber() throws IOException { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { Document hitDoc = storedFields.document(hits.scoreDocs[i].doc); @@ -182,7 +182,7 @@ public void testOneFloat() throws IOException { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = 
isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { int docID = hits.scoreDocs[i].doc; @@ -221,7 +221,7 @@ public void testTwoNumbers() throws IOException { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { int docID = hits.scoreDocs[i].doc; @@ -262,7 +262,7 @@ public void testTwoBinaryValues() throws IOException { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { int hitDocID = hits.scoreDocs[i].doc; @@ -320,7 +320,7 @@ public void testVariouslyCompressibleBinaryValues() throws IOException { String id = Integer.toString(i); Query query = new TermQuery(new Term("id", id)); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: int hitDocID = hits.scoreDocs[0].doc; Document hitDoc = storedFields.document(hitDocID); @@ -358,7 +358,7 @@ public void testTwoFieldsMixed() throws IOException { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { int docID = hits.scoreDocs[i].doc; @@ -400,7 +400,7 @@ public void testThreeFieldsMixed() throws IOException { Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); StoredFields storedFields = isearcher.storedFields(); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { int docID = hits.scoreDocs[i].doc; @@ -447,7 +447,7 @@ public void testThreeFieldsMixed2() throws IOException { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); BytesRef scratch; // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { @@ -627,7 +627,7 @@ public void testBytes() throws IOException { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); // Iterate through the results: for (int i = 0; i < hits.scoreDocs.length; i++) { int hitDocID = hits.scoreDocs[i].doc; @@ -735,7 +735,7 @@ public void testSortedBytes() throws IOException { assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); Query query = new TermQuery(new 
Term("fieldname", "text")); TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); BytesRef scratch; // Iterate through the results: StoredFields storedFields = isearcher.storedFields(); @@ -1251,7 +1251,7 @@ public void testDocValuesSimple() throws IOException { query.add(new TermQuery(new Term("docId", "4")), BooleanClause.Occur.SHOULD); TopDocs search = searcher.search(query.build(), 10); - assertEquals(5, search.totalHits.value); + assertEquals(5, search.totalHits.value()); ScoreDoc[] scoreDocs = search.scoreDocs; NumericDocValues docValues = getOnlyLeafReader(reader).getNumericDocValues("docId"); for (int i = 0; i < scoreDocs.length; i++) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedCodecReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedCodecReader.java new file mode 100644 index 000000000000..8c856aafcba2 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedCodecReader.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.tests.index; + +import java.io.IOException; +import java.util.Objects; +import java.util.Random; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CodecReader; +import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FilterCodecReader; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.StoredFieldVisitor; + +/** + * Shuffles field numbers around to try to trip bugs where field numbers are assumed to always be + * consistent across segments. + */ +public class MismatchedCodecReader extends FilterCodecReader { + + private final FieldInfos shuffled; + + /** Sole constructor. 
*/ + public MismatchedCodecReader(CodecReader in, Random random) { + super(in); + shuffled = MismatchedLeafReader.shuffleInfos(in.getFieldInfos(), random); + } + + @Override + public FieldInfos getFieldInfos() { + return shuffled; + } + + @Override + public CacheHelper getCoreCacheHelper() { + return in.getCoreCacheHelper(); + } + + @Override + public CacheHelper getReaderCacheHelper() { + return in.getReaderCacheHelper(); + } + + @Override + public StoredFieldsReader getFieldsReader() { + StoredFieldsReader in = super.getFieldsReader(); + if (in == null) { + return null; + } + return new MismatchedStoredFieldsReader(in, shuffled); + } + + private static class MismatchedStoredFieldsReader extends StoredFieldsReader { + + private final StoredFieldsReader in; + private final FieldInfos shuffled; + + MismatchedStoredFieldsReader(StoredFieldsReader in, FieldInfos shuffled) { + this.in = Objects.requireNonNull(in); + this.shuffled = shuffled; + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public StoredFieldsReader clone() { + return new MismatchedStoredFieldsReader(in.clone(), shuffled); + } + + @Override + public void checkIntegrity() throws IOException { + in.checkIntegrity(); + } + + @Override + public void document(int docID, StoredFieldVisitor visitor) throws IOException { + in.document(docID, new MismatchedLeafReader.MismatchedVisitor(visitor, shuffled)); + } + } + + @Override + public DocValuesProducer getDocValuesReader() { + DocValuesProducer in = super.getDocValuesReader(); + if (in == null) { + return null; + } + return new MismatchedDocValuesProducer(in, shuffled, super.getFieldInfos()); + } + + private static class MismatchedDocValuesProducer extends DocValuesProducer { + + private final DocValuesProducer in; + private final FieldInfos shuffled; + private final FieldInfos orig; + + MismatchedDocValuesProducer(DocValuesProducer in, FieldInfos shuffled, FieldInfos orig) { + this.in = Objects.requireNonNull(in); + this.shuffled = shuffled; + this.orig = orig; + } + + @Override + public void close() throws IOException { + in.close(); + } + + private FieldInfo remapFieldInfo(FieldInfo field) { + FieldInfo fi = shuffled.fieldInfo(field.name); + assert fi != null && fi.number == field.number; + return orig.fieldInfo(field.name); + } + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws IOException { + return in.getNumeric(remapFieldInfo(field)); + } + + @Override + public BinaryDocValues getBinary(FieldInfo field) throws IOException { + return in.getBinary(remapFieldInfo(field)); + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + return in.getSorted(remapFieldInfo(field)); + } + + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + return in.getSortedNumeric(remapFieldInfo(field)); + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + return in.getSortedSet(remapFieldInfo(field)); + } + + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + return in.getSkipper(remapFieldInfo(field)); + } + + @Override + public void checkIntegrity() throws IOException { + in.checkIntegrity(); + } + } + + @Override + public NormsProducer getNormsReader() { + NormsProducer in = super.getNormsReader(); + if (in == null) { + return null; + } + return new MismatchedNormsProducer(in, shuffled, super.getFieldInfos()); + } + + private static class MismatchedNormsProducer extends 
NormsProducer { + + private final NormsProducer in; + private final FieldInfos shuffled; + private final FieldInfos orig; + + MismatchedNormsProducer(NormsProducer in, FieldInfos shuffled, FieldInfos orig) { + this.in = Objects.requireNonNull(in); + this.shuffled = shuffled; + this.orig = orig; + } + + @Override + public void close() throws IOException { + in.close(); + } + + private FieldInfo remapFieldInfo(FieldInfo field) { + FieldInfo fi = shuffled.fieldInfo(field.name); + assert fi != null && fi.number == field.number; + return orig.fieldInfo(field.name); + } + + @Override + public NumericDocValues getNorms(FieldInfo field) throws IOException { + return in.getNorms(remapFieldInfo(field)); + } + + @Override + public void checkIntegrity() throws IOException { + in.checkIntegrity(); + } + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java index ab907b768023..46404f514c6a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java @@ -28,8 +28,6 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.StoredFields; -import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.util.Bits; /** * Shuffles field numbers around to try to trip bugs where field numbers are assumed to always be @@ -55,7 +53,7 @@ public StoredFields storedFields() throws IOException { return new StoredFields() { @Override public void document(int docID, StoredFieldVisitor visitor) throws IOException { - inStoredFields.document(docID, new MismatchedVisitor(visitor)); + inStoredFields.document(docID, new MismatchedVisitor(visitor, shuffled)); } }; } @@ -70,18 +68,6 @@ public CacheHelper getReaderCacheHelper() { return in.getReaderCacheHelper(); } - @Override - public void searchNearestVectors( - String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - in.searchNearestVectors(field, target, knnCollector, acceptDocs); - } - - @Override - public void searchNearestVectors( - String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - in.searchNearestVectors(field, target, knnCollector, acceptDocs); - } - static FieldInfos shuffleInfos(FieldInfos infos, Random random) { // first, shuffle the order List shuffled = new ArrayList<>(); @@ -98,12 +84,12 @@ static FieldInfos shuffleInfos(FieldInfos infos, Random random) { new FieldInfo( oldInfo.name, // name i, // number - oldInfo.hasVectors(), // storeTermVector + oldInfo.hasTermVectors(), // storeTermVector oldInfo.omitsNorms(), // omitNorms oldInfo.hasPayloads(), // storePayloads oldInfo.getIndexOptions(), // indexOptions oldInfo.getDocValuesType(), // docValuesType - oldInfo.hasDocValuesSkipIndex(), // hasDocValuesSkipIndex + oldInfo.docValuesSkipIndexType(), // docValuesSkipIndexType oldInfo.getDocValuesGen(), // dvGen oldInfo.attributes(), // attributes oldInfo.getPointDimensionCount(), // data dimension count @@ -124,11 +110,13 @@ static FieldInfos shuffleInfos(FieldInfos infos, Random random) { /** StoredFieldsVisitor that remaps actual field numbers to our new shuffled ones. */ // TODO: its strange this part of our IR api exposes FieldInfo, // no other "user-accessible" codec apis do this? 
- class MismatchedVisitor extends StoredFieldVisitor { + static class MismatchedVisitor extends StoredFieldVisitor { final StoredFieldVisitor in; + final FieldInfos shuffled; - MismatchedVisitor(StoredFieldVisitor in) { + MismatchedVisitor(StoredFieldVisitor in, FieldInfos shuffled) { this.in = in; + this.shuffled = shuffled; } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java index 1b509c7c3965..d3f202ad9dcb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java @@ -23,6 +23,9 @@ import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.concurrent.FutureTask; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.FilterLeafReader; @@ -36,6 +39,8 @@ import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.ThreadInterruptedException; /** MergePolicy that makes random decisions for testing. */ public class MockRandomMergePolicy extends MergePolicy { @@ -232,8 +237,7 @@ public CacheHelper getReaderCacheHelper() { "NOTE: MockRandomMergePolicy now swaps in a MismatchedLeafReader for merging reader=" + reader); } - return SlowCodecReaderWrapper.wrap( - new MismatchedLeafReader(new MergeReaderWrapper(reader), r)); + return new MismatchedCodecReader(reader, r); } else { // otherwise, reader is unchanged return reader; @@ -241,7 +245,8 @@ public CacheHelper getReaderCacheHelper() { } @Override - public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException { + public Sorter.DocMap reorder(CodecReader reader, Directory dir, Executor executor) + throws IOException { if (r.nextBoolean()) { if (LuceneTestCase.VERBOSE) { System.out.println("NOTE: MockRandomMergePolicy now reverses reader=" + reader); @@ -249,6 +254,19 @@ public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOExcepti // Reverse the doc ID order return reverse(reader); } + if (executor != null && r.nextBoolean()) { + // submit random work to the executor + Runnable dummyRunnable = () -> {}; + FutureTask task = new FutureTask<>(dummyRunnable, null); + executor.execute(task); + try { + task.get(); + } catch (InterruptedException e) { + throw new ThreadInterruptedException(e); + } catch (ExecutionException e) { + throw IOUtils.rethrowAlways(e.getCause()); + } + } return null; } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/PerThreadPKLookup.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/PerThreadPKLookup.java index 0a259b7f1790..5db9a2409e8c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/PerThreadPKLookup.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/PerThreadPKLookup.java @@ -19,9 +19,12 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; -import java.util.Comparator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReader.CacheHelper; +import 
org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; @@ -37,51 +40,80 @@ */ public class PerThreadPKLookup { + private final String idFieldName; protected final TermsEnum[] termsEnums; protected final PostingsEnum[] postingsEnums; protected final Bits[] liveDocs; protected final int[] docBases; - protected final int numSegs; + protected final int numEnums; protected final boolean hasDeletions; + private final Map enumIndexes; - public PerThreadPKLookup(IndexReader r, String idFieldName) throws IOException { + public PerThreadPKLookup(IndexReader reader, String idFieldName) throws IOException { + this(reader, idFieldName, Collections.emptyMap(), null, null); + } - List leaves = new ArrayList<>(r.leaves()); + private PerThreadPKLookup( + IndexReader reader, + String idFieldName, + Map prevEnumIndexes, + TermsEnum[] reusableTermsEnums, + PostingsEnum[] reusablePostingsEnums) + throws IOException { + this.idFieldName = idFieldName; + List leaves = new ArrayList<>(reader.leaves()); // Larger segments are more likely to have the id, so we sort largest to smallest by numDocs: - Collections.sort( - leaves, - new Comparator() { - @Override - public int compare(LeafReaderContext c1, LeafReaderContext c2) { - return c2.reader().numDocs() - c1.reader().numDocs(); - } - }); + leaves.sort((c1, c2) -> c2.reader().numDocs() - c1.reader().numDocs()); termsEnums = new TermsEnum[leaves.size()]; postingsEnums = new PostingsEnum[leaves.size()]; liveDocs = new Bits[leaves.size()]; docBases = new int[leaves.size()]; - int numSegs = 0; + enumIndexes = new HashMap<>(); + int numEnums = 0; boolean hasDeletions = false; + for (int i = 0; i < leaves.size(); i++) { - Terms terms = leaves.get(i).reader().terms(idFieldName); - if (terms != null) { - termsEnums[numSegs] = terms.iterator(); - assert termsEnums[numSegs] != null; - docBases[numSegs] = leaves.get(i).docBase; - liveDocs[numSegs] = leaves.get(i).reader().getLiveDocs(); - hasDeletions |= leaves.get(i).reader().hasDeletions(); - numSegs++; + LeafReaderContext context = leaves.get(i); + LeafReader leafReader = context.reader(); + CacheHelper cacheHelper = leafReader.getCoreCacheHelper(); + IndexReader.CacheKey cacheKey = cacheHelper == null ? null : cacheHelper.getKey(); + + if (cacheKey != null && prevEnumIndexes.containsKey(cacheKey)) { + // Reuse termsEnum, postingsEnum. + int seg = prevEnumIndexes.get(cacheKey); + termsEnums[numEnums] = reusableTermsEnums[seg]; + postingsEnums[numEnums] = reusablePostingsEnums[seg]; + } else { + // New or empty segment. + Terms terms = leafReader.terms(idFieldName); + if (terms != null) { + termsEnums[numEnums] = terms.iterator(); + assert termsEnums[numEnums] != null; + } + } + + if (termsEnums[numEnums] != null) { + if (cacheKey != null) { + enumIndexes.put(cacheKey, numEnums); + } + + docBases[numEnums] = context.docBase; + liveDocs[numEnums] = leafReader.getLiveDocs(); + hasDeletions |= leafReader.hasDeletions(); + + numEnums++; } } - this.numSegs = numSegs; + + this.numEnums = numEnums; this.hasDeletions = hasDeletions; } /** Returns docID if found, else -1. 
*/ public int lookup(BytesRef id) throws IOException { - for (int seg = 0; seg < numSegs; seg++) { + for (int seg = 0; seg < numEnums; seg++) { if (termsEnums[seg].seekExact(id)) { postingsEnums[seg] = termsEnums[seg].postings(postingsEnums[seg], 0); int docID = -1; @@ -97,5 +129,12 @@ public int lookup(BytesRef id) throws IOException { return -1; } - // TODO: add reopen method to carry over re-used enums...? + /** Reuse previous PerThreadPKLookup's termsEnum and postingsEnum. */ + public PerThreadPKLookup reopen(IndexReader reader) throws IOException { + if (reader == null) { + return null; + } + return new PerThreadPKLookup( + reader, this.idFieldName, this.enumIndexes, this.termsEnums, this.postingsEnums); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomCodec.java index 8059ad09cdc8..fbc54c30954f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomCodec.java @@ -308,7 +308,7 @@ public RandomlySplittingBKDWriter( protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentDims) { // BKD normally defaults by the widest dimension, to try to make as squarish cells as // possible, but we just pick a random one ;) - return random.nextInt(config.numIndexDims); + return random.nextInt(config.numIndexDims()); } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java index 5dea79f203a2..78329e889a2e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java @@ -45,6 +45,7 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -157,7 +158,7 @@ public RandomPostingsTester(Random random) throws IOException { true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, @@ -732,7 +733,7 @@ public FieldsProducer buildIndex( doPayloads, indexOptions, DocValuesType.NONE, - false, + DocValuesSkipIndexType.NONE, -1, new HashMap<>(), 0, diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/ThreadedIndexingAndSearchingTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/ThreadedIndexingAndSearchingTestCase.java index 6143b7212a97..44c72caa3978 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/ThreadedIndexingAndSearchingTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/ThreadedIndexingAndSearchingTestCase.java @@ -540,7 +540,7 @@ public void runTest(String testName) throws Exception { } IndexSearcher searcher = newSearcher(reader, false); - sum += searcher.search(new TermQuery(new Term("body", "united")), 10).totalHits.value; + sum += searcher.search(new TermQuery(new Term("body", "united")), 10).totalHits.value(); if (VERBOSE) { System.out.println("TEST: warm visited " + sum + " fields"); @@ -628,12 +628,12 @@ public void message(String 
component, String message) { // Verify: make sure delIDs are in fact deleted: for (String id : delIDs) { final TopDocs hits = s.search(new TermQuery(new Term("docid", id)), 1); - if (hits.totalHits.value != 0) { + if (hits.totalHits.value() != 0) { System.out.println( "doc id=" + id + " is supposed to be deleted, but got " - + hits.totalHits.value + + hits.totalHits.value() + " hits; first docID=" + hits.scoreDocs[0].doc); doFail = true; @@ -643,12 +643,12 @@ public void message(String component, String message) { // Verify: make sure delPackIDs are in fact deleted: for (String id : delPackIDs) { final TopDocs hits = s.search(new TermQuery(new Term("packID", id)), 1); - if (hits.totalHits.value != 0) { + if (hits.totalHits.value() != 0) { System.out.println( "packID=" + id + " is supposed to be deleted, but got " - + hits.totalHits.value + + hits.totalHits.value() + " matches"); doFail = true; } @@ -660,14 +660,14 @@ public void message(String component, String message) { StoredFields storedFields = s.storedFields(); if (!subDocs.deleted) { // We sort by relevance but the scores should be identical so sort falls back to by docID: - if (hits.totalHits.value != subDocs.subIDs.size()) { + if (hits.totalHits.value() != subDocs.subIDs.size()) { System.out.println( "packID=" + subDocs.packID + ": expected " + subDocs.subIDs.size() + " hits but got " - + hits.totalHits.value); + + hits.totalHits.value()); doFail = true; } else { int lastDocID = -1; @@ -687,7 +687,7 @@ public void message(String component, String message) { lastDocID = startDocID - 1; for (String subID : subDocs.subIDs) { hits = s.search(new TermQuery(new Term("docid", subID)), 1); - assertEquals(1, hits.totalHits.value); + assertEquals(1, hits.totalHits.value()); final int docID = hits.scoreDocs[0].doc; if (lastDocID != -1) { assertEquals(1 + lastDocID, docID); @@ -700,7 +700,7 @@ public void message(String component, String message) { // deleted. 
We can't verify packID is deleted // because we can re-use packID for update: for (String subID : subDocs.subIDs) { - assertEquals(0, s.search(new TermQuery(new Term("docid", subID)), 1).totalHits.value); + assertEquals(0, s.search(new TermQuery(new Term("docid", subID)), 1).totalHits.value()); } } } @@ -714,12 +714,12 @@ public void message(String component, String message) { String stringID = "" + id; if (!delIDs.contains(stringID)) { final TopDocs hits = s.search(new TermQuery(new Term("docid", stringID)), 1); - if (hits.totalHits.value != 1) { + if (hits.totalHits.value() != 1) { System.out.println( "doc id=" + stringID + " is not supposed to be deleted, but got hitCount=" - + hits.totalHits.value + + hits.totalHits.value() + "; delIDs=" + delIDs); doFail = true; @@ -779,9 +779,11 @@ public void message(String component, String message) { private long runQuery(IndexSearcher s, Query q) throws Exception { s.search(q, 10); long hitCount = - s.search(q, 10, new Sort(new SortField("titleDV", SortField.Type.STRING))).totalHits.value; + s.search(q, 10, new Sort(new SortField("titleDV", SortField.Type.STRING))) + .totalHits + .value(); final Sort dvSort = new Sort(new SortField("titleDV", SortField.Type.STRING)); - long hitCount2 = s.search(q, 10, dvSort).totalHits.value; + long hitCount2 = s.search(q, 10, dvSort).totalHits.value(); assertEquals(hitCount, hitCount2); return hitCount; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingBulkScorer.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingBulkScorer.java index 6d9c5e0fcd37..f683b2f76ec4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingBulkScorer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingBulkScorer.java @@ -19,7 +19,6 @@ import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.IOException; import java.util.Random; -import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.LeafCollector; @@ -58,24 +57,6 @@ public long cost() { return in.cost(); } - @Override - public void score(LeafCollector collector, Bits acceptDocs) throws IOException { - assert max == 0; - collector = new AssertingLeafCollector(collector, 0, PostingsEnum.NO_MORE_DOCS); - if (random.nextBoolean()) { - try { - final int next = score(collector, acceptDocs, 0, PostingsEnum.NO_MORE_DOCS); - assert next == DocIdSetIterator.NO_MORE_DOCS; - } catch ( - @SuppressWarnings("unused") - UnsupportedOperationException e) { - in.score(collector, acceptDocs); - } - } else { - in.score(collector, acceptDocs); - } - } - @Override public int score(LeafCollector collector, Bits acceptDocs, int min, final int max) throws IOException { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingIndexSearcher.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingIndexSearcher.java index f5fa29b14948..8c8b35f596c3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingIndexSearcher.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingIndexSearcher.java @@ -17,12 +17,10 @@ package org.apache.lucene.tests.search; import java.io.IOException; -import java.util.List; import java.util.Random; import java.util.concurrent.ExecutorService; import org.apache.lucene.index.IndexReader; import 
org.apache.lucene.index.IndexReaderContext; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -72,7 +70,7 @@ public Query rewrite(Query original) throws IOException { } @Override - protected void search(List leaves, Weight weight, Collector collector) + protected void search(LeafReaderContextPartition[] leaves, Weight weight, Collector collector) throws IOException { assert weight instanceof AssertingWeight; AssertingCollector assertingCollector = AssertingCollector.wrap(collector); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorable.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorable.java index 3a87d4c0f1a1..2f3d604ad256 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorable.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorable.java @@ -33,7 +33,8 @@ public AssertingScorable(Scorable in) { @Override public float score() throws IOException { final float score = in.score(); - assert !Float.isNaN(score) : "NaN score for in=" + in; + // Note: score >= 0 returns false for NaN + assert score >= 0 : "score=" + score + " for in=" + in; return score; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java index 8badba0d12b7..dd408befdbf3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java @@ -183,7 +183,7 @@ public int advance(int target) throws IOException { } else { state = IteratorState.ITERATING; } - assert in.docID() == advanced; + assert in.docID() == advanced : in.docID() + " != " + advanced + " in " + in; assert AssertingScorer.this.in.docID() == in.docID(); return doc = advanced; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/CheckHits.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/CheckHits.java index d422511dc0bf..f5d2c7ea1746 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/CheckHits.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/CheckHits.java @@ -554,9 +554,9 @@ public TopFieldDocs search(Query query, int n, Sort sort) throws IOException { } @Override - public void search(Query query, Collector results) throws IOException { + public void search(Query query, Collector collector) throws IOException { checkExplanations(query); - super.search(query, results); + super.search(query, collector); } @Override @@ -654,6 +654,10 @@ public static class MatchesAsserter extends SimpleCollector { private final Weight weight; private LeafReaderContext context; int lastCheckedDoc = -1; + // with intra-segment concurrency, we may start from a doc id that isn't -1. We need to make + // sure that we don't go outside of the bounds of the current slice, meaning -1 can't be + // reliably used to signal that we are collecting the first doc for a given segment partition. 
+ boolean collectedOnce = false; public MatchesAsserter(Query query, IndexSearcher searcher) throws IOException { this.weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1); @@ -671,7 +675,7 @@ public void collect(int doc) throws IOException { assertNotNull( "Unexpected null Matches object in doc" + doc + " for query " + this.weight.getQuery(), matches); - if (lastCheckedDoc != doc - 1) { + if (collectedOnce && lastCheckedDoc != doc - 1) { assertNull( "Unexpected non-null Matches object in non-matching doc" + doc @@ -679,6 +683,7 @@ public void collect(int doc) throws IOException { + this.weight.getQuery(), this.weight.matches(context, doc - 1)); } + collectedOnce = true; lastCheckedDoc = doc; } @@ -700,12 +705,10 @@ public static void checkTopScores(Random random, Query query, IndexSearcher sear private static void doCheckTopScores(Query query, IndexSearcher searcher, int numHits) throws IOException { - boolean supportsConcurrency = searcher.getSlices().length > 1; TopScoreDocCollectorManager complete = - new TopScoreDocCollectorManager( - numHits, null, Integer.MAX_VALUE, supportsConcurrency); // COMPLETE + new TopScoreDocCollectorManager(numHits, null, Integer.MAX_VALUE); // COMPLETE TopScoreDocCollectorManager topScores = - new TopScoreDocCollectorManager(numHits, null, 1, supportsConcurrency); // TOP_SCORES + new TopScoreDocCollectorManager(numHits, null, 1); // TOP_SCORES TopDocs completeTopDocs = searcher.search(query, complete); TopDocs topScoresTopDocs = searcher.search(query, topScores); checkEqual(query, completeTopDocs.scoreDocs, topScoresTopDocs.scoreDocs); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/ScorerIndexSearcher.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/ScorerIndexSearcher.java index a77229ef68fb..cd119b6f86d3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/ScorerIndexSearcher.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/ScorerIndexSearcher.java @@ -52,8 +52,16 @@ public ScorerIndexSearcher(IndexReader r) { } @Override - protected void searchLeaf(LeafReaderContext ctx, Weight weight, Collector collector) + protected void searchLeaf( + LeafReaderContext ctx, int minDocId, int maxDocId, Weight weight, Collector collector) throws IOException { + // the default slices method does not create segment partitions, and we don't provide an + // executor to this searcher in our codebase, so we should not run into this problem. This class + // can though be used externally, hence it is better to provide a clear and hard error. 
+ if (minDocId != 0 || maxDocId != DocIdSetIterator.NO_MORE_DOCS) { + throw new IllegalStateException( + "intra-segment concurrency is not supported by this searcher"); + } // we force the use of Scorer (not BulkScorer) to make sure // that the scorer passed to LeafCollector.setScorer supports // Scorer.getChildren diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java index a65d1823726f..63a478951410 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java @@ -196,8 +196,8 @@ protected void assertSubsetOf(Query q1, Query q2, Query filter) throws Exception TopDocs td1 = s1.search(q1, reader.maxDoc(), sort); TopDocs td2 = s2.search(q2, reader.maxDoc(), sort); assertTrue( - "too many hits: " + td1.totalHits.value + " > " + td2.totalHits.value, - td1.totalHits.value <= td2.totalHits.value); + "too many hits: " + td1.totalHits.value() + " > " + td2.totalHits.value(), + td1.totalHits.value() <= td2.totalHits.value()); // fill the superset into a bitset BitSet bitset = new BitSet(); @@ -235,7 +235,7 @@ protected void assertSameScores(Query q1, Query q2, Query filter) throws Excepti } TopDocs td1 = s1.search(q1, reader.maxDoc()); TopDocs td2 = s2.search(q2, reader.maxDoc()); - assertEquals(td1.totalHits.value, td2.totalHits.value); + assertEquals(td1.totalHits.value(), td2.totalHits.value()); for (int i = 0; i < td1.scoreDocs.length; ++i) { assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); assertEquals(td1.scoreDocs[i].score, td2.scoreDocs[i].score, 10e-5); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/ShardSearchingTestBase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/ShardSearchingTestBase.java index b72c7f3f8f4f..7a3785d2b596 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/ShardSearchingTestBase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/ShardSearchingTestBase.java @@ -62,21 +62,7 @@ public SearcherExpiredException(String message) { } } - private static class FieldAndShardVersion { - private final long version; - private final int nodeID; - private final String field; - - public FieldAndShardVersion(int nodeID, long version, String field) { - this.nodeID = nodeID; - this.version = version; - this.field = field; - } - - @Override - public int hashCode() { - return (int) (version * nodeID + field.hashCode()); - } + private record FieldAndShardVersion(int nodeID, long version, String field) { @Override public boolean equals(Object _other) { @@ -101,21 +87,7 @@ public String toString() { } } - private static class TermAndShardVersion { - private final long version; - private final int nodeID; - private final Term term; - - public TermAndShardVersion(int nodeID, long version, Term term) { - this.nodeID = nodeID; - this.version = version; - this.term = term; - } - - @Override - public int hashCode() { - return (int) (version * nodeID + term.hashCode()); - } + private record TermAndShardVersion(int nodeID, long version, Term term) { @Override public boolean equals(Object _other) { @@ -665,13 +637,5 @@ protected void finish() throws InterruptedException, IOException { } /** An IndexSearcher and associated version (lease) */ - protected static class SearcherAndVersion { - public final IndexSearcher 
searcher; - public final long version; - - public SearcherAndVersion(IndexSearcher searcher, long version) { - this.searcher = searcher; - this.version = version; - } - } + protected record SearcherAndVersion(IndexSearcher searcher, long version) {} } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseChunkedDirectoryTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseChunkedDirectoryTestCase.java index dd956c6c3fd0..8de332eeec9b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseChunkedDirectoryTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseChunkedDirectoryTestCase.java @@ -33,6 +33,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.GroupVIntUtil; /** * Base class for Directories that "chunk" the input into blocks. @@ -77,7 +78,7 @@ public void testCloneClose() throws Exception { expectThrows( AlreadyClosedException.class, () -> { - two.readGroupVInts(values, values.length); + GroupVIntUtil.readGroupVInts(two, values, values.length); }); assertEquals(5, three.readVInt()); one.close(); @@ -105,7 +106,7 @@ public void testCloneSliceClose() throws Exception { expectThrows( AlreadyClosedException.class, () -> { - one.readGroupVInts(values, values.length); + GroupVIntUtil.readGroupVInts(one, values, values.length); }); assertEquals(2, two.readInt()); // reopen a new slice "another": diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java index 9cc271a9d618..41d72c509dbe 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java @@ -59,6 +59,7 @@ import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.GroupVIntUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.PackedInts; import org.junit.Assert; @@ -1458,7 +1459,7 @@ public void testDataTypes() throws IOException { assertEquals(43, in.readByte()); assertEquals(12345, in.readShort()); assertEquals(1234567890, in.readInt()); - in.readGroupVInts(restored, 4); + GroupVIntUtil.readGroupVInts(in, restored, 4); assertArrayEquals(values, restored); assertEquals(1234567890123456789L, in.readLong()); in.close(); @@ -1485,7 +1486,7 @@ public void testGroupVIntOverflow() throws IOException { out.writeGroupVInts(values, limit); out.close(); try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) { - in.readGroupVInts(restore, limit); + GroupVIntUtil.readGroupVInts(in, restore, limit); for (int i = 0; i < limit; i++) { assertEquals(values[i], restore[i]); } @@ -1533,7 +1534,7 @@ protected void doTestGroupVInt( IndexInput groupVIntIn = dir.openInput("group-varint", IOContext.DEFAULT); IndexInput vIntIn = dir.openInput("vint", IOContext.DEFAULT); for (int iter = 0; iter < iterations; iter++) { - groupVIntIn.readGroupVInts(values, numValuesArray[iter]); + GroupVIntUtil.readGroupVInts(groupVIntIn, values, numValuesArray[iter]); for (int j = 0; j < numValuesArray[iter]; j++) { assertEquals(vIntIn.readVInt(), values[j]); } diff --git 
a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockDirectoryWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockDirectoryWrapper.java index 2f30a8cda501..5f329209d804 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockDirectoryWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockDirectoryWrapper.java @@ -53,6 +53,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Lock; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.tests.util.ThrottledIndexOutput; @@ -812,6 +813,8 @@ public synchronized IndexInput openInput(String name, IOContext context) throws false); } + // record the read advice before randomizing the context + ReadAdvice readAdvice = context.readAdvice(); context = LuceneTestCase.newIOContext(randomState, context); final boolean confined = context == IOContext.READONCE; if (name.startsWith(IndexFileNames.SEGMENTS) && confined == false) { @@ -831,15 +834,15 @@ public synchronized IndexInput openInput(String name, IOContext context) throws System.out.println( "MockDirectoryWrapper: using SlowClosingMockIndexInputWrapper for file " + name); } - ii = new SlowClosingMockIndexInputWrapper(this, name, delegateInput, confined); + ii = new SlowClosingMockIndexInputWrapper(this, name, delegateInput, readAdvice, confined); } else if (useSlowOpenClosers && randomInt == 1) { if (LuceneTestCase.VERBOSE) { System.out.println( "MockDirectoryWrapper: using SlowOpeningMockIndexInputWrapper for file " + name); } - ii = new SlowOpeningMockIndexInputWrapper(this, name, delegateInput, confined); + ii = new SlowOpeningMockIndexInputWrapper(this, name, delegateInput, readAdvice, confined); } else { - ii = new MockIndexInputWrapper(this, name, delegateInput, null, confined); + ii = new MockIndexInputWrapper(this, name, delegateInput, null, readAdvice, confined); } addFileHandle(ii, name, Handle.Input); return ii; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java index 872790086149..3171d8d22169 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java @@ -23,6 +23,7 @@ import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; /** * Used by MockDirectoryWrapper to create an input stream that keeps track of when it's been closed. 
@@ -39,6 +40,7 @@ public class MockIndexInputWrapper extends FilterIndexInput { // Which MockIndexInputWrapper we were cloned from, or null if we are not a clone: private final MockIndexInputWrapper parent; + private final ReadAdvice readAdvice; private final boolean confined; private final Thread thread; @@ -48,6 +50,7 @@ public MockIndexInputWrapper( String name, IndexInput delegate, MockIndexInputWrapper parent, + ReadAdvice readAdvice, boolean confined) { super("MockIndexInputWrapper(name=" + name + " delegate=" + delegate + ")", delegate); @@ -57,6 +60,7 @@ public MockIndexInputWrapper( this.parent = parent; this.name = name; this.dir = dir; + this.readAdvice = readAdvice; this.confined = confined; this.thread = Thread.currentThread(); } @@ -107,7 +111,8 @@ public MockIndexInputWrapper clone() { dir.inputCloneCount.incrementAndGet(); IndexInput iiclone = in.clone(); MockIndexInputWrapper clone = - new MockIndexInputWrapper(dir, name, iiclone, parent != null ? parent : this, confined); + new MockIndexInputWrapper( + dir, name, iiclone, parent != null ? parent : this, readAdvice, confined); // Pending resolution on LUCENE-686 we may want to // uncomment this code so that we also track that all // clones get closed: @@ -135,7 +140,26 @@ public IndexInput slice(String sliceDescription, long offset, long length) throw IndexInput slice = in.slice(sliceDescription, offset, length); MockIndexInputWrapper clone = new MockIndexInputWrapper( - dir, sliceDescription, slice, parent != null ? parent : this, confined); + dir, sliceDescription, slice, parent != null ? parent : this, readAdvice, confined); + return clone; + } + + @Override + public IndexInput slice(String sliceDescription, long offset, long length, ReadAdvice readAdvice) + throws IOException { + if (this.readAdvice != ReadAdvice.NORMAL) { + throw new IllegalStateException( + "slice() may only be called with a custom read advice on inputs that have been open with ReadAdvice.NORMAL"); + } + ensureOpen(); + if (dir.verboseClone) { + new Exception("slice: " + this).printStackTrace(System.out); + } + dir.inputCloneCount.incrementAndGet(); + IndexInput slice = in.slice(sliceDescription, offset, length); + MockIndexInputWrapper clone = + new MockIndexInputWrapper( + dir, sliceDescription, slice, parent != null ? 
parent : this, readAdvice, confined); return clone; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java index 1f9e61f51950..851860f1c648 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java @@ -19,6 +19,7 @@ import java.io.IOException; import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.SuppressForbidden; import org.apache.lucene.util.ThreadInterruptedException; @@ -35,8 +36,12 @@ class SlowClosingMockIndexInputWrapper extends MockIndexInputWrapper { } public SlowClosingMockIndexInputWrapper( - MockDirectoryWrapper dir, String name, IndexInput delegate, boolean confined) { - super(dir, name, delegate, null, confined); + MockDirectoryWrapper dir, + String name, + IndexInput delegate, + ReadAdvice readAdvice, + boolean confined) { + super(dir, name, delegate, null, readAdvice, confined); } @SuppressForbidden(reason = "Thread sleep") diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java index 033785af9c7c..0d75408ec8e1 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java @@ -19,6 +19,7 @@ import java.io.IOException; import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.SuppressForbidden; import org.apache.lucene.util.ThreadInterruptedException; @@ -35,9 +36,13 @@ class SlowOpeningMockIndexInputWrapper extends MockIndexInputWrapper { @SuppressForbidden(reason = "Thread sleep") public SlowOpeningMockIndexInputWrapper( - MockDirectoryWrapper dir, String name, IndexInput delegate, boolean confined) + MockDirectoryWrapper dir, + String name, + IndexInput delegate, + ReadAdvice readAdvice, + boolean confined) throws IOException { - super(dir, name, delegate, null, confined); + super(dir, name, delegate, null, readAdvice, confined); try { Thread.sleep(50); } catch (InterruptedException ie) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java index 3299d8ddca93..84fa120b88b1 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java @@ -17,10 +17,12 @@ package org.apache.lucene.tests.util; +import static com.carrotsearch.randomizedtesting.RandomizedTest.frequently; import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean; import static com.carrotsearch.randomizedtesting.RandomizedTest.systemPropertyAsBoolean; import static com.carrotsearch.randomizedtesting.RandomizedTest.systemPropertyAsInt; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import static org.apache.lucene.search.IndexSearcher.LeafSlice; import 
com.carrotsearch.randomizedtesting.JUnit4MethodProvider; import com.carrotsearch.randomizedtesting.LifecycleScope; @@ -92,6 +94,7 @@ import java.util.Set; import java.util.TimeZone; import java.util.TreeSet; +import java.util.concurrent.Executor; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -179,6 +182,7 @@ import org.apache.lucene.tests.index.FieldFilterLeafReader; import org.apache.lucene.tests.index.MergingCodecReader; import org.apache.lucene.tests.index.MergingDirectoryReaderWrapper; +import org.apache.lucene.tests.index.MismatchedCodecReader; import org.apache.lucene.tests.index.MismatchedDirectoryReader; import org.apache.lucene.tests.index.MismatchedLeafReader; import org.apache.lucene.tests.index.MockIndexWriterEventListener; @@ -864,6 +868,18 @@ public static void assumeNoException(String msg, Exception e) { RandomizedTest.assumeNoException(msg, e); } + public static void assertFloatUlpEquals(final float x, final float y, final short maxUlps) { + assertTrue( + x + " and " + y + " are not within " + maxUlps + " ULPs of each other", + TestUtil.floatUlpEquals(x, y, maxUlps)); + } + + public static void assertDoubleUlpEquals(final double x, final double y, final int maxUlps) { + assertTrue( + x + " and " + y + " are not within " + maxUlps + " ULPs of each other", + TestUtil.doubleUlpEquals(x, y, maxUlps)); + } + /** * Return args as a {@link Set} instance. The order of elements is not preserved in * iterators. @@ -941,10 +957,10 @@ public static IndexWriterConfig newIndexWriterConfig(Random r, Analyzer a) { } else if (rarely(r)) { ConcurrentMergeScheduler cms; if (r.nextBoolean()) { - cms = new ConcurrentMergeScheduler(); + cms = new TestConcurrentMergeScheduler(); } else { cms = - new ConcurrentMergeScheduler() { + new TestConcurrentMergeScheduler() { @Override protected synchronized boolean maybeStall(MergeSource mergeSource) { return true; @@ -963,7 +979,8 @@ protected synchronized boolean maybeStall(MergeSource mergeSource) { } else { // Always use consistent settings, else CMS's dynamic (SSD or not) // defaults can change, hurting reproducibility: - ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler(); + ConcurrentMergeScheduler cms = + randomBoolean() ? 
new TestConcurrentMergeScheduler() : new ConcurrentMergeScheduler(); // Only 1 thread can run at once (should maybe help reproducibility), // with up to 3 pending merges before segment-producing threads are @@ -1730,12 +1747,14 @@ public static IndexReader wrapReader(IndexReader r) throws IOException { System.out.println( "NOTE: LuceneTestCase.wrapReader: wrapping previous reader=" + r - + " with MismatchedLeaf/DirectoryReader"); + + " with MismatchedLeaf/Directory/CodecReader"); } if (r instanceof LeafReader) { r = new MismatchedLeafReader((LeafReader) r, random); } else if (r instanceof DirectoryReader) { r = new MismatchedDirectoryReader((DirectoryReader) r, random); + } else if (r instanceof CodecReader) { + r = new MismatchedCodecReader((CodecReader) r, random); } break; case 4: @@ -1922,8 +1941,32 @@ public static IndexSearcher newSearcher( */ public static IndexSearcher newSearcher( IndexReader r, boolean maybeWrap, boolean wrapWithAssertions, boolean useThreads) { + if (useThreads) { + return newSearcher(r, maybeWrap, wrapWithAssertions, Concurrency.INTRA_SEGMENT); + } + return newSearcher(r, maybeWrap, wrapWithAssertions, Concurrency.NONE); + } + + /** What level of concurrency is supported by the searcher being created */ + public enum Concurrency { + /** No concurrency, meaning an executor won't be provided to the searcher */ + NONE, + /** + * Inter-segment concurrency, meaning an executor will be provided to the searcher and slices + * will be randomly created to concurrently search entire segments + */ + INTER_SEGMENT, + /** + * Intra-segment concurrency, meaning an executor will be provided to the searcher and slices + * will be randomly created to concurrently search segment partitions + */ + INTRA_SEGMENT + } + + public static IndexSearcher newSearcher( + IndexReader r, boolean maybeWrap, boolean wrapWithAssertions, Concurrency concurrency) { Random random = random(); - if (useThreads == false) { + if (concurrency == Concurrency.NONE) { if (maybeWrap) { try { r = maybeWrapReader(r); @@ -1973,7 +2016,8 @@ public static IndexSearcher newSearcher( new AssertingIndexSearcher(random, r, ex) { @Override protected LeafSlice[] slices(List leaves) { - return slices(leaves, maxDocPerSlice, maxSegmentsPerSlice); + return LuceneTestCase.slices( + leaves, maxDocPerSlice, maxSegmentsPerSlice, concurrency); } }; } else { @@ -1981,7 +2025,8 @@ protected LeafSlice[] slices(List leaves) { new AssertingIndexSearcher(random, r.getContext(), ex) { @Override protected LeafSlice[] slices(List leaves) { - return slices(leaves, maxDocPerSlice, maxSegmentsPerSlice); + return LuceneTestCase.slices( + leaves, maxDocPerSlice, maxSegmentsPerSlice, concurrency); } }; } @@ -1990,7 +2035,8 @@ protected LeafSlice[] slices(List leaves) { new IndexSearcher(r, ex) { @Override protected LeafSlice[] slices(List leaves) { - return slices(leaves, maxDocPerSlice, maxSegmentsPerSlice); + return LuceneTestCase.slices( + leaves, maxDocPerSlice, maxSegmentsPerSlice, concurrency); } }; } @@ -2003,6 +2049,25 @@ protected LeafSlice[] slices(List leaves) { } } + /** + * Creates leaf slices according to the concurrency argument, that optionally leverage + * intra-segment concurrency by splitting segments into multiple partitions according to the + * maxDocsPerSlice argument. 
+ */ + private static LeafSlice[] slices( + List leaves, + int maxDocsPerSlice, + int maxSegmentsPerSlice, + Concurrency concurrency) { + assert concurrency != Concurrency.NONE; + // Rarely test slices without partitions even though intra-segment concurrency is supported + return IndexSearcher.slices( + leaves, + maxDocsPerSlice, + maxSegmentsPerSlice, + concurrency == Concurrency.INTRA_SEGMENT && frequently()); + } + /** * Gets a resource from the test's classpath as {@link Path}. This method should only be used, if * a real file is needed. To get a stream, code should prefer {@link #getDataInputStream(String)}. @@ -3244,4 +3309,17 @@ protected static KnnVectorsFormat randomVectorFormat(VectorEncoding vectorEncodi .toList(); return RandomPicks.randomFrom(random(), availableFormats); } + + /** + * This is a test merge scheduler that will always use the intra merge executor to ensure we test + * it. + */ + static class TestConcurrentMergeScheduler extends ConcurrentMergeScheduler { + @Override + public Executor getIntraMergeExecutor(MergePolicy.OneMerge merge) { + assert intraMergeExecutor != null : "scaledExecutor is not initialized"; + // Always do the intra merge executor to ensure we test it + return intraMergeExecutor; + } + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/RamUsageTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/RamUsageTester.java index 7e793f2c6c1c..e50f865d9441 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/RamUsageTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/RamUsageTester.java @@ -307,15 +307,7 @@ private long charArraySize(int len) { }); /** Cached information about a given class. */ - private static final class ClassCache { - public final long alignedShallowInstanceSize; - public final Field[] referenceFields; - - public ClassCache(long alignedShallowInstanceSize, Field[] referenceFields) { - this.alignedShallowInstanceSize = alignedShallowInstanceSize; - this.referenceFields = referenceFields; - } - } + private record ClassCache(long alignedShallowInstanceSize, Field[] referenceFields) {} /** Create a cached information about shallow size and reference fields for a given class. 
*/ @SuppressForbidden(reason = "We need to access private fields of measured objects.") diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java index 378444e394a1..1ace55a98a06 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java @@ -38,7 +38,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.tests.codecs.asserting.AssertingCodec; @@ -188,9 +188,9 @@ public String toString() { } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) { codec = CompressingCodec.randomInstance(random); - } else if ("Lucene912".equals(TEST_CODEC) - || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene912"))) { - codec = new Lucene912Codec(RandomPicks.randomFrom(random, Lucene912Codec.Mode.values())); + } else if ("Lucene100".equals(TEST_CODEC) + || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene100"))) { + codec = new Lucene101Codec(RandomPicks.randomFrom(random, Lucene101Codec.Mode.values())); } else if (!"random".equals(TEST_CODEC)) { codec = Codec.forName(TEST_CODEC); } else if ("random".equals(TEST_POSTINGSFORMAT)) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java index 195b55639580..6715edecc166 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java @@ -55,9 +55,9 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; -import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; @@ -464,6 +464,90 @@ private static void checkReaderSanity(LeafReader reader) throws IOException { } } + /** + * Returns true if the arguments are equal or within the range of allowed error (inclusive). + * Returns {@code false} if either of the arguments is NaN. + * + *

<p>Two float numbers are considered equal if there are {@code (maxUlps - 1)} (or fewer) + * floating point numbers between them, i.e. two adjacent floating point numbers are considered + * equal. + * + *

<p>Adapted from org.apache.commons.numbers.core.Precision + * + *

    github: https://github.com/apache/commons-numbers release 1.2 + * + * @param x first value + * @param y second value + * @param maxUlps {@code (maxUlps - 1)} is the number of floating point values between {@code x} + * and {@code y}. + * @return {@code true} if there are fewer than {@code maxUlps} floating point values between + * {@code x} and {@code y}. + */ + public static boolean floatUlpEquals(final float x, final float y, final short maxUlps) { + final int xInt = Float.floatToRawIntBits(x); + final int yInt = Float.floatToRawIntBits(y); + + if ((xInt ^ yInt) < 0) { + // Numbers have opposite signs, take care of overflow. + // Remove the sign bit to obtain the absolute ULP above zero. + final int deltaPlus = xInt & Integer.MAX_VALUE; + final int deltaMinus = yInt & Integer.MAX_VALUE; + + // Note: + // If either value is NaN, the exponent bits are set to (255 << 23) and the + // distance above 0.0 is always above a short ULP error. So omit the test + // for NaN and return directly. + + // Avoid possible overflow from adding the deltas by splitting the comparison + return deltaPlus <= maxUlps && deltaMinus <= (maxUlps - deltaPlus); + } + + // Numbers have same sign, there is no risk of overflow. + return Math.abs(xInt - yInt) <= maxUlps && !Float.isNaN(x) && !Float.isNaN(y); + } + + /** + * Returns true if the arguments are equal or within the range of allowed error (inclusive). + * Returns {@code false} if either of the arguments is NaN. + * + *

<p>Two double numbers are considered equal if there are {@code (maxUlps - 1)} (or fewer) + * floating point numbers between them, i.e. two adjacent floating point numbers are considered + * equal. + * + *

<p>Adapted from org.apache.commons.numbers.core.Precision + * + *

    github: https://github.com/apache/commons-numbers release 1.2 + * + * @param x first value + * @param y second value + * @param maxUlps {@code (maxUlps - 1)} is the number of floating point values between {@code x} + * and {@code y}. + * @return {@code true} if there are fewer than {@code maxUlps} floating point values between + * {@code x} and {@code y}. + */ + public static boolean doubleUlpEquals(final double x, final double y, final int maxUlps) { + final long xInt = Double.doubleToRawLongBits(x); + final long yInt = Double.doubleToRawLongBits(y); + + if ((xInt ^ yInt) < 0) { + // Numbers have opposite signs, take care of overflow. + // Remove the sign bit to obtain the absolute ULP above zero. + final long deltaPlus = xInt & Long.MAX_VALUE; + final long deltaMinus = yInt & Long.MAX_VALUE; + + // Note: + // If either value is NaN, the exponent bits are set to (2047L << 52) and the + // distance above 0.0 is always above an integer ULP error. So omit the test + // for NaN and return directly. + + // Avoid possible overflow from adding the deltas by splitting the comparison + return deltaPlus <= maxUlps && deltaMinus <= (maxUlps - deltaPlus); + } + + // Numbers have same sign, there is no risk of overflow. + return Math.abs(xInt - yInt) <= maxUlps && !Double.isNaN(x) && !Double.isNaN(y); + } + /** start and end are BOTH inclusive */ public static int nextInt(Random r, int start, int end) { return RandomNumbers.randomIntBetween(r, start, end); @@ -1226,12 +1310,31 @@ public DocValuesFormat getDocValuesFormatForField(String field) { }; } + /** + * Return a Codec that can read any of the default codecs and formats, but always writes in the + * specified format. + */ + public static Codec alwaysKnnVectorsFormat(final KnnVectorsFormat format) { + // TODO: we really need for knn vectors impls etc to announce themselves + // (and maybe their params, too) to infostream on flush and merge. + // otherwise in a real debugging situation we won't know whats going on! + if (LuceneTestCase.VERBOSE) { + System.out.println("TestUtil: forcing knn vectors format to:" + format); + } + return new AssertingCodec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return format; + } + }; + } + /** * Returns the actual default codec (e.g. LuceneMNCodec) for this version of Lucene. This may be * different from {@link Codec#getDefault()} because that is randomized. */ public static Codec getDefaultCodec() { - return new Lucene912Codec(); + return new Lucene101Codec(); } /** @@ -1239,7 +1342,7 @@ public static Codec getDefaultCodec() { * Lucene. */ public static PostingsFormat getDefaultPostingsFormat() { - return new Lucene912PostingsFormat(); + return new Lucene101PostingsFormat(); } /** @@ -1250,7 +1353,7 @@ public static PostingsFormat getDefaultPostingsFormat() { */ public static PostingsFormat getDefaultPostingsFormat( int minItemsPerBlock, int maxItemsPerBlock) { - return new Lucene912PostingsFormat(minItemsPerBlock, maxItemsPerBlock); + return new Lucene101PostingsFormat(minItemsPerBlock, maxItemsPerBlock); } /** Returns a random postings format that supports term ordinals */ @@ -1260,8 +1363,8 @@ public static PostingsFormat getPostingsFormatWithOrds(Random r) { return new LuceneFixedGap(); case 1: return new BlockTreeOrdsPostingsFormat(); - // TODO: these don't actually support ords! - // case 2: return new FSTOrdPostingsFormat(); + // TODO: these don't actually support ords! 
+ // case 2: return new FSTOrdPostingsFormat(); default: throw new AssertionError(); } @@ -1375,15 +1478,17 @@ public static void assertAttributeReflection( /** Assert that the given {@link TopDocs} have the same top docs and consistent hit counts. */ public static void assertConsistent(TopDocs expected, TopDocs actual) { Assert.assertEquals( - "wrong total hits", expected.totalHits.value == 0, actual.totalHits.value == 0); - if (expected.totalHits.relation == TotalHits.Relation.EQUAL_TO) { - if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) { - Assert.assertEquals("wrong total hits", expected.totalHits.value, actual.totalHits.value); + "wrong total hits", expected.totalHits.value() == 0, actual.totalHits.value() == 0); + if (expected.totalHits.relation() == TotalHits.Relation.EQUAL_TO) { + if (actual.totalHits.relation() == TotalHits.Relation.EQUAL_TO) { + Assert.assertEquals( + "wrong total hits", expected.totalHits.value(), actual.totalHits.value()); } else { - Assert.assertTrue("wrong total hits", expected.totalHits.value >= actual.totalHits.value); + Assert.assertTrue( + "wrong total hits", expected.totalHits.value() >= actual.totalHits.value()); } - } else if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) { - Assert.assertTrue("wrong total hits", expected.totalHits.value <= actual.totalHits.value); + } else if (actual.totalHits.relation() == TotalHits.Relation.EQUAL_TO) { + Assert.assertTrue("wrong total hits", expected.totalHits.value() <= actual.totalHits.value()); } Assert.assertEquals("wrong hit count", expected.scoreDocs.length, actual.scoreDocs.length); for (int hitIDX = 0; hitIDX < expected.scoreDocs.length; hitIDX++) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java index 38819479cfc6..5a308f371335 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.tests.util.automaton; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; @@ -33,6 +34,7 @@ import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.util.automaton.StatePair; import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; import org.apache.lucene.util.automaton.Transition; @@ -154,15 +156,7 @@ public static class RandomAcceptedStrings { private final Automaton a; private final Transition[][] transitions; - private static class ArrivingTransition { - final int from; - final Transition t; - - public ArrivingTransition(int from, Transition t) { - this.from = from; - this.t = t; - } - } + private record ArrivingTransition(int from, Transition t) {} public RandomAcceptedStrings(Automaton a) { this.a = a; @@ -533,4 +527,82 @@ public static boolean isDeterministicSlow(Automaton a) { assert a.isDeterministic() == true; return true; } + + /** + * Returns true if these two automata accept exactly the same language. This is a costly + * computation! Both automata must be determinized and have no dead states! 
+ */ + public static boolean sameLanguage(Automaton a1, Automaton a2) { + if (a1 == a2) { + return true; + } + return subsetOf(a2, a1) && subsetOf(a1, a2); + } + + /** + * Returns true if the language of a1 is a subset of the language of a2. + * Both automata must be determinized and must have no dead states. + * + *

    Complexity: quadratic in number of states. + */ + public static boolean subsetOf(Automaton a1, Automaton a2) { + if (a1.isDeterministic() == false) { + throw new IllegalArgumentException("a1 must be deterministic"); + } + if (a2.isDeterministic() == false) { + throw new IllegalArgumentException("a2 must be deterministic"); + } + assert Operations.hasDeadStatesFromInitial(a1) == false; + assert Operations.hasDeadStatesFromInitial(a2) == false; + if (a1.getNumStates() == 0) { + // Empty language is alwyas a subset of any other language + return true; + } else if (a2.getNumStates() == 0) { + return Operations.isEmpty(a1); + } + + // TODO: cutover to iterators instead + Transition[][] transitions1 = a1.getSortedTransitions(); + Transition[][] transitions2 = a2.getSortedTransitions(); + ArrayDeque worklist = new ArrayDeque<>(); + HashSet visited = new HashSet<>(); + StatePair p = new StatePair(0, 0); + worklist.add(p); + visited.add(p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { + return false; + } + Transition[] t1 = transitions1[p.s1]; + Transition[] t2 = transitions2[p.s2]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) { + b2++; + } + int min1 = t1[n1].min, max1 = t1[n1].max; + + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { + if (t2[n2].min > min1) { + return false; + } + if (t2[n2].max < Character.MAX_CODE_POINT) { + min1 = t2[n2].max + 1; + } else { + min1 = Character.MAX_CODE_POINT; + max1 = Character.MIN_CODE_POINT; + } + StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); + if (!visited.contains(q)) { + worklist.add(q); + visited.add(q); + } + } + if (min1 <= max1) { + return false; + } + } + } + return true; + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java index 521e8cbe6454..cbd2ac3f762b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java @@ -164,14 +164,7 @@ static IntsRef toIntsRef(BytesRef br, IntsRefBuilder ir) { } /** Holds one input/output pair. */ - public static class InputOutput implements Comparable> { - public final IntsRef input; - public final T output; - - public InputOutput(IntsRef input, T output) { - this.input = input; - this.output = output; - } + public record InputOutput(IntsRef input, T output) implements Comparable> { @Override public int compareTo(InputOutput other) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/hnsw/HnswTestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/hnsw/HnswTestUtil.java deleted file mode 100644 index 955665544bcc..000000000000 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/hnsw/HnswTestUtil.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.tests.util.hnsw; - -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - -import java.io.IOException; -import java.util.ArrayDeque; -import java.util.ArrayList; -import java.util.Deque; -import java.util.List; -import org.apache.lucene.codecs.hnsw.HnswGraphProvider; -import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; -import org.apache.lucene.index.CodecReader; -import org.apache.lucene.index.FilterLeafReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.hnsw.HnswGraph; - -/** Utilities for use in tests involving HNSW graphs */ -public class HnswTestUtil { - - /** - * Returns true iff level 0 of the graph is fully connected - that is every node is reachable from - * any entry point. - */ - public static boolean isFullyConnected(HnswGraph knnValues) throws IOException { - return componentSizes(knnValues).size() < 2; - } - - /** - * Returns the sizes of the distinct graph components on level 0. If the graph is fully-connected - * there will only be a single component. If the graph is empty, the returned list will be empty. - */ - public static List<Integer> componentSizes(HnswGraph hnsw) throws IOException { - List<Integer> sizes = new ArrayList<>(); - FixedBitSet connectedNodes = new FixedBitSet(hnsw.size()); - assert hnsw.size() == hnsw.getNodesOnLevel(0).size(); - int total = 0; - while (total < connectedNodes.length()) { - int componentSize = traverseConnectedNodes(hnsw, connectedNodes); - assert componentSize > 0; - sizes.add(componentSize); - total += componentSize; - } - return sizes; - } - - // count the nodes in a connected component of the graph and set the bits of its nodes in - // connectedNodes bitset - private static int traverseConnectedNodes(HnswGraph hnswGraph, FixedBitSet connectedNodes) - throws IOException { - // Start at entry point and search all nodes on this level - int entryPoint = nextClearBit(connectedNodes, 0); - if (entryPoint == NO_MORE_DOCS) { - return 0; - } - Deque<Integer> stack = new ArrayDeque<>(); - stack.push(entryPoint); - int count = 0; - while (!stack.isEmpty()) { - int node = stack.pop(); - if (connectedNodes.get(node)) { - continue; - } - count++; - connectedNodes.set(node); - hnswGraph.seek(0, node); - int friendOrd; - while ((friendOrd = hnswGraph.nextNeighbor()) != NO_MORE_DOCS) { - stack.push(friendOrd); - } - } - return count; - } - - private static int nextClearBit(FixedBitSet bits, int index) { - // Does not depend on the ghost bits being clear!
- long[] barray = bits.getBits(); - assert index >= 0 && index < bits.length() : "index=" + index + ", numBits=" + bits.length(); - int i = index >> 6; - long word = ~(barray[i] >> index); // skip all the bits to the right of index - - if (word != 0) { - return index + Long.numberOfTrailingZeros(word); - } - - while (++i < barray.length) { - word = ~barray[i]; - if (word != 0) { - int next = (i << 6) + Long.numberOfTrailingZeros(word); - if (next >= bits.length()) { - return NO_MORE_DOCS; - } else { - return next; - } - } - } - return NO_MORE_DOCS; - } - - public static boolean graphIsConnected(IndexReader reader, String vectorField) - throws IOException { - for (LeafReaderContext ctx : reader.leaves()) { - CodecReader codecReader = (CodecReader) FilterLeafReader.unwrap(ctx.reader()); - HnswGraph graph = - ((HnswGraphProvider) - ((PerFieldKnnVectorsFormat.FieldsReader) codecReader.getVectorReader()) - .getFieldReader(vectorField)) - .getGraph(vectorField); - if (isFullyConnected(graph) == false) { - return false; - } - } - return true; - } -} diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-1.geojson.gz b/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-1.geojson.gz new file mode 100644 index 000000000000..4b933e785342 Binary files /dev/null and b/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-1.geojson.gz differ diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-2.wkt.gz b/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-2.wkt.gz new file mode 100644 index 000000000000..18089bef04ea Binary files /dev/null and b/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-2.wkt.gz differ diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-3.wkt.gz b/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-3.wkt.gz new file mode 100644 index 000000000000..34f3f6dde225 Binary files /dev/null and b/lucene/test-framework/src/resources/org/apache/lucene/tests/geo/github-13841-3.wkt.gz differ diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt new file mode 100644 index 000000000000..fc1b33ae9b3e --- /dev/null +++ b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt @@ -0,0 +1,507 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from unicode conversion examples available at +http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright +from those sources: + +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. 
No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was +derived from Python 2.4.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/2.4.2/license/ + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from Python 3.1.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/3.1.2/license/ + +Some code in core/src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: + +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton +were automatically generated with the moman/finenight FSA package. 
+Here is the copyright for those sources: + +# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from ICU (http://www.icu-project.org) +The full license is available here: + https://github.com/unicode-org/icu/blob/main/icu4c/LICENSE + +/* + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. + */ + +The following license applies to the Snowball stemmers: + +Copyright (c) 2001, Dr Martin Porter +Copyright (c) 2002, Richard Boulton +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holders nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The following license applies to the KStemmer: + +Copyright © 2003, +Center for Intelligent Information Retrieval, +University of Massachusetts, Amherst. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. The names "Center for Intelligent Information Retrieval" and +"University of Massachusetts" must not be used to endorse or promote products +derived from this software without prior written permission. To obtain +permission, contact info@ciir.cs.umass.edu. + +THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +The following license applies to the Morfologik project: + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +The dictionary comes from Morfologik project. Morfologik uses data from +Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and +is licenced on the terms of (inter alia) LGPL and Creative Commons +ShareAlike. The part-of-speech tags were added in Morfologik project and +are not found in the data from sjp.pl. The tagset is similar to IPI PAN +tagset. + +--- + +The following license applies to the Morfeusz project, +used by org.apache.lucene.analysis.morfologik. + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, +Marcin Woliński, Robert Wołosz + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +core/src/java/org/apache/lucene/util/compress/LZ4.java is a Java +implementation of the LZ4 (https://github.com/lz4/lz4/tree/dev/lib) +compression format for Lucene's DataInput/DataOutput abstractions. + +LZ4 Library +Copyright (c) 2011-2016, Yann Collet +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt new file mode 100644 index 000000000000..ea6903484c0c --- /dev/null +++ b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2022 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. 
+See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. 
+ +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. 
+ +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. + +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/test-framework/src/test/org/apache/lucene/tests/search/TestPerThreadPKLookup.java b/lucene/test-framework/src/test/org/apache/lucene/tests/search/TestPerThreadPKLookup.java index 2136727838cf..7807626c462d 100644 --- a/lucene/test-framework/src/test/org/apache/lucene/tests/search/TestPerThreadPKLookup.java +++ b/lucene/test-framework/src/test/org/apache/lucene/tests/search/TestPerThreadPKLookup.java @@ -31,6 +31,94 @@ public class TestPerThreadPKLookup extends LuceneTestCase { + public void testReopen() throws Exception { + Directory dir = newDirectory(); + IndexWriter writer = + new IndexWriter( + dir, + new IndexWriterConfig(new MockAnalyzer(random())) + .setMergePolicy(NoMergePolicy.INSTANCE)); + + Document doc; + doc = new Document(); + doc.add(new KeywordField("PK", "1", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK", "2", Field.Store.NO)); + writer.addDocument(doc); + writer.flush(); + + // Terms in PK is null. + doc = new Document(); + doc.add(new KeywordField("PK2", "3", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK2", "4", Field.Store.NO)); + writer.addDocument(doc); + writer.flush(); + + DirectoryReader reader1 = DirectoryReader.open(writer); + PerThreadPKLookup pkLookup1 = new PerThreadPKLookup(reader1, "PK"); + + doc = new Document(); + doc.add(new KeywordField("PK", "5", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK", "6", Field.Store.NO)); + writer.addDocument(doc); + // Update liveDocs. 
+ writer.deleteDocuments(new Term("PK", "1")); + writer.flush(); + + // Terms in PK is null. + doc = new Document(); + doc.add(new KeywordField("PK2", "7", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK2", "8", Field.Store.NO)); + writer.addDocument(doc); + writer.flush(); + + assertEquals(0, pkLookup1.lookup(newBytesRef("1"))); + assertEquals(1, pkLookup1.lookup(newBytesRef("2"))); + assertEquals(-1, pkLookup1.lookup(newBytesRef("5"))); + assertEquals(-1, pkLookup1.lookup(newBytesRef("8"))); + DirectoryReader reader2 = DirectoryReader.openIfChanged(reader1); + PerThreadPKLookup pkLookup2 = pkLookup1.reopen(reader2); + + assertEquals(-1, pkLookup2.lookup(newBytesRef("1"))); + assertEquals(1, pkLookup2.lookup(newBytesRef("2"))); + assertEquals(4, pkLookup2.lookup(newBytesRef("5"))); + assertEquals(-1, pkLookup2.lookup(newBytesRef("8"))); + + doc = new Document(); + doc.add(new KeywordField("PK", "9", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK", "10", Field.Store.NO)); + writer.addDocument(doc); + writer.flush(); + + assertEquals(-1, pkLookup2.lookup(newBytesRef("9"))); + DirectoryReader reader3 = DirectoryReader.openIfChanged(reader2); + PerThreadPKLookup pkLookup3 = pkLookup2.reopen(reader3); + assertEquals(8, pkLookup3.lookup(newBytesRef("9"))); + + DirectoryReader reader4 = DirectoryReader.openIfChanged(reader3); + assertNull(pkLookup3.reopen(reader4)); + + writer.close(); + reader1.close(); + reader2.close(); + reader3.close(); + dir.close(); + } + public void testPKLookupWithUpdate() throws Exception { Directory dir = newDirectory(); IndexWriter writer = diff --git a/lucene/test-framework/src/test/org/apache/lucene/tests/util/TestFloatingPointUlpEquality.java b/lucene/test-framework/src/test/org/apache/lucene/tests/util/TestFloatingPointUlpEquality.java new file mode 100644 index 000000000000..ab1d99a83a13 --- /dev/null +++ b/lucene/test-framework/src/test/org/apache/lucene/tests/util/TestFloatingPointUlpEquality.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.tests.util; + +import static org.apache.lucene.tests.util.TestUtil.doubleUlpEquals; +import static org.apache.lucene.tests.util.TestUtil.floatUlpEquals; + +/** + * Tests for floating point equality utility methods. + * + *
<p>Adapted from org.apache.commons.numbers.core.PrecisionTest + * + * <p>github: https://github.com/apache/commons-numbers release 1.2 + */ +public class TestFloatingPointUlpEquality extends LuceneTestCase { + public static void testDoubleEqualsWithAllowedUlps() { + assertTrue(doubleUlpEquals(0.0, -0.0, 1)); + assertTrue(doubleUlpEquals(Double.MIN_VALUE, -0.0, 1)); + assertFalse(doubleUlpEquals(Double.MIN_VALUE, -Double.MIN_VALUE, 1)); + + assertTrue(doubleUlpEquals(1.0, 1 + Math.ulp(1d), 1)); + assertFalse(doubleUlpEquals(1.0, 1 + 2 * Math.ulp(1d), 1)); + + for (double value : new double[] {153.0, -128.0, 0.0, 1.0}) { + assertTrue(doubleUlpEquals(value, value, 1)); + assertTrue(doubleUlpEquals(value, Math.nextUp(value), 1)); + assertFalse(doubleUlpEquals(value, Math.nextUp(Math.nextUp(value)), 1)); + assertTrue(doubleUlpEquals(value, Math.nextDown(value), 1)); + assertFalse(doubleUlpEquals(value, Math.nextDown(Math.nextDown(value)), 1)); + assertFalse(doubleUlpEquals(value, value, -1)); + assertFalse(doubleUlpEquals(value, Math.nextUp(value), 0)); + assertTrue(doubleUlpEquals(value, Math.nextUp(Math.nextUp(value)), 2)); + assertTrue(doubleUlpEquals(value, Math.nextDown(Math.nextDown(value)), 2)); + } + + assertTrue(doubleUlpEquals(Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, 1)); + assertTrue(doubleUlpEquals(Double.MAX_VALUE, Double.POSITIVE_INFINITY, 1)); + + assertTrue(doubleUlpEquals(Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY, 1)); + assertTrue(doubleUlpEquals(-Double.MAX_VALUE, Double.NEGATIVE_INFINITY, 1)); + + assertFalse(doubleUlpEquals(Double.NaN, Double.NaN, 1)); + assertFalse(doubleUlpEquals(Double.NaN, Double.NaN, 0)); + assertFalse(doubleUlpEquals(Double.NaN, 0, 0)); + assertFalse(doubleUlpEquals(0, Double.NaN, 0)); + assertFalse(doubleUlpEquals(Double.NaN, Double.POSITIVE_INFINITY, 0)); + assertFalse(doubleUlpEquals(Double.NaN, Double.NEGATIVE_INFINITY, 0)); + + // Create a NaN representation 1 ulp above infinity. + // This hits not equal coverage for binary representations within the ulp but using NaN. + final double nan = + Double.longBitsToDouble(Double.doubleToRawLongBits(Double.POSITIVE_INFINITY) + 1); + assertFalse(doubleUlpEquals(nan, Double.POSITIVE_INFINITY, 1)); + assertFalse(doubleUlpEquals(Double.POSITIVE_INFINITY, nan, 1)); + + assertFalse( + doubleUlpEquals(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Integer.MAX_VALUE)); + assertFalse(doubleUlpEquals(0, Double.MAX_VALUE, Integer.MAX_VALUE)); + // Here: f == 5.304989477E-315; + // it is used to test the maximum ULP distance between two opposite sign numbers. + final double f = Double.longBitsToDouble(1L << 30); + assertFalse(doubleUlpEquals(-f, f, Integer.MAX_VALUE)); + assertTrue(doubleUlpEquals(-f, Math.nextDown(f), Integer.MAX_VALUE)); + assertTrue(doubleUlpEquals(Math.nextUp(-f), f, Integer.MAX_VALUE)); + // Maximum distance between same sign numbers.
+ final double f2 = Double.longBitsToDouble((1L << 30) + Integer.MAX_VALUE); + assertTrue(doubleUlpEquals(f, f2, Integer.MAX_VALUE)); + assertFalse(doubleUlpEquals(f, Math.nextUp(f2), Integer.MAX_VALUE)); + assertFalse(doubleUlpEquals(Math.nextDown(f), f2, Integer.MAX_VALUE)); + } + + public static void testFloatEqualsWithAllowedUlps() { + assertTrue(floatUlpEquals(0.0f, -0.0f, (short) 1)); + assertTrue(floatUlpEquals(Float.MIN_VALUE, -0.0f, (short) 1)); + assertFalse(floatUlpEquals(Float.MIN_VALUE, -Float.MIN_VALUE, (short) 1)); + + assertTrue(floatUlpEquals(1.0f, 1f + Math.ulp(1f), (short) 1)); + assertFalse(floatUlpEquals(1.0f, 1f + 2 * Math.ulp(1f), (short) 1)); + + for (float value : new float[] {153.0f, -128.0f, 0.0f, 1.0f}) { + assertTrue(floatUlpEquals(value, value, (short) 1)); + assertTrue(floatUlpEquals(value, Math.nextUp(value), (short) 1)); + assertFalse(floatUlpEquals(value, Math.nextUp(Math.nextUp(value)), (short) 1)); + assertTrue(floatUlpEquals(value, Math.nextDown(value), (short) 1)); + assertFalse(floatUlpEquals(value, Math.nextDown(Math.nextDown(value)), (short) 1)); + assertFalse(floatUlpEquals(value, value, (short) -1)); + assertFalse(floatUlpEquals(value, Math.nextUp(value), (short) 0)); + assertTrue(floatUlpEquals(value, Math.nextUp(Math.nextUp(value)), (short) 2)); + assertTrue(floatUlpEquals(value, Math.nextDown(Math.nextDown(value)), (short) 2)); + } + + assertTrue(floatUlpEquals(Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY, (short) 1)); + assertTrue(floatUlpEquals(Float.MAX_VALUE, Float.POSITIVE_INFINITY, (short) 1)); + + assertTrue(floatUlpEquals(Float.NEGATIVE_INFINITY, Float.NEGATIVE_INFINITY, (short) 1)); + assertTrue(floatUlpEquals(-Float.MAX_VALUE, Float.NEGATIVE_INFINITY, (short) 1)); + + assertFalse(floatUlpEquals(Float.NaN, Float.NaN, (short) 1)); + assertFalse(floatUlpEquals(Float.NaN, Float.NaN, (short) 0)); + assertFalse(floatUlpEquals(Float.NaN, 0, (short) 0)); + assertFalse(floatUlpEquals(0, Float.NaN, (short) 0)); + assertFalse(floatUlpEquals(Float.NaN, Float.POSITIVE_INFINITY, (short) 0)); + assertFalse(floatUlpEquals(Float.NaN, Float.NEGATIVE_INFINITY, (short) 0)); + + assertFalse(floatUlpEquals(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, (short) 32767)); + // The 31-bit integer specification of the max positive ULP allows an extremely + // large range of a 23-bit mantissa and 8-bit exponent + assertTrue(floatUlpEquals(0, Float.MAX_VALUE, (short) 32767)); + // Here: f == 2; + // it is used to test the maximum ULP distance between two opposite sign numbers. + final float f = Float.intBitsToFloat(1 << 30); + assertFalse(floatUlpEquals(-f, f, (short) 32767)); + assertTrue(floatUlpEquals(-f, Math.nextDown(f), (short) 32767)); + assertTrue(floatUlpEquals(Math.nextUp(-f), f, (short) 32767)); + // Maximum distance between same sign finite numbers is not possible as the upper + // limit is NaN. Check that it is not equal to anything. + final float f2 = Float.intBitsToFloat(Integer.MAX_VALUE); + assertEquals(Double.NaN, f2, 0); + assertFalse(floatUlpEquals(f2, Float.MAX_VALUE, (short) 32767)); + assertFalse(floatUlpEquals(f2, 0, (short) 32767)); + } +} diff --git a/versions.toml b/versions.toml index 6a137975d3e2..80dc51f39bf2 100644 --- a/versions.toml +++ b/versions.toml @@ -8,18 +8,18 @@ ecj = "3.36.0" errorprone = "2.18.0" flexmark = "0.61.24" # @keep This is GJF version for spotless/ tidy. 
-googleJavaFormat = "1.18.1" -groovy = "3.0.21" +googleJavaFormat = "1.23.0" +groovy = "4.0.22" hamcrest = "2.2" icu4j = "74.2" javacc = "7.0.12" jflex = "1.8.2" -jgit = "5.13.1.202206130422-r" +jgit = "6.10.0.202406032230-r" jmh = "1.37" jts = "1.17.0" junit = "4.13.1" # @keep Minimum gradle version to run the build -minGradle = "8.8" +minGradle = "8.10" # @keep This is the minimum required Java version. minJava = "21" morfologik = "2.1.9" @@ -49,7 +49,7 @@ flexmark-ext-abbreviation = { module = "com.vladsch.flexmark:flexmark-ext-abbrev flexmark-ext-attributes = { module = "com.vladsch.flexmark:flexmark-ext-attributes", version.ref = "flexmark" } flexmark-ext-autolink = { module = "com.vladsch.flexmark:flexmark-ext-autolink", version.ref = "flexmark" } flexmark-ext-tables = { module = "com.vladsch.flexmark:flexmark-ext-tables", version.ref = "flexmark" } -groovy = { module = "org.codehaus.groovy:groovy-all", version.ref = "groovy" } +groovy = { module = "org.apache.groovy:groovy-all", version.ref = "groovy" } hamcrest = { module = "org.hamcrest:hamcrest", version.ref = "hamcrest" } icu4j = { module = "com.ibm.icu:icu4j", version.ref = "icu4j" } javacc = { module = "net.java.dev.javacc:javacc", version.ref = "javacc" } @@ -76,10 +76,10 @@ zstd = { module = "com.github.luben:zstd-jni", version.ref = "zstd" } benmanes-versions = "com.github.ben-manes.versions:0.51.0" dependencychecks = "com.carrotsearch.gradle.dependencychecks:0.0.9" errorprone = "net.ltgt.errorprone:3.1.0" -forbiddenapis = "de.thetaphi.forbiddenapis:3.7" +forbiddenapis = "de.thetaphi.forbiddenapis:3.8" jacocolog = "org.barfuin.gradle.jacocolog:3.1.0" owasp-dependencycheck = "org.owasp.dependencycheck:7.2.0" randomizedtesting = "com.carrotsearch.gradle.randomizedtesting:0.0.6" -spotless = "com.diffplug.spotless:6.5.2" +spotless = "com.diffplug.spotless:6.9.1" undercouch-download = "de.undercouch.download:5.2.0" versionCatalogUpdate = "nl.littlerobots.version-catalog-update:0.8.4"
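
Editorial note on the ULP-based assertions exercised above: the new TestUtil helpers compare two floating-point values by how many representable numbers lie between them rather than by an absolute epsilon. The standalone Java snippet below is a hedged sketch of that idea only; it is not the TestUtil implementation from this patch, and the class and method names (UlpEqualitySketch, ulpEquals) are made up for illustration.

/** Minimal, self-contained sketch of ULP-distance equality (illustrative only, not Lucene's TestUtil code). */
public final class UlpEqualitySketch {

  /** Returns true when a and b are at most maxUlps representable doubles apart. */
  static boolean ulpEquals(double a, double b, int maxUlps) {
    if (Double.isNaN(a) || Double.isNaN(b)) {
      return false; // NaN never compares equal, matching the assertions in the test above
    }
    long x = Double.doubleToLongBits(a);
    long y = Double.doubleToLongBits(b);
    // Map the IEEE 754 sign/magnitude bit layout onto a single ordered integer line
    // so that adjacent representable doubles differ by exactly one.
    if (x < 0) {
      x = Long.MIN_VALUE - x;
    }
    if (y < 0) {
      y = Long.MIN_VALUE - y;
    }
    long delta = Math.max(x, y) - Math.min(x, y);
    // A negative delta means the subtraction overflowed, i.e. the values are astronomically far apart.
    return delta >= 0 && delta <= maxUlps;
  }

  public static void main(String[] args) {
    System.out.println(ulpEquals(1.0, 1.0 + Math.ulp(1.0), 1));     // true: one ulp apart
    System.out.println(ulpEquals(1.0, 1.0 + 2 * Math.ulp(1.0), 1)); // false: two ulps apart
    System.out.println(ulpEquals(0.0, -0.0, 1));                    // true: signed zeros coincide
    System.out.println(ulpEquals(Double.NaN, Double.NaN, 1));       // false: NaN is never equal
  }
}

Under that mapping, adjacent doubles differ by exactly one, so a budget of 1 ulp accepts nextUp/nextDown neighbors and rejects anything farther away, which is the behavior the assertions in TestFloatingPointUlpEquality check against the real helpers.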