Merge branch 'main' into skipbydocvalues
LuXugang authored Jul 21, 2023
2 parents 4b26a0e + 59c56a0 commit 56a51f7
Showing 112 changed files with 2,411 additions and 1,295 deletions.
4 changes: 2 additions & 2 deletions gradle/generation/forUtil.gradle
@@ -34,7 +34,7 @@ configure(project(":lucene:core")) {
quietExec {
workingDir genDir
executable project.externalTool("python3")
- args = []
+ args = [ '-B', genScript ]
}
}
}
@@ -59,7 +59,7 @@ configure(project(":lucene:backward-codecs")) {
quietExec {
workingDir genDir
executable project.externalTool("python3")
- args = []
+ args = [ '-B', genScript ]
}
}
}
5 changes: 5 additions & 0 deletions gradle/validation/forbidden-apis/defaults.all.txt
@@ -37,6 +37,11 @@ java.lang.Character#codePointAt(char[],int) @ Implicit end offset is error-prone
java.io.File#delete() @ use Files.delete for real exception, IOUtils.deleteFilesIgnoringExceptions if you dont care

java.util.Collections#shuffle(java.util.List) @ Use shuffle(List, Random) instead so that it can be reproduced
+ java.util.Stack @ Use more modern java.util.ArrayDeque as it is not synchronized
+ java.util.Vector @ Use more modern java.util.ArrayList as it is not synchronized
+
+ # TODO (needs some fix in forbiddenapis): this also hits java.util.Properties:
+ # java.util.Hashtable @ Use more modern java.util.HashMap as it is not synchronized

java.util.Locale#forLanguageTag(java.lang.String) @ use new Locale.Builder().setLanguageTag(...).build() which has error handling
java.util.Locale#toString() @ use Locale#toLanguageTag() for a standardized BCP47 locale name
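
These rules make the migration mechanical at most call sites. A hedged sketch of the Stack-to-ArrayDeque replacement the new entries mandate (class and variable names are illustrative; the TernaryTree change later in this commit follows the same pattern):

    import java.util.ArrayDeque;
    import java.util.Deque;

    public class StackMigration {
      public static void main(String[] args) {
        // java.util.Stack is synchronized and extends the legacy Vector;
        // ArrayDeque offers the same LIFO operations without locking overhead.
        Deque<String> stack = new ArrayDeque<>(); // was: Stack<String> stack = new Stack<>();
        stack.push("a");
        stack.push("b");
        String top = stack.peek();       // "b"
        String popped = stack.pop();     // "b"
        boolean empty = stack.isEmpty(); // was: stack.empty()
        System.out.println(top + " " + popped + " " + empty);
      }
    }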
39 changes: 34 additions & 5 deletions lucene/CHANGES.txt
@@ -55,6 +55,8 @@ API Changes

* GITHUB#12321: Reduced visibility of StringsToAutomaton. Please use Automata#makeStringUnion instead. (Greg Miller)

+ * GITHUB#12407: Removed Scorable#docID. (Adrien Grand)

New Features
---------------------

@@ -73,6 +75,8 @@ Improvements

* LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)

+ * GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)

Optimizations
---------------------

@@ -118,15 +122,22 @@ Other

API Changes
---------------------
- (No changes)

+ * GITHUB#11248: IntBlockPool's SliceReader, SliceWriter, and all int slice functionality are moved out to MemoryIndex.
+   (Stefan Vodita)

New Features
---------------------
- (No changes)

+ * GITHUB#12383: Introduced LeafCollector#finish, a hook that runs after
+   collection has finished running on a leaf. (Adrien Grand)
+
+ * LUCENE-8183, GITHUB#9231: Added the ability to get noSubMatches and noOverlappingMatches in
+   HyphenationCompoundWordTokenFilter (Martin Demberger, original from Rupert Westenthaler)

Improvements
---------------------
- (No changes)
+ * GITHUB#12374: Add CachingLeafSlicesSupplier to compute the LeafSlices for concurrent segment search (Sorabh Hamirwasia)

Optimizations
---------------------
@@ -136,22 +147,40 @@ Optimizations
* GITHUB#12361: Faster top-level disjunctions sorted by descending score.
(Adrien Grand)

* GITHUB#12444: Faster top-level disjunctions sorted by descending score in
case of many terms or queries that expose suboptimal score upper bounds.
(Adrien Grand)

* GITHUB#12383: Assign a dummy simScorer in TermsWeight if score is not needed. (Sagar Upadhyaya)

* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)

* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)

* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang)

* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang)


Bug Fixes
---------------------

- * GITHUB#9660: Throw and ArithmeticException when the offset overflows in a ByteBlockPool. (Stefan Vodita)
+ * GITHUB#9660: Throw an ArithmeticException when the offset overflows in a ByteBlockPool. (Stefan Vodita)

* GITHUB#12388: JoinUtil queries were ignoring boosts. (Alan Woodward)

* GITHUB#12413: Fix HNSW graph search bug that potentially leaked unapproved docs (Ben Trent).

* GITHUB#12423: Respect timeouts in ExitableDirectoryReader when searching with byte[] vectors (Ben Trent).

Other
---------------------
- (No changes)

+ * GITHUB#12404: Remove usage of some legacy java.util classes and add them to forbiddenapis (Stack, Hashtable, Vector).
+   (Uwe Schindler)
+
+ * GITHUB#12410: Refactor vectorization support (split provider from implementation classes).
+   (Uwe Schindler, Chris Hegarty)

======================== Lucene 9.7.0 =======================

12 changes: 12 additions & 0 deletions lucene/MIGRATE.md
@@ -143,6 +143,18 @@ Lucene 9.2 or stay with 9.0.

See LUCENE-10558 for more details and workarounds.

### Removed Scorable#docID() (GITHUB#12407)

This method has been removed in order to enable more search-time optimizations.
Use the doc ID passed to `LeafCollector#collect` to know which doc ID is being
collected.
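
As an illustration, a minimal sketch of a `LeafCollector` written against the new API (the collector class and its aggregation logic are hypothetical, not part of this change):

    import java.io.IOException;
    import org.apache.lucene.search.LeafCollector;
    import org.apache.lucene.search.Scorable;

    // Illustrative collector: the current doc ID is taken from the
    // collect(int) argument, since Scorable#docID() no longer exists.
    class MaxScoreCollector implements LeafCollector {
      private Scorable scorer;
      private int maxScoreDoc = -1;
      private float maxScore = Float.NEGATIVE_INFINITY;

      @Override
      public void setScorer(Scorable scorer) {
        this.scorer = scorer;
      }

      @Override
      public void collect(int doc) throws IOException {
        float score = scorer.score();
        if (score > maxScore) {
          maxScore = score;
          maxScoreDoc = doc; // was: scorer.docID()
        }
      }
    }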

### ScoreCachingWrappingScorer now wraps a LeafCollector instead of a Scorable (GITHUB#12407)

In order to adapt to the removal of `Scorable#docID()`,
`ScoreCachingWrappingScorer` now wraps a `LeafCollector` rather than a
`Scorable`.
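
A hedged sketch of the corresponding call-site update, assuming the static `wrap` factory keeps its name after this change:

    import org.apache.lucene.search.LeafCollector;
    import org.apache.lucene.search.ScoreCachingWrappingScorer;

    class ScoreCachingExample {
      // Wraps a LeafCollector so that Scorable#score() is computed at most
      // once per collected doc; before this change, the Scorable passed to
      // LeafCollector#setScorer was wrapped instead.
      static LeafCollector cached(LeafCollector inner) {
        return ScoreCachingWrappingScorer.wrap(inner);
      }
    }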

## Migration from Lucene 8.x to Lucene 9.0

### Rename of binary artifacts from '**-analyzers-**' to '**-analysis-**' (LUCENE-9562)
lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
@@ -34,6 +34,9 @@
   */
  public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
    private final HyphenationTree hyphenator;
+   private final boolean noSubMatches;
+   private final boolean noOverlappingMatches;
+   private final boolean calcSubMatches;

    /**
     * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
@@ -51,6 +54,8 @@ public HyphenationCompoundWordTokenFilter(
          DEFAULT_MIN_WORD_SIZE,
          DEFAULT_MIN_SUBWORD_SIZE,
          DEFAULT_MAX_SUBWORD_SIZE,
+         false,
+         false,
          false);
    }

@@ -73,9 +78,47 @@ public HyphenationCompoundWordTokenFilter(
        int minSubwordSize,
        int maxSubwordSize,
        boolean onlyLongestMatch) {
+     this(
+         input,
+         hyphenator,
+         dictionary,
+         minWordSize,
+         minSubwordSize,
+         maxSubwordSize,
+         onlyLongestMatch,
+         false,
+         false);
+   }
+
+   /**
+    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
+    *
+    * @param input the {@link org.apache.lucene.analysis.TokenStream} to process
+    * @param hyphenator the hyphenation pattern tree to use for hyphenation
+    * @param dictionary the word dictionary to match against.
+    * @param minWordSize only words longer than this get processed
+    * @param minSubwordSize only subwords longer than this get to the output stream
+    * @param maxSubwordSize only subwords shorter than this get to the output stream
+    * @param onlyLongestMatch Add only the longest matching subword to the stream
+    * @param noSubMatches Excludes subwords that are enclosed by another token
+    * @param noOverlappingMatches Excludes subwords that overlap with another subword
+    */
+   public HyphenationCompoundWordTokenFilter(
+       TokenStream input,
+       HyphenationTree hyphenator,
+       CharArraySet dictionary,
+       int minWordSize,
+       int minSubwordSize,
+       int maxSubwordSize,
+       boolean onlyLongestMatch,
+       boolean noSubMatches,
+       boolean noOverlappingMatches) {
      super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);

-     this.hyphenator = hyphenator;
+     this.hyphenator = Objects.requireNonNull(hyphenator, "hyphenator");
+     this.noSubMatches = noSubMatches;
+     this.noOverlappingMatches = noOverlappingMatches;
+     this.calcSubMatches = !onlyLongestMatch && !noSubMatches && !noOverlappingMatches;
    }
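
For context, a hedged usage sketch of the new nine-argument constructor (resource names and the surrounding method are illustrative, not part of this diff):

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
    import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
    import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
    import org.xml.sax.InputSource;

    class DecompoundExample {
      static TokenStream decompound(TokenStream input, CharArraySet dictionary) throws Exception {
        // "de_DR.xml" stands in for any FOP/OFFO hyphenation grammar file.
        HyphenationTree hyphenator =
            HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource("de_DR.xml"));
        return new HyphenationCompoundWordTokenFilter(
            input,
            hyphenator,
            dictionary,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            false,  // onlyLongestMatch
            true,   // noSubMatches: skip subwords enclosed by a longer match
            false); // noOverlappingMatches
      }
    }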

/**
@@ -140,69 +183,71 @@ public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)

  @Override
  protected void decompose() {
+   // if the token is in the dictionary and we are not interested in subMatches
+   // we can skip decomposing this token (see testNoSubAndTokenInDictionary unit test)
+   // NOTE:
+   // we check against token and the token that is one character
+   // shorter to avoid problems with genitive 's characters and other binding characters
+   if (dictionary != null
+       && !this.calcSubMatches
+       && (dictionary.contains(termAtt.buffer(), 0, termAtt.length())
+           || termAtt.length() > 1
+               && dictionary.contains(termAtt.buffer(), 0, termAtt.length() - 1))) {
+     return; // the whole token is in the dictionary - do not decompose
+   }
+
    // get the hyphenation points
    Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
    // No hyphen points found -> exit
    if (hyphens == null) {
      return;
    }
+   int maxSubwordSize = Math.min(this.maxSubwordSize, termAtt.length() - 1);
+
+   int consumed = -1; // hyp of the longest token added (for noSub)

    final int[] hyp = hyphens.getHyphenationPoints();

    for (int i = 0; i < hyp.length; ++i) {
-     int remaining = hyp.length - i;
+     if (noOverlappingMatches) { // if we do not want overlapping subwords
+       i = Math.max(i, consumed); // skip over consumed hyp
+     }
      int start = hyp[i];
-     CompoundToken longestMatchToken = null;
-     for (int j = 1; j < remaining; j++) {
-       int partLength = hyp[i + j] - start;
+     int until = noSubMatches ? Math.max(consumed, i) : i;
+     for (int j = hyp.length - 1; j > until; j--) {
+       int partLength = hyp[j] - start;

        // if the part is longer than maxSubwordSize we
        // are done with this round
-       if (partLength > this.maxSubwordSize) {
-         break;
+       if (partLength > maxSubwordSize) {
+         continue;
        }

        // we only put subwords to the token stream
        // that are longer than minPartSize
        if (partLength < this.minSubwordSize) {
-         // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
-         // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
-         continue;
+         break;
        }

        // check the dictionary
        if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
-         if (this.onlyLongestMatch) {
-           if (longestMatchToken != null) {
-             if (longestMatchToken.txt.length() < partLength) {
-               longestMatchToken = new CompoundToken(start, partLength);
-             }
-           } else {
-             longestMatchToken = new CompoundToken(start, partLength);
-           }
-         } else {
-           tokens.add(new CompoundToken(start, partLength));
-         }
+         tokens.add(new CompoundToken(start, partLength));
+         consumed = j; // mark the current hyp as consumed
+         if (!calcSubMatches) {
+           break; // do not search for shorter matches
+         }
        } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
          // check the dictionary again with a word that is one character
-         // shorter
-         // to avoid problems with genitive 's characters and other binding
-         // characters
-         if (this.onlyLongestMatch) {
-           if (longestMatchToken != null) {
-             if (longestMatchToken.txt.length() < partLength - 1) {
-               longestMatchToken = new CompoundToken(start, partLength - 1);
-             }
-           } else {
-             longestMatchToken = new CompoundToken(start, partLength - 1);
-           }
-         } else {
-           tokens.add(new CompoundToken(start, partLength - 1));
-         }
-       }
+         // shorter to avoid problems with genitive 's characters and
+         // other binding characters
+         tokens.add(new CompoundToken(start, partLength - 1));
+         consumed = j; // mark the current hyp as consumed
+         if (!calcSubMatches) {
+           break; // do not search for shorter matches
+         }
+       } // else dictionary is present but does not contain the part
      }
-     if (this.onlyLongestMatch && longestMatchToken != null) {
-       tokens.add(longestMatchToken);
-     }
    }
  }
Expand Down
lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java
@@ -77,6 +77,8 @@ public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactory
    private final int minSubwordSize;
    private final int maxSubwordSize;
    private final boolean onlyLongestMatch;
+   private final boolean noSubMatches;
+   private final boolean noOverlappingMatches;

    /** Creates a new HyphenationCompoundWordTokenFilterFactory */
    public HyphenationCompoundWordTokenFilterFactory(Map<String, String> args) {
@@ -90,6 +92,8 @@ public HyphenationCompoundWordTokenFilterFactory(Map<String, String> args) {
      maxSubwordSize =
          getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
      onlyLongestMatch = getBoolean(args, "onlyLongestMatch", false);
+     noSubMatches = getBoolean(args, "noSubMatches", false);
+     noOverlappingMatches = getBoolean(args, "noOverlappingMatches", false);
      if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
      }
@@ -127,6 +131,8 @@ public TokenFilter create(TokenStream input) {
          minWordSize,
          minSubwordSize,
          maxSubwordSize,
-         onlyLongestMatch);
+         onlyLongestMatch,
+         noSubMatches,
+         noOverlappingMatches);
    }
  }
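
And a hedged sketch of enabling the new flags through this factory via `CustomAnalyzer` (the SPI name "hyphenationCompoundWord" and the resource file names are assumptions based on the factory's conventions):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;

    class FactoryExample {
      static Analyzer build() throws Exception {
        return CustomAnalyzer.builder()
            .withTokenizer("standard")
            .addTokenFilter(
                "hyphenationCompoundWord",  // assumed SPI name of this factory
                "hyphenator", "de_DR.xml",  // illustrative classpath resources
                "dictionary", "dictionary.txt",
                "onlyLongestMatch", "false",
                "noSubMatches", "true",            // new parameter
                "noOverlappingMatches", "false")   // new parameter
            .build();
      }
    }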
lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java
@@ -17,8 +17,9 @@
  package org.apache.lucene.analysis.compound.hyphenation;

  import java.io.PrintStream;
+ import java.util.ArrayDeque;
+ import java.util.Deque;
  import java.util.Enumeration;
- import java.util.Stack;

  /**
   *
@@ -457,20 +458,20 @@ public Item clone() {
    }

    /** Node stack */
-   Stack<Item> ns;
+   Deque<Item> ns;

    /** key stack implemented with a StringBuilder */
    StringBuilder ks;

    public Iterator() {
      cur = -1;
-     ns = new Stack<>();
+     ns = new ArrayDeque<>();
      ks = new StringBuilder();
      rewind();
    }

    public void rewind() {
-     ns.removeAllElements();
+     ns.clear();
      ks.setLength(0);
      cur = root;
      run();
@@ -501,7 +502,7 @@ private int up() {
      Item i = new Item();
      int res = 0;

-     if (ns.empty()) {
+     if (ns.isEmpty()) {
        return -1;
      }
@@ -538,7 +539,7 @@ private int up() {
            break;

          default:
-           if (ns.empty()) {
+           if (ns.isEmpty()) {
              return -1;
            }
            climb = true;