From 0da782d6377aad2d1527643c34b513543e69f184 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Sat, 30 Mar 2024 15:26:15 +0800 Subject: [PATCH 01/16] Avoid reload block. --- .../lucene90/blocktree/SegmentTermsEnum.java | 11 +++- .../blocktree/SegmentTermsEnumFrame.java | 55 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 479736099ef2..887d51e92365 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -309,6 +309,7 @@ private boolean setEOF() { @Override public boolean seekExact(BytesRef target) throws IOException { + int originEndCount = -1; if (fr.index == null) { throw new IllegalStateException("terms index was not loaded"); } @@ -435,7 +436,9 @@ public boolean seekExact(BytesRef target) throws IOException { // rewind frame ord=" + lastFrame.ord); // } currentFrame = lastFrame; - currentFrame.rewind(); + originEndCount = currentFrame.entCount; + currentFrame.rewind2(); + } else { // Target is exactly the same as current term assert term.length() == target.length; @@ -519,6 +522,9 @@ public boolean seekExact(BytesRef target) throws IOException { currentFrame.loadBlock(); final SeekStatus result = currentFrame.scanToTerm(target, true); + if (originEndCount != -1) { + currentFrame.entCount = originEndCount; + } if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); @@ -574,6 +580,9 @@ public boolean seekExact(BytesRef target) throws IOException { currentFrame.loadBlock(); final SeekStatus result = currentFrame.scanToTerm(target, true); + if (originEndCount != -1) { + currentFrame.entCount = originEndCount; + } if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index 66231313e520..eae8d38585be 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -284,6 +284,61 @@ void rewind() { */ } + // Only rewind, don't force reload block. + // Reset reader position, don't read, decompress. + // Current term greater than target, reduce endCount. + void rewind2() { + + // Force reload: + fp = fpOrig; + // TODO: Reset entCount after this seek. + entCount = nextEnt; + nextEnt = 0; + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.setPosition(rewindPos); + numFollowFloorBlocks = floorDataReader.readVInt(); + assert numFollowFloorBlocks > 0; + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + + suffixesReader.setPosition(0); + suffixLengthsReader.setPosition(0); + /* + //System.out.println("rewind"); + // Keeps the block loaded, but rewinds its state: + if (nextEnt > 0 || fp != fpOrig) { + if (DEBUG) { + System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix); + } + if (fp != fpOrig) { + fp = fpOrig; + nextEnt = -1; + } else { + nextEnt = 0; + } + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.rewind(); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + assert suffixBytes != null; + suffixesReader.rewind(); + assert statBytes != null; + statsReader.rewind(); + metaDataUpto = 0; + state.termBlockOrd = 0; + // TODO: skip this if !hasTerms? Then postings + // impl wouldn't have to write useless 0 byte + postingsReader.resetTermsBlock(fieldInfo, state); + lastSubFP = -1; + } else if (DEBUG) { + System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord); + } + */ + } + // Decodes next entry; returns true if it's a sub-block public boolean next() throws IOException { if (isLeafBlock) { From d8ec44ee4ed5dac6c439a4146f64ae3395f2567e Mon Sep 17 00:00:00 2001 From: zhouhui Date: Mon, 8 Apr 2024 10:41:08 +0800 Subject: [PATCH 02/16] Rewind without reload for non-floor block or first floor block. --- .../lucene90/blocktree/SegmentTermsEnum.java | 26 +++++--- .../blocktree/SegmentTermsEnumFrame.java | 36 ++++++----- .../lucene99/TestLucene99PostingsFormat.java | 60 +++++++++++++++++-- .../index/BasePostingsFormatTestCase.java | 60 +++++++++++++++++++ 4 files changed, 154 insertions(+), 28 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 887d51e92365..1633e8f45b60 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -309,7 +309,6 @@ private boolean setEOF() { @Override public boolean seekExact(BytesRef target) throws IOException { - int originEndCount = -1; if (fr.index == null) { throw new IllegalStateException("terms index was not loaded"); } @@ -436,8 +435,21 @@ public boolean seekExact(BytesRef target) throws IOException { // rewind frame ord=" + lastFrame.ord); // } currentFrame = lastFrame; - originEndCount = currentFrame.entCount; - currentFrame.rewind2(); + + // Only rewindWithoutReload for non-floor block or first floor block. + // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for + // non-first floor blocks. + if (currentFrame.fp != currentFrame.fpOrig + || currentFrame.entCount == 0 + || currentFrame.nextEnt == -1) { + currentFrame.rewind(); + } else { + // Since target greater than current term, we could reduce entCount to nextEnt, and + // revert it after scanToTerm. + // origEndCount = currentFrame.entCount; + // currentFrame.entCount = currentFrame.nextEnt; + currentFrame.rewindWithoutReload(); + } } else { // Target is exactly the same as current term @@ -522,9 +534,7 @@ public boolean seekExact(BytesRef target) throws IOException { currentFrame.loadBlock(); final SeekStatus result = currentFrame.scanToTerm(target, true); - if (originEndCount != -1) { - currentFrame.entCount = originEndCount; - } + if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); @@ -580,9 +590,7 @@ public boolean seekExact(BytesRef target) throws IOException { currentFrame.loadBlock(); final SeekStatus result = currentFrame.scanToTerm(target, true); - if (originEndCount != -1) { - currentFrame.entCount = originEndCount; - } + if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index eae8d38585be..7b47357a0b0e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -285,25 +285,31 @@ void rewind() { } // Only rewind, don't force reload block. - // Reset reader position, don't read, decompress. + // Reset readers' position, don't read, decompress. // Current term greater than target, reduce endCount. - void rewind2() { - - // Force reload: - fp = fpOrig; - // TODO: Reset entCount after this seek. - entCount = nextEnt; + void rewindWithoutReload() { nextEnt = 0; - hasTerms = hasTermsOrig; - if (isFloor) { - floorDataReader.setPosition(rewindPos); - numFollowFloorBlocks = floorDataReader.readVInt(); - assert numFollowFloorBlocks > 0; - nextFloorLabel = floorDataReader.readByte() & 0xff; - } - suffixesReader.setPosition(0); suffixLengthsReader.setPosition(0); + statsReader.setPosition(0); + bytesReader.setPosition(0); + + // TODO: Since we only rewind without reload for fist floor(currentFrame.fp == + // currentFrame.fpOrig) + // So no need to set floorDataReader again? + // if (isFloor) { + // floorDataReader.setPosition(rewindPos); + // numFollowFloorBlocks = floorDataReader.readVInt(); + // assert numFollowFloorBlocks > 0; + // nextFloorLabel = floorDataReader.readByte() & 0xff; + // } + + metaDataUpto = 0; + + statsSingletonRunLength = 0; + state.termBlockOrd = 0; + lastSubFP = -1; + // state.termBlockOrd = 0; /* //System.out.println("rewind"); // Keeps the block loaded, but rewinds its state: diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java index 99c0e0a6ae28..c09085d09eba 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java @@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene99; import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts; +import static org.apache.lucene.tests.util.TestUtil.alwaysPostingsFormat; +import static org.apache.lucene.tests.util.TestUtil.getDefaultPostingsFormat; import java.io.IOException; import java.util.Arrays; @@ -24,15 +26,13 @@ import java.util.List; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.Impact; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.*; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -41,6 +41,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BasePostingsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase { private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat()); @@ -143,4 +144,55 @@ private void doTestImpactSerialization(List impacts) throws IOException } } } + + public void testFloorBlocks() throws Exception { + Directory dir = newDirectory(); + // Set minTermBlockSize to 2, maxTermBlockSize to 3, to generate deep subBlock. + PostingsFormat postingsFormat = getDefaultPostingsFormat(2, 3); + + IndexWriter writer = + new IndexWriter(dir, newIndexWriterConfig().setCodec(alwaysPostingsFormat(postingsFormat))); + String[] categories = + new String[] { + "regular", "request1", "request2", "request3", "request4", "rest", "teacher", "team" + }; + + for (String category : categories) { + Document doc = new Document(); + doc.add(newStringField("category", category, Field.Store.YES)); + writer.addDocument(doc); + } + + IndexReader reader = DirectoryReader.open(writer); + + TermsEnum termsEnum = getOnlyLeafReader(reader).terms("category").iterator(); + + BytesRef target = new BytesRef("request2"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request3"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + writer.close(); + reader.close(); + dir.close(); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java index 45e8bd8e858b..71399836b124 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java @@ -369,6 +369,66 @@ public void testGhosts() throws Exception { dir.close(); } + public void testBinarySearchTermLeaf() throws Exception { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = newIndexWriterConfig(null); + iwc.setCodec(getCodec()); + iwc.setMergePolicy(newTieredMergePolicy()); + IndexWriter iw = new IndexWriter(dir, iwc); + + for (int i = 100000; i <= 100400; i++) { + // only add odd number + if (i % 2 == 1) { + Document document = new Document(); + document.add(new StringField("id", i + "", Field.Store.NO)); + iw.addDocument(document); + } + } + iw.commit(); + iw.forceMerge(1); + + DirectoryReader reader = DirectoryReader.open(iw); + TermsEnum termsEnum = getOnlyLeafReader(reader).terms("id").iterator(); + + BytesRef target1 = new BytesRef(100003 + ""); + assertTrue(termsEnum.seekExact(target1)); + assertEquals(termsEnum.term(), target1); + BytesRef target2 = new BytesRef(100001 + ""); + assertTrue(termsEnum.seekExact(target2)); + assertEquals(termsEnum.term(), target2); + + // test seekExact + for (int i = 100000; i <= 100400; i++) { + BytesRef target = new BytesRef(i + ""); + if (i % 2 == 1) { + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + } else { + assertFalse(termsEnum.seekExact(target)); + } + } + + // test seekCeil + for (int i = 100000; i < 100400; i++) { + BytesRef target = new BytesRef(i + ""); + if (i % 2 == 1) { + assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + if (i <= 100397) { + assertEquals(new BytesRef(i + 2 + ""), termsEnum.next()); + } + } else { + assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(target)); + assertEquals(new BytesRef(i + 1 + ""), termsEnum.term()); + } + } + assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef(100400 + ""))); + reader.close(); + iw.close(); + dir.close(); + } + // tests that level 2 ghost fields still work public void testLevel2Ghosts() throws Exception { Directory dir = newDirectory(); From 95fd6e0113b0543c11a7c322d8ffc8cfeca4ce30 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Fri, 12 Apr 2024 15:45:04 +0800 Subject: [PATCH 03/16] Reduce entCount when target less than last term in same frame. --- .../lucene90/blocktree/SegmentTermsEnum.java | 41 +++++++++++++++++-- .../blocktree/SegmentTermsEnumFrame.java | 32 +++++++++++++++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 1633e8f45b60..1a594b29a0af 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -309,6 +309,8 @@ private boolean setEOF() { @Override public boolean seekExact(BytesRef target) throws IOException { + long withOutReloadFp = -1; + int origNextEnt = -1; if (fr.index == null) { throw new IllegalStateException("terms index was not loaded"); } @@ -434,6 +436,10 @@ public boolean seekExact(BytesRef target) throws IOException { // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); // rewind frame ord=" + lastFrame.ord); // } + + // If current frame changed, we can't reduce entCount. since target just less than different + // frame's last term. + boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; // Only rewindWithoutReload for non-floor block or first floor block. @@ -444,10 +450,15 @@ public boolean seekExact(BytesRef target) throws IOException { || currentFrame.nextEnt == -1) { currentFrame.rewind(); } else { - // Since target greater than current term, we could reduce entCount to nextEnt, and + // Since target greater than last term, and stay on same frame with last term, we can + // reduce entCount + // to nextEnt, and // revert it after scanToTerm. - // origEndCount = currentFrame.entCount; - // currentFrame.entCount = currentFrame.nextEnt; + if (currentIsLast) { + origNextEnt = currentFrame.nextEnt; + withOutReloadFp = currentFrame.fp; + } + // TODO: take this to seekCeil. currentFrame.rewindWithoutReload(); } @@ -533,8 +544,20 @@ public boolean seekExact(BytesRef target) throws IOException { currentFrame.loadBlock(); + // We still stay on withOutReload frame, reduce entCount to nextEnt. + int origEntCount = -1; + if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { + origEntCount = currentFrame.entCount; + currentFrame.entCount = origNextEnt; + } + final SeekStatus result = currentFrame.scanToTerm(target, true); + // Revert entCount to origEntCount. + if (origEntCount != -1) { + currentFrame.entCount = origEntCount; + } + if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); @@ -589,8 +612,20 @@ public boolean seekExact(BytesRef target) throws IOException { currentFrame.loadBlock(); + // We still stay on withOutReload frame, reduce entCount to nextEnt. + int origEntCount = -1; + if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { + origEntCount = currentFrame.entCount; + currentFrame.entCount = origNextEnt; + } + final SeekStatus result = currentFrame.scanToTerm(target, true); + // Revert entCount to origEntCount. + if (origEntCount != -1) { + currentFrame.entCount = origEntCount; + } + if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index 7b47357a0b0e..9a1e8df5f7e7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -288,6 +288,7 @@ void rewind() { // Reset readers' position, don't read, decompress. // Current term greater than target, reduce endCount. void rewindWithoutReload() { + // Set nextEnt to 0, to prevent force load. nextEnt = 0; suffixesReader.setPosition(0); suffixLengthsReader.setPosition(0); @@ -817,4 +818,35 @@ private void fillTerm() { ste.term.grow(termLength); System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix); } + + // Used for debugging. + @Override + public String toString() { + return "fp: " + + fp + + ", fpOrig: " + + fpOrig + + ", fpEnd: " + + fpEnd + + ", lastSubFP: " + + lastSubFP + + ", entCount: " + + entCount + + ", nextEnt: " + + nextEnt + + ", isLeafBlock: " + + isLeafBlock + + ", isFloor: " + + isFloor + + ", isLastInFloor: " + + isLastInFloor + + ", nextFloorLabel: " + + nextFloorLabel + + ", suffixesPos: " + + suffixesReader.getPosition() + + ", suffixLengthsPos: " + + suffixLengthsReader.getPosition() + + ", floorDataPos: " + + floorDataReader.getPosition(); + } } From 60f3d2654c249712e6461de4d6dda8f6f465cfdb Mon Sep 17 00:00:00 2001 From: zhouhui Date: Fri, 12 Apr 2024 17:15:05 +0800 Subject: [PATCH 04/16] Revert BasePostingsFormatTestCase, TestLucene99PostingsFormat to resolve conflicts. --- .../lucene99/TestLucene99PostingsFormat.java | 60 ++---------------- .../index/BasePostingsFormatTestCase.java | 62 +------------------ 2 files changed, 5 insertions(+), 117 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java index c09085d09eba..99c0e0a6ae28 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java @@ -17,8 +17,6 @@ package org.apache.lucene.codecs.lucene99; import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts; -import static org.apache.lucene.tests.util.TestUtil.alwaysPostingsFormat; -import static org.apache.lucene.tests.util.TestUtil.getDefaultPostingsFormat; import java.io.IOException; import java.util.Arrays; @@ -26,13 +24,15 @@ import java.util.List; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; -import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.*; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -41,7 +41,6 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BasePostingsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; -import org.apache.lucene.util.BytesRef; public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase { private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat()); @@ -144,55 +143,4 @@ private void doTestImpactSerialization(List impacts) throws IOException } } } - - public void testFloorBlocks() throws Exception { - Directory dir = newDirectory(); - // Set minTermBlockSize to 2, maxTermBlockSize to 3, to generate deep subBlock. - PostingsFormat postingsFormat = getDefaultPostingsFormat(2, 3); - - IndexWriter writer = - new IndexWriter(dir, newIndexWriterConfig().setCodec(alwaysPostingsFormat(postingsFormat))); - String[] categories = - new String[] { - "regular", "request1", "request2", "request3", "request4", "rest", "teacher", "team" - }; - - for (String category : categories) { - Document doc = new Document(); - doc.add(newStringField("category", category, Field.Store.YES)); - writer.addDocument(doc); - } - - IndexReader reader = DirectoryReader.open(writer); - - TermsEnum termsEnum = getOnlyLeafReader(reader).terms("category").iterator(); - - BytesRef target = new BytesRef("request2"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request1"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request4"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request3"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request4"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request1"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - writer.close(); - reader.close(); - dir.close(); - } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java index 71399836b124..da6ac9f67dbc 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java @@ -369,66 +369,6 @@ public void testGhosts() throws Exception { dir.close(); } - public void testBinarySearchTermLeaf() throws Exception { - Directory dir = newDirectory(); - - IndexWriterConfig iwc = newIndexWriterConfig(null); - iwc.setCodec(getCodec()); - iwc.setMergePolicy(newTieredMergePolicy()); - IndexWriter iw = new IndexWriter(dir, iwc); - - for (int i = 100000; i <= 100400; i++) { - // only add odd number - if (i % 2 == 1) { - Document document = new Document(); - document.add(new StringField("id", i + "", Field.Store.NO)); - iw.addDocument(document); - } - } - iw.commit(); - iw.forceMerge(1); - - DirectoryReader reader = DirectoryReader.open(iw); - TermsEnum termsEnum = getOnlyLeafReader(reader).terms("id").iterator(); - - BytesRef target1 = new BytesRef(100003 + ""); - assertTrue(termsEnum.seekExact(target1)); - assertEquals(termsEnum.term(), target1); - BytesRef target2 = new BytesRef(100001 + ""); - assertTrue(termsEnum.seekExact(target2)); - assertEquals(termsEnum.term(), target2); - - // test seekExact - for (int i = 100000; i <= 100400; i++) { - BytesRef target = new BytesRef(i + ""); - if (i % 2 == 1) { - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - } else { - assertFalse(termsEnum.seekExact(target)); - } - } - - // test seekCeil - for (int i = 100000; i < 100400; i++) { - BytesRef target = new BytesRef(i + ""); - if (i % 2 == 1) { - assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - if (i <= 100397) { - assertEquals(new BytesRef(i + 2 + ""), termsEnum.next()); - } - } else { - assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(target)); - assertEquals(new BytesRef(i + 1 + ""), termsEnum.term()); - } - } - assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef(100400 + ""))); - reader.close(); - iw.close(); - dir.close(); - } - // tests that level 2 ghost fields still work public void testLevel2Ghosts() throws Exception { Directory dir = newDirectory(); @@ -1679,7 +1619,7 @@ public void testLineFileDocs() throws IOException { // Use a FS dir and a non-randomized IWC to not slow down indexing try (Directory dir = newFSDirectory(createTempDir())) { try (LineFileDocs docs = new LineFileDocs(random()); - IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) { + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) { final int numDocs = atLeast(10_000); for (int i = 0; i < numDocs; ++i) { // Only keep the body field, and don't index term vectors on it, we only care about From 3969078527e6ae94626bac7cf7286bf9bf141c71 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Fri, 12 Apr 2024 17:17:54 +0800 Subject: [PATCH 05/16] Add testFloorBlocks case. --- .../lucene99/TestLucene99PostingsFormat.java | 54 +++++++++++++++++++ .../index/BasePostingsFormatTestCase.java | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java index 341805e8a3e4..0e829d21dec2 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java @@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene99; import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts; +import static org.apache.lucene.tests.util.TestUtil.alwaysPostingsFormat; +import static org.apache.lucene.tests.util.TestUtil.getDefaultPostingsFormat; import java.io.IOException; import java.util.Arrays; @@ -24,6 +26,7 @@ import java.util.List; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList; @@ -150,4 +153,55 @@ protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception { assertEquals(TermsEnum.SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("10004a"))); assertEquals(termsEnum.term(), new BytesRef("100051")); } + + public void testFloorBlocks() throws Exception { + Directory dir = newDirectory(); + // Set minTermBlockSize to 2, maxTermBlockSize to 3, to generate deep subBlock. + PostingsFormat postingsFormat = getDefaultPostingsFormat(2, 3); + + IndexWriter writer = + new IndexWriter(dir, newIndexWriterConfig().setCodec(alwaysPostingsFormat(postingsFormat))); + String[] categories = + new String[] { + "regular", "request1", "request2", "request3", "request4", "rest", "teacher", "team" + }; + + for (String category : categories) { + Document doc = new Document(); + doc.add(newStringField("category", category, Field.Store.YES)); + writer.addDocument(doc); + } + + IndexReader reader = DirectoryReader.open(writer); + + TermsEnum termsEnum = getOnlyLeafReader(reader).terms("category").iterator(); + + BytesRef target = new BytesRef("request2"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request3"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + writer.close(); + reader.close(); + dir.close(); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java index 6d9a051a66d7..4d0024b93e30 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java @@ -1674,7 +1674,7 @@ public void testLineFileDocs() throws IOException { // Use a FS dir and a non-randomized IWC to not slow down indexing try (Directory dir = newFSDirectory(createTempDir())) { try (LineFileDocs docs = new LineFileDocs(random()); - IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) { + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) { final int numDocs = atLeast(10_000); for (int i = 0; i < numDocs; ++i) { // Only keep the body field, and don't index term vectors on it, we only care about From e7f7188b37bcca278fe938693f2b3315653281f0 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Mon, 15 Apr 2024 10:43:55 +0800 Subject: [PATCH 06/16] Rewind without reload for seekCeil. --- .../lucene90/blocktree/SegmentTermsEnum.java | 51 ++++++++++++++++++- .../lucene99/TestLucene99PostingsFormat.java | 26 ++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 1a594b29a0af..507fdb2dee12 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -643,6 +643,8 @@ public boolean seekExact(BytesRef target) throws IOException { @Override public SeekStatus seekCeil(BytesRef target) throws IOException { + long withOutReloadFp = -1; + int origNextEnt = -1; if (fr.index == null) { throw new IllegalStateException("terms index was not loaded"); @@ -762,8 +764,30 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); // rewind frame ord=" + lastFrame.ord); // } + + // If current frame changed, we can't reduce entCount. since target just less than different + // frame's last term. + boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; - currentFrame.rewind(); + + // Only rewindWithoutReload for non-floor block or first floor block. + // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for + // non-first floor blocks. + if (currentFrame.fp != currentFrame.fpOrig + || currentFrame.entCount == 0 + || currentFrame.nextEnt == -1) { + currentFrame.rewind(); + } else { + // Since target greater than last term, and stay on same frame with last term, we can + // reduce entCount + // to nextEnt, and + // revert it after scanToTerm. + if (currentIsLast) { + origNextEnt = currentFrame.nextEnt; + withOutReloadFp = currentFrame.fp; + } + currentFrame.rewindWithoutReload(); + } } else { // Target is exactly the same as current term assert term.length() == target.length; @@ -833,8 +857,21 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { currentFrame.loadBlock(); + // We still stay on withOutReload frame, reduce entCount to nextEnt. + int origEntCount = -1; + if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { + origEntCount = currentFrame.entCount; + currentFrame.entCount = origNextEnt; + } + // if (DEBUG) System.out.println(" now scanToTerm"); final SeekStatus result = currentFrame.scanToTerm(target, false); + + // Revert entCount to origEntCount. + if (origEntCount != -1) { + currentFrame.entCount = origEntCount; + } + if (result == SeekStatus.END) { term.copyBytes(target); termExists = false; @@ -890,8 +927,20 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { currentFrame.loadBlock(); + // We still stay on withOutReload frame, reduce entCount to nextEnt. + int origEntCount = -1; + if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { + origEntCount = currentFrame.entCount; + currentFrame.entCount = origNextEnt; + } + final SeekStatus result = currentFrame.scanToTerm(target, false); + // Revert entCount to origEntCount. + if (origEntCount != -1) { + currentFrame.entCount = origEntCount; + } + if (result == SeekStatus.END) { term.copyBytes(target); termExists = false; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java index 0e829d21dec2..c5d5dde4b02f 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java @@ -176,6 +176,7 @@ public void testFloorBlocks() throws Exception { TermsEnum termsEnum = getOnlyLeafReader(reader).terms("category").iterator(); + // Test seekExact. BytesRef target = new BytesRef("request2"); assertTrue(termsEnum.seekExact(target)); assertEquals(termsEnum.term(), target); @@ -200,6 +201,31 @@ public void testFloorBlocks() throws Exception { assertTrue(termsEnum.seekExact(target)); assertEquals(termsEnum.term(), target); + // Test seekCeil. + target = new BytesRef("request2"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request3"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + writer.close(); reader.close(); dir.close(); From 90997e31f4189a6c91f14d16a6fbb9802f614b3b Mon Sep 17 00:00:00 2001 From: zhouhui Date: Wed, 24 Apr 2024 10:51:50 +0800 Subject: [PATCH 07/16] Remove stale todo. --- .../lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java | 1 - 1 file changed, 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 507fdb2dee12..4f77ad86f788 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -458,7 +458,6 @@ public boolean seekExact(BytesRef target) throws IOException { origNextEnt = currentFrame.nextEnt; withOutReloadFp = currentFrame.fp; } - // TODO: take this to seekCeil. currentFrame.rewindWithoutReload(); } From 258b0615ba51ddc2bd77a3aa1a1e8f5240576304 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Wed, 24 Apr 2024 16:16:18 +0800 Subject: [PATCH 08/16] Only reduce entCount for leaf block. --- .../lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 4f77ad86f788..778bdeaf5191 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -454,7 +454,7 @@ public boolean seekExact(BytesRef target) throws IOException { // reduce entCount // to nextEnt, and // revert it after scanToTerm. - if (currentIsLast) { + if (currentIsLast && currentFrame.isLeafBlock) { origNextEnt = currentFrame.nextEnt; withOutReloadFp = currentFrame.fp; } @@ -781,7 +781,7 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { // reduce entCount // to nextEnt, and // revert it after scanToTerm. - if (currentIsLast) { + if (currentIsLast && currentFrame.isLeafBlock) { origNextEnt = currentFrame.nextEnt; withOutReloadFp = currentFrame.fp; } From 21b21c9a9a6f11a2f44bfc5af4944886d29aecaa Mon Sep 17 00:00:00 2001 From: zhouhui Date: Thu, 25 Apr 2024 17:08:28 +0800 Subject: [PATCH 09/16] Fix comment. --- .../lucene90/blocktree/SegmentTermsEnum.java | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 778bdeaf5191..87f120a0299d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -437,8 +437,9 @@ public boolean seekExact(BytesRef target) throws IOException { // rewind frame ord=" + lastFrame.ord); // } - // If current frame changed, we can't reduce entCount. since target just less than different - // frame's last term. + // We got lastFrame by comparing target and term, and target less than last seeked term in + // currentFrame. If lastFrame's fp is same with currentFrame's fp, we can reduce entCount to + // nextEnt. boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; @@ -450,10 +451,7 @@ public boolean seekExact(BytesRef target) throws IOException { || currentFrame.nextEnt == -1) { currentFrame.rewind(); } else { - // Since target greater than last term, and stay on same frame with last term, we can - // reduce entCount - // to nextEnt, and - // revert it after scanToTerm. + // Prepare to reduce entCount. if (currentIsLast && currentFrame.isLeafBlock) { origNextEnt = currentFrame.nextEnt; withOutReloadFp = currentFrame.fp; @@ -764,8 +762,9 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { // rewind frame ord=" + lastFrame.ord); // } - // If current frame changed, we can't reduce entCount. since target just less than different - // frame's last term. + // We got lastFrame by comparing target and term, and target less than last seeked term in + // currentFrame. If lastFrame's fp is same with currentFrame's fp, we can reduce entCount to + // nextEnt. boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; @@ -777,10 +776,7 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { || currentFrame.nextEnt == -1) { currentFrame.rewind(); } else { - // Since target greater than last term, and stay on same frame with last term, we can - // reduce entCount - // to nextEnt, and - // revert it after scanToTerm. + // Prepare to reduce entCount. if (currentIsLast && currentFrame.isLeafBlock) { origNextEnt = currentFrame.nextEnt; withOutReloadFp = currentFrame.fp; From a610c4e891f6d41529d0301a92029d9b2a6b2333 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Thu, 25 Apr 2024 17:11:02 +0800 Subject: [PATCH 10/16] Add more test cases. --- .../lucene99/TestLucene99PostingsFormat.java | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java index c5d5dde4b02f..07338171a774 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java @@ -201,6 +201,22 @@ public void testFloorBlocks() throws Exception { assertTrue(termsEnum.seekExact(target)); assertEquals(termsEnum.term(), target); + target = new BytesRef("request4"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("regular"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("rest"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("regular"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + // Test seekCeil. target = new BytesRef("request2"); assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); @@ -226,6 +242,22 @@ public void testFloorBlocks() throws Exception { assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); assertEquals(termsEnum.term(), target); + target = new BytesRef("request4"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("regular"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("rest"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("regular"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + writer.close(); reader.close(); dir.close(); From ab465e03169be96887f3dc29ef5b71b12e2652ac Mon Sep 17 00:00:00 2001 From: zhouhui Date: Fri, 24 May 2024 15:39:35 +0800 Subject: [PATCH 11/16] Typo. --- .../codecs/lucene90/blocktree/SegmentTermsEnum.java | 8 ++++---- .../codecs/lucene90/blocktree/SegmentTermsEnumFrame.java | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 87f120a0299d..aac78cdb70a5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -550,7 +550,7 @@ public boolean seekExact(BytesRef target) throws IOException { final SeekStatus result = currentFrame.scanToTerm(target, true); - // Revert entCount to origEntCount. + // Restore entCount to origEntCount. if (origEntCount != -1) { currentFrame.entCount = origEntCount; } @@ -618,7 +618,7 @@ public boolean seekExact(BytesRef target) throws IOException { final SeekStatus result = currentFrame.scanToTerm(target, true); - // Revert entCount to origEntCount. + // Restore entCount to origEntCount. if (origEntCount != -1) { currentFrame.entCount = origEntCount; } @@ -862,7 +862,7 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { // if (DEBUG) System.out.println(" now scanToTerm"); final SeekStatus result = currentFrame.scanToTerm(target, false); - // Revert entCount to origEntCount. + // Restore entCount to origEntCount. if (origEntCount != -1) { currentFrame.entCount = origEntCount; } @@ -931,7 +931,7 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { final SeekStatus result = currentFrame.scanToTerm(target, false); - // Revert entCount to origEntCount. + // Restore entCount to origEntCount. if (origEntCount != -1) { currentFrame.entCount = origEntCount; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index e478b53c61d7..00772df4d660 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -298,7 +298,7 @@ void rewindWithoutReload() { statsReader.setPosition(0); bytesReader.setPosition(0); - // TODO: Since we only rewind without reload for fist floor(currentFrame.fp == + // TODO: Since we only rewind without reload for first floor(currentFrame.fp == // currentFrame.fpOrig) // So no need to set floorDataReader again? // if (isFloor) { From 73f4f3bfdd0d669df7ae31f3d99e5fa6e4741439 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Mon, 27 May 2024 15:54:24 +0800 Subject: [PATCH 12/16] Fix comment. --- .../lucene90/blocktree/SegmentTermsEnum.java | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index aac78cdb70a5..5191932be36a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -438,17 +438,20 @@ public boolean seekExact(BytesRef target) throws IOException { // } // We got lastFrame by comparing target and term, and target less than last seeked term in - // currentFrame. If lastFrame's fp is same with currentFrame's fp, we can reduce entCount to - // nextEnt. + // currentFrame. If lastFrame's fp is same with currentFrame's fp, and finally we seek the + // same block, we can reduce entCount to nextEnt. boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; // Only rewindWithoutReload for non-floor block or first floor block. // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for // non-first floor blocks. - if (currentFrame.fp != currentFrame.fpOrig - || currentFrame.entCount == 0 - || currentFrame.nextEnt == -1) { + if (currentFrame.fp + != currentFrame.fpOrig // this is a floor multi-block and not the first one. + || currentFrame.nextEnt + == -1) { // this is a block we pushed in stack but haven't loaded its data, or a + // block we just changed its fp and set nextEnt to -1 to prepare reload by + // scanToFloorFrame. currentFrame.rewind(); } else { // Prepare to reduce entCount. @@ -763,17 +766,20 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { // } // We got lastFrame by comparing target and term, and target less than last seeked term in - // currentFrame. If lastFrame's fp is same with currentFrame's fp, we can reduce entCount to - // nextEnt. + // currentFrame. If lastFrame's fp is same with currentFrame's fp, and finally we seek the + // same block, we can reduce entCount to nextEnt. boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; // Only rewindWithoutReload for non-floor block or first floor block. // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for // non-first floor blocks. - if (currentFrame.fp != currentFrame.fpOrig - || currentFrame.entCount == 0 - || currentFrame.nextEnt == -1) { + if (currentFrame.fp + != currentFrame.fpOrig // this is a floor multi-block and not the first one. + || currentFrame.nextEnt + == -1) { // this is a block we pushed in stack but haven't loaded its data, or a + // block we just changed its fp and set nextEnt to -1 to prepare reload by + // scanToFloorFrame. currentFrame.rewind(); } else { // Prepare to reduce entCount. From 51ba012437a275b91b3756f1a4000aed57695434 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Tue, 28 May 2024 10:08:28 +0800 Subject: [PATCH 13/16] Clean code. --- .../lucene90/blocktree/SegmentTermsEnum.java | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 5191932be36a..6c91124a57d7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -443,15 +443,7 @@ public boolean seekExact(BytesRef target) throws IOException { boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; - // Only rewindWithoutReload for non-floor block or first floor block. - // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for - // non-first floor blocks. - if (currentFrame.fp - != currentFrame.fpOrig // this is a floor multi-block and not the first one. - || currentFrame.nextEnt - == -1) { // this is a block we pushed in stack but haven't loaded its data, or a - // block we just changed its fp and set nextEnt to -1 to prepare reload by - // scanToFloorFrame. + if (shouldRewind()) { currentFrame.rewind(); } else { // Prepare to reduce entCount. @@ -771,15 +763,7 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; - // Only rewindWithoutReload for non-floor block or first floor block. - // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for - // non-first floor blocks. - if (currentFrame.fp - != currentFrame.fpOrig // this is a floor multi-block and not the first one. - || currentFrame.nextEnt - == -1) { // this is a block we pushed in stack but haven't loaded its data, or a - // block we just changed its fp and set nextEnt to -1 to prepare reload by - // scanToFloorFrame. + if (shouldRewind()) { currentFrame.rewind(); } else { // Prepare to reduce entCount. @@ -961,6 +945,20 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { } } + // We should rewind a block in two cases: + // 1: currentFrame.fp != currentFrame.fpOrig. This is a floor multi-block and not the first one. + // 2: currentFrame.nextEnt == -1. This is a block we pushed in stack but haven't loaded its data, + // or a block we just changed its fp and set nextEnt to -1 by scanToFloorFrame to prepare reload. + // This means we only rewindWithoutReload for non-floor block or first floor block. + // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for + // non-first floor blocks. + private boolean shouldRewind() { + if (currentFrame.fp != currentFrame.fpOrig || currentFrame.nextEnt == -1) { + return true; + } + return false; + } + @SuppressWarnings("unused") private void printSeekState(PrintStream out) throws IOException { if (currentFrame == staticFrame) { From b0567d7c1f3e7ef080dcfa34f2380d4e8a717768 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Tue, 4 Jun 2024 11:14:45 +0800 Subject: [PATCH 14/16] Clear comment. --- .../codecs/lucene90/blocktree/SegmentTermsEnum.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 6c91124a57d7..36e94454251a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -437,9 +437,9 @@ public boolean seekExact(BytesRef target) throws IOException { // rewind frame ord=" + lastFrame.ord); // } - // We got lastFrame by comparing target and term, and target less than last seeked term in - // currentFrame. If lastFrame's fp is same with currentFrame's fp, and finally we seek the - // same block, we can reduce entCount to nextEnt. + // We got lastFrame by comparing target and last seeked term(at this point, currentFrame is + // last seeked block), and less than last seeked term. If lastFrame's fp is same with + // currentFrame's fp, and finally we seek the same block, we can reduce entCount to nextEnt. boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; @@ -757,9 +757,9 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { // rewind frame ord=" + lastFrame.ord); // } - // We got lastFrame by comparing target and term, and target less than last seeked term in - // currentFrame. If lastFrame's fp is same with currentFrame's fp, and finally we seek the - // same block, we can reduce entCount to nextEnt. + // We got lastFrame by comparing target and last seeked term(at this point, currentFrame is + // last seeked block), and less than last seeked term. If lastFrame's fp is same with + // currentFrame's fp, and finally we seek the same block, we can reduce entCount to nextEnt. boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; From 4b5896c9d15bef703354e3e613bec225e0d9bf75 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Mon, 12 Aug 2024 10:53:26 +0800 Subject: [PATCH 15/16] Revert SegmentTermsEnum, TestLucene99PostingsFormat to resolve conflicts. --- .../lucene90/blocktree/SegmentTermsEnum.java | 130 ++---------------- .../lucene99/TestLucene99PostingsFormat.java | 112 --------------- 2 files changed, 15 insertions(+), 227 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 36e94454251a..d69b5d209e6a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -309,8 +309,6 @@ private boolean setEOF() { @Override public boolean seekExact(BytesRef target) throws IOException { - long withOutReloadFp = -1; - int origNextEnt = -1; if (fr.index == null) { throw new IllegalStateException("terms index was not loaded"); } @@ -377,9 +375,9 @@ public boolean seekExact(BytesRef target) throws IOException { arc = arcs[1 + targetUpto]; assert arc.label() == (target.bytes[target.offset + targetUpto] & 0xFF) : "arc.label=" - + (char) arc.label() - + " targetLabel=" - + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + + (char) arc.label() + + " targetLabel=" + + (char) (target.bytes[target.offset + targetUpto] & 0xFF); outputAccumulator.push(arc.output()); if (arc.isFinal()) { @@ -436,24 +434,8 @@ public boolean seekExact(BytesRef target) throws IOException { // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); // rewind frame ord=" + lastFrame.ord); // } - - // We got lastFrame by comparing target and last seeked term(at this point, currentFrame is - // last seeked block), and less than last seeked term. If lastFrame's fp is same with - // currentFrame's fp, and finally we seek the same block, we can reduce entCount to nextEnt. - boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; - - if (shouldRewind()) { - currentFrame.rewind(); - } else { - // Prepare to reduce entCount. - if (currentIsLast && currentFrame.isLeafBlock) { - origNextEnt = currentFrame.nextEnt; - withOutReloadFp = currentFrame.fp; - } - currentFrame.rewindWithoutReload(); - } - + currentFrame.rewind(); } else { // Target is exactly the same as current term assert term.length() == target.length; @@ -536,20 +518,7 @@ public boolean seekExact(BytesRef target) throws IOException { currentFrame.loadBlock(); - // We still stay on withOutReload frame, reduce entCount to nextEnt. - int origEntCount = -1; - if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { - origEntCount = currentFrame.entCount; - currentFrame.entCount = origNextEnt; - } - final SeekStatus result = currentFrame.scanToTerm(target, true); - - // Restore entCount to origEntCount. - if (origEntCount != -1) { - currentFrame.entCount = origEntCount; - } - if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); @@ -604,20 +573,7 @@ public boolean seekExact(BytesRef target) throws IOException { currentFrame.loadBlock(); - // We still stay on withOutReload frame, reduce entCount to nextEnt. - int origEntCount = -1; - if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { - origEntCount = currentFrame.entCount; - currentFrame.entCount = origNextEnt; - } - final SeekStatus result = currentFrame.scanToTerm(target, true); - - // Restore entCount to origEntCount. - if (origEntCount != -1) { - currentFrame.entCount = origEntCount; - } - if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); @@ -635,8 +591,6 @@ public boolean seekExact(BytesRef target) throws IOException { @Override public SeekStatus seekCeil(BytesRef target) throws IOException { - long withOutReloadFp = -1; - int origNextEnt = -1; if (fr.index == null) { throw new IllegalStateException("terms index was not loaded"); @@ -700,9 +654,9 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { arc = arcs[1 + targetUpto]; assert arc.label() == (target.bytes[target.offset + targetUpto] & 0xFF) : "arc.label=" - + (char) arc.label() - + " targetLabel=" - + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + + (char) arc.label() + + " targetLabel=" + + (char) (target.bytes[target.offset + targetUpto] & 0xFF); outputAccumulator.push(arc.output()); if (arc.isFinal()) { @@ -756,23 +710,8 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); // rewind frame ord=" + lastFrame.ord); // } - - // We got lastFrame by comparing target and last seeked term(at this point, currentFrame is - // last seeked block), and less than last seeked term. If lastFrame's fp is same with - // currentFrame's fp, and finally we seek the same block, we can reduce entCount to nextEnt. - boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; - - if (shouldRewind()) { - currentFrame.rewind(); - } else { - // Prepare to reduce entCount. - if (currentIsLast && currentFrame.isLeafBlock) { - origNextEnt = currentFrame.nextEnt; - withOutReloadFp = currentFrame.fp; - } - currentFrame.rewindWithoutReload(); - } + currentFrame.rewind(); } else { // Target is exactly the same as current term assert term.length() == target.length; @@ -842,21 +781,8 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { currentFrame.loadBlock(); - // We still stay on withOutReload frame, reduce entCount to nextEnt. - int origEntCount = -1; - if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { - origEntCount = currentFrame.entCount; - currentFrame.entCount = origNextEnt; - } - // if (DEBUG) System.out.println(" now scanToTerm"); final SeekStatus result = currentFrame.scanToTerm(target, false); - - // Restore entCount to origEntCount. - if (origEntCount != -1) { - currentFrame.entCount = origEntCount; - } - if (result == SeekStatus.END) { term.copyBytes(target); termExists = false; @@ -912,20 +838,8 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { currentFrame.loadBlock(); - // We still stay on withOutReload frame, reduce entCount to nextEnt. - int origEntCount = -1; - if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { - origEntCount = currentFrame.entCount; - currentFrame.entCount = origNextEnt; - } - final SeekStatus result = currentFrame.scanToTerm(target, false); - // Restore entCount to origEntCount. - if (origEntCount != -1) { - currentFrame.entCount = origEntCount; - } - if (result == SeekStatus.END) { term.copyBytes(target); termExists = false; @@ -945,20 +859,6 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { } } - // We should rewind a block in two cases: - // 1: currentFrame.fp != currentFrame.fpOrig. This is a floor multi-block and not the first one. - // 2: currentFrame.nextEnt == -1. This is a block we pushed in stack but haven't loaded its data, - // or a block we just changed its fp and set nextEnt to -1 by scanToFloorFrame to prepare reload. - // This means we only rewindWithoutReload for non-floor block or first floor block. - // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for - // non-first floor blocks. - private boolean shouldRewind() { - if (currentFrame.fp != currentFrame.fpOrig || currentFrame.nextEnt == -1) { - return true; - } - return false; - } - @SuppressWarnings("unused") private void printSeekState(PrintStream out) throws IOException { if (currentFrame == staticFrame) { @@ -991,8 +891,8 @@ private void printSeekState(PrintStream out) throws IOException { + f.isFloor + " code=" + ((f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) - + (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) - + (f.isFloor ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + + (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) + + (f.isFloor ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" @@ -1021,8 +921,8 @@ private void printSeekState(PrintStream out) throws IOException { + f.isFloor + " code=" + ((f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) - + (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) - + (f.isFloor ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + + (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) + + (f.isFloor ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" @@ -1311,9 +1211,9 @@ void prepareRead() { void setFloorData(ByteArrayDataInput floorData) { assert outputIndex == num - 1 : "floor data should be stored in last arc, get outputIndex: " - + outputIndex - + ", num: " - + num; + + outputIndex + + ", num: " + + num; BytesRef output = outputs[outputIndex]; floorData.reset(output.bytes, output.offset + index, output.length - index); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java index 07338171a774..341805e8a3e4 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java @@ -17,8 +17,6 @@ package org.apache.lucene.codecs.lucene99; import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts; -import static org.apache.lucene.tests.util.TestUtil.alwaysPostingsFormat; -import static org.apache.lucene.tests.util.TestUtil.getDefaultPostingsFormat; import java.io.IOException; import java.util.Arrays; @@ -26,7 +24,6 @@ import java.util.List; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; -import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList; @@ -153,113 +150,4 @@ protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception { assertEquals(TermsEnum.SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("10004a"))); assertEquals(termsEnum.term(), new BytesRef("100051")); } - - public void testFloorBlocks() throws Exception { - Directory dir = newDirectory(); - // Set minTermBlockSize to 2, maxTermBlockSize to 3, to generate deep subBlock. - PostingsFormat postingsFormat = getDefaultPostingsFormat(2, 3); - - IndexWriter writer = - new IndexWriter(dir, newIndexWriterConfig().setCodec(alwaysPostingsFormat(postingsFormat))); - String[] categories = - new String[] { - "regular", "request1", "request2", "request3", "request4", "rest", "teacher", "team" - }; - - for (String category : categories) { - Document doc = new Document(); - doc.add(newStringField("category", category, Field.Store.YES)); - writer.addDocument(doc); - } - - IndexReader reader = DirectoryReader.open(writer); - - TermsEnum termsEnum = getOnlyLeafReader(reader).terms("category").iterator(); - - // Test seekExact. - BytesRef target = new BytesRef("request2"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request1"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request4"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request3"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request4"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request1"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request4"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("regular"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("rest"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("regular"); - assertTrue(termsEnum.seekExact(target)); - assertEquals(termsEnum.term(), target); - - // Test seekCeil. - target = new BytesRef("request2"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request1"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request4"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request3"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request4"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request1"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("request4"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("regular"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("rest"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - target = new BytesRef("regular"); - assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); - assertEquals(termsEnum.term(), target); - - writer.close(); - reader.close(); - dir.close(); - } } From 77af39f176d8ad6f2f3d2669a560223993e1fdd8 Mon Sep 17 00:00:00 2001 From: zhouhui Date: Mon, 12 Aug 2024 14:44:08 +0800 Subject: [PATCH 16/16] Resolve conflicts. --- .../lucene90/blocktree/SegmentTermsEnum.java | 136 ++++++++++++++++-- .../TestLucene912PostingsFormat.java | 119 ++++++++++++++- 2 files changed, 238 insertions(+), 17 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 63c795a5bee5..1a8cadae9266 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -309,6 +309,10 @@ private boolean setEOF() { } private IOBooleanSupplier prepareSeekExact(BytesRef target, boolean prefetch) throws IOException { + // Record rewind without reload block's state. + long withOutReloadFp = -1; + int origNextEnt = -1; + if (fr.index == null) { throw new IllegalStateException("terms index was not loaded"); } @@ -375,9 +379,9 @@ private IOBooleanSupplier prepareSeekExact(BytesRef target, boolean prefetch) th arc = arcs[1 + targetUpto]; assert arc.label() == (target.bytes[target.offset + targetUpto] & 0xFF) : "arc.label=" - + (char) arc.label() - + " targetLabel=" - + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + + (char) arc.label() + + " targetLabel=" + + (char) (target.bytes[target.offset + targetUpto] & 0xFF); outputAccumulator.push(arc.output()); if (arc.isFinal()) { @@ -421,8 +425,23 @@ private IOBooleanSupplier prepareSeekExact(BytesRef target, boolean prefetch) th // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); // rewind frame ord=" + lastFrame.ord); // } + + // We got lastFrame by comparing target and last seeked term(at this point, currentFrame is + // last seeked block), and less than last seeked term. If lastFrame's fp is same with + // currentFrame's fp, and finally we seek the same block, we can reduce entCount to nextEnt. + boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; - currentFrame.rewind(); + + if (shouldRewind()) { + currentFrame.rewind(); + } else { + // Prepare to reduce entCount. + if (currentIsLast && currentFrame.isLeafBlock) { + origNextEnt = currentFrame.nextEnt; + withOutReloadFp = currentFrame.fp; + } + currentFrame.rewindWithoutReload(); + } } else { // Target is exactly the same as current term assert term.length() == target.length; @@ -507,10 +526,25 @@ private IOBooleanSupplier prepareSeekExact(BytesRef target, boolean prefetch) th currentFrame.prefetchBlock(); } + long finalWithOutReloadFp = withOutReloadFp; + int finalOrigNextEnt = origNextEnt; return () -> { currentFrame.loadBlock(); + // If we still stay on withOutReload frame, reduce entCount to nextEnt. + int origEntCount = -1; + if (currentFrame.fp == finalWithOutReloadFp && finalOrigNextEnt != 0) { + origEntCount = currentFrame.entCount; + currentFrame.entCount = finalOrigNextEnt; + } + final SeekStatus result = currentFrame.scanToTerm(target, true); + + // Restore entCount to origEntCount. + if (origEntCount != -1) { + currentFrame.entCount = origEntCount; + } + if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); @@ -568,10 +602,25 @@ private IOBooleanSupplier prepareSeekExact(BytesRef target, boolean prefetch) th currentFrame.prefetchBlock(); } + long finalWithOutReloadFp1 = withOutReloadFp; + int finalOrigNextEnt1 = origNextEnt; return () -> { currentFrame.loadBlock(); + // If we still stay on withOutReload frame, reduce entCount to nextEnt. + int origEntCount = -1; + if (currentFrame.fp == finalWithOutReloadFp1 && finalOrigNextEnt1 != 0) { + origEntCount = currentFrame.entCount; + currentFrame.entCount = finalOrigNextEnt1; + } + final SeekStatus result = currentFrame.scanToTerm(target, true); + + // Restore entCount to origEntCount. + if (origEntCount != -1) { + currentFrame.entCount = origEntCount; + } + if (result == SeekStatus.FOUND) { // if (DEBUG) { // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); @@ -601,6 +650,9 @@ public boolean seekExact(BytesRef target) throws IOException { @Override public SeekStatus seekCeil(BytesRef target) throws IOException { + // Record rewind without reload block's state. + long withOutReloadFp = -1; + int origNextEnt = -1; if (fr.index == null) { throw new IllegalStateException("terms index was not loaded"); @@ -664,9 +716,9 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { arc = arcs[1 + targetUpto]; assert arc.label() == (target.bytes[target.offset + targetUpto] & 0xFF) : "arc.label=" - + (char) arc.label() - + " targetLabel=" - + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + + (char) arc.label() + + " targetLabel=" + + (char) (target.bytes[target.offset + targetUpto] & 0xFF); outputAccumulator.push(arc.output()); if (arc.isFinal()) { @@ -708,8 +760,23 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); // rewind frame ord=" + lastFrame.ord); // } + + // We got lastFrame by comparing target and last seeked term(at this point, currentFrame is + // last seeked block), and less than last seeked term. If lastFrame's fp is same with + // currentFrame's fp, and finally we seek the same block, we can reduce entCount to nextEnt. + boolean currentIsLast = currentFrame.fp == lastFrame.fp; currentFrame = lastFrame; - currentFrame.rewind(); + + if (shouldRewind()) { + currentFrame.rewind(); + } else { + // Prepare to reduce entCount. + if (currentIsLast && currentFrame.isLeafBlock) { + origNextEnt = currentFrame.nextEnt; + withOutReloadFp = currentFrame.fp; + } + currentFrame.rewindWithoutReload(); + } } else { // Target is exactly the same as current term assert term.length() == target.length; @@ -779,8 +846,21 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { currentFrame.loadBlock(); + // If we still stay on withOutReload frame, reduce entCount to nextEnt. + int origEntCount = -1; + if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { + origEntCount = currentFrame.entCount; + currentFrame.entCount = origNextEnt; + } + // if (DEBUG) System.out.println(" now scanToTerm"); final SeekStatus result = currentFrame.scanToTerm(target, false); + + // Restore entCount to origEntCount. + if (origEntCount != -1) { + currentFrame.entCount = origEntCount; + } + if (result == SeekStatus.END) { term.copyBytes(target); termExists = false; @@ -836,8 +916,20 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { currentFrame.loadBlock(); + // If we still stay on withOutReload frame, reduce entCount to nextEnt. + int origEntCount = -1; + if (currentFrame.fp == withOutReloadFp && origNextEnt != 0) { + origEntCount = currentFrame.entCount; + currentFrame.entCount = origNextEnt; + } + final SeekStatus result = currentFrame.scanToTerm(target, false); + // Restore entCount to origEntCount. + if (origEntCount != -1) { + currentFrame.entCount = origEntCount; + } + if (result == SeekStatus.END) { term.copyBytes(target); termExists = false; @@ -857,6 +949,20 @@ public SeekStatus seekCeil(BytesRef target) throws IOException { } } + // We should rewind a block in two cases: + // 1: currentFrame.fp != currentFrame.fpOrig. This is a floor multi-block and not the first one. + // 2: currentFrame.nextEnt == -1. This is a block we pushed in stack but haven't loaded its data, + // or a block we just changed its fp and set nextEnt to -1 by scanToFloorFrame to prepare reload. + // This means we only rewindWithoutReload for non-floor block or first floor block. + // TODO: We need currentFrame's first entry to judge whether we can rewindWithoutReload for + // non-first floor blocks. + private boolean shouldRewind() { + if (currentFrame.fp != currentFrame.fpOrig || currentFrame.nextEnt == -1) { + return true; + } + return false; + } + @SuppressWarnings("unused") private void printSeekState(PrintStream out) throws IOException { if (currentFrame == staticFrame) { @@ -889,8 +995,8 @@ private void printSeekState(PrintStream out) throws IOException { + f.isFloor + " code=" + ((f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) - + (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) - + (f.isFloor ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + + (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) + + (f.isFloor ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" @@ -919,8 +1025,8 @@ private void printSeekState(PrintStream out) throws IOException { + f.isFloor + " code=" + ((f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) - + (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) - + (f.isFloor ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + + (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) + + (f.isFloor ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" @@ -1211,9 +1317,9 @@ void prepareRead() { void setFloorData(ByteArrayDataInput floorData) { assert outputIndex == num - 1 : "floor data should be stored in last arc, get outputIndex: " - + outputIndex - + ", num: " - + num; + + outputIndex + + ", num: " + + num; BytesRef output = outputs[outputIndex]; floorData.reset(output.bytes, output.offset + index, output.length - index); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestLucene912PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestLucene912PostingsFormat.java index 1b8d0618c601..6c82e0b67da7 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestLucene912PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestLucene912PostingsFormat.java @@ -16,12 +16,16 @@ */ package org.apache.lucene.codecs.lucene912; +import static org.apache.lucene.tests.util.TestUtil.alwaysPostingsFormat; +import static org.apache.lucene.tests.util.TestUtil.getDefaultPostingsFormat; + import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.List; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader.MutableImpactList; @@ -29,8 +33,10 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Impact; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.Directory; @@ -39,13 +45,13 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BasePostingsFormatTestCase; -import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; public class TestLucene912PostingsFormat extends BasePostingsFormatTestCase { @Override protected Codec getCodec() { - return TestUtil.alwaysPostingsFormat(new Lucene912PostingsFormat()); + return alwaysPostingsFormat(new Lucene912PostingsFormat()); } public void testVInt15() throws IOException { @@ -154,4 +160,113 @@ private void doTestImpactSerialization(List impacts) throws IOException } } } + + public void testBackwardSeek() throws Exception { + Directory dir = newDirectory(); + // Set minTermBlockSize to 2, maxTermBlockSize to 3, to generate deep subBlock. + PostingsFormat postingsFormat = getDefaultPostingsFormat(2, 3); + + IndexWriter writer = + new IndexWriter(dir, newIndexWriterConfig().setCodec(alwaysPostingsFormat(postingsFormat))); + String[] categories = + new String[] { + "regular", "request1", "request2", "request3", "request4", "rest", "teacher", "team" + }; + + for (String category : categories) { + Document doc = new Document(); + doc.add(newStringField("category", category, Field.Store.YES)); + writer.addDocument(doc); + } + + IndexReader reader = DirectoryReader.open(writer); + + TermsEnum termsEnum = getOnlyLeafReader(reader).terms("category").iterator(); + + // Test seekExact. + BytesRef target = new BytesRef("request2"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request3"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("regular"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("rest"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("regular"); + assertTrue(termsEnum.seekExact(target)); + assertEquals(termsEnum.term(), target); + + // Test seekCeil. + target = new BytesRef("request2"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request3"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request1"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("request4"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("regular"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("rest"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + target = new BytesRef("regular"); + assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(termsEnum.term(), target); + + writer.close(); + reader.close(); + dir.close(); + } }