From a5ffbeed6da25e6f931276d5b881feb933b3e8f8 Mon Sep 17 00:00:00 2001
From: waddyano
Date: Mon, 21 Feb 2022 16:37:37 -0800
Subject: [PATCH] First go at shrinking segment sizes - all by adjusting how locations are stored

Position, start and end offsets are now stored as deltas rather than absolute
values: position is a delta from the previous location's position, start is a
delta from the previous location's end, and end is a delta from start - in
other words the length of the original token - though in many cases it is not
stored at all, as described below.

Every block of locations associated with a term now begins with an integer
holding a set of flags, which in practice occupies a single byte in the file.
There is currently one flag, indicating whether the field of each location is
stored explicitly. When it is not stored, every location in the block belongs
to the same field as the dictionary the postings list came from.

The position delta is stored shifted left by one bit, with the bottom bit
indicating whether the token length (the end delta) equals the length of the
term that owns the locations. When the lengths match, the bit is set and the
end field is omitted. Illustrative encode/decode sketches of this layout
follow the diff below.
---
 cmd/ice/cmd/root.go |  2 +-
 dict.go             | 14 ++++---
 footer.go           |  2 +-
 merge.go            | 97 +++++++++++++++++++++++++++++++++++++++------
 new.go              | 84 ++++++++++++++++++++++++++++++++++++---
 posting.go          | 78 +++++++++++++++++++++++++++++++-----
 segment.go          | 12 +++++-
 7 files changed, 251 insertions(+), 38 deletions(-)

diff --git a/cmd/ice/cmd/root.go b/cmd/ice/cmd/root.go
index 42e3723..0580552 100644
--- a/cmd/ice/cmd/root.go
+++ b/cmd/ice/cmd/root.go
@@ -110,7 +110,7 @@ func openFromFile(path string) (segment.Segment, closeFunc, error) {
 	seg, err := ice.Load(data)
 	if err != nil {
 		_ = closeF()
-		return nil, noCloseFunc, fmt.Errorf("error loading segment: %v", err)
+		return nil, noCloseFunc, fmt.Errorf("error loading segment %s: %v", path, err)
 	}
 
 	return seg, closeF, nil
diff --git a/dict.go b/dict.go
index cb4676a..5fde5c9 100644
--- a/dict.go
+++ b/dict.go
@@ -50,7 +50,7 @@ func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *Posti
 		if rv == nil || rv == emptyPostingsList {
 			return emptyPostingsList, nil
 		}
-		return d.postingsListInit(rv, except), nil
+		return d.postingsListInit(rv, term, except), nil
 	}
 
 	postingsOffset, exists, err := d.fstReader.Get(term)
@@ -61,14 +61,14 @@ func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *Posti
 		if rv == nil || rv == emptyPostingsList {
 			return emptyPostingsList, nil
 		}
-		return d.postingsListInit(rv, except), nil
+		return d.postingsListInit(rv, term, except), nil
 	}
 
-	return d.postingsListFromOffset(postingsOffset, except, rv)
+	return d.postingsListFromOffset(postingsOffset, term, except, rv)
 }
 
-func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
-	rv = d.postingsListInit(rv, except)
+func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
+	rv = d.postingsListInit(rv, term, except)
 
 	err := rv.read(postingsOffset, d)
 	if err != nil {
@@ -78,7 +78,7 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari
 	return rv, nil
 }
 
-func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
+func (d *Dictionary) postingsListInit(rv *PostingsList, term []byte, except *roaring.Bitmap) *PostingsList {
 	if rv == nil || rv == emptyPostingsList {
 		rv = &PostingsList{}
 	} else {
@@ -92,6 +92,8 @@ func (d *Dictionary)
postingsListInit(rv *PostingsList, except *roaring.Bitmap) rv.postings = postings } rv.sb = d.sb + rv.dict = d + rv.term = term rv.except = except return rv } diff --git a/footer.go b/footer.go index 3018065..fcd7643 100644 --- a/footer.go +++ b/footer.go @@ -77,7 +77,7 @@ func parseFooter(data *segment.Data) (*footer, error) { return nil, err } rv.version = binary.BigEndian.Uint32(verData) - if rv.version != Version { + if rv.version > Version { return nil, fmt.Errorf("unsupported version %d", rv.version) } diff --git a/merge.go b/merge.go index a65264c..e658cbf 100644 --- a/merge.go +++ b/merge.go @@ -281,7 +281,7 @@ func persistMergedRestField(segments []*Segment, dropsIn []*roaring.Bitmap, fiel } if !bytes.Equal(prevTerm, term) || prevTerm == nil { - err = prepareNewTerm(newSegDocCount, chunkMode, tfEncoder, locEncoder, fieldFreqs, fieldID, enumerator, + err = prepareNewTerm(term, newSegDocCount, chunkMode, tfEncoder, locEncoder, fieldFreqs, fieldID, enumerator, dicts, drops) if err != nil { return err @@ -289,7 +289,7 @@ func persistMergedRestField(segments []*Segment, dropsIn []*roaring.Bitmap, fiel } postings, err = dicts[itrI].postingsListFromOffset( - postingsOffset, drops[itrI], postings) + postingsOffset, term, drops[itrI], postings) if err != nil { return err } @@ -301,7 +301,7 @@ func persistMergedRestField(segments []*Segment, dropsIn []*roaring.Bitmap, fiel // can no longer optimize by copying, since chunk factor could have changed lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( - fieldsMap, postItr, newDocNums[itrI], newRoaring, + term, fieldsMap, fieldID, postItr, newDocNums[itrI], newRoaring, tfEncoder, locEncoder, bufLoc, fieldDocTracking) if err != nil { @@ -427,7 +427,7 @@ func buildMergedDocVals(newSegDocCount uint64, w *countHashWriter, closeCh chan return nil } -func prepareNewTerm(newSegDocCount uint64, chunkMode uint32, tfEncoder, locEncoder *chunkedIntCoder, +func prepareNewTerm(term []byte, newSegDocCount uint64, chunkMode uint32, tfEncoder, locEncoder *chunkedIntCoder, fieldFreqs map[uint16]uint64, fieldID int, enumerator *enumerator, dicts []*Dictionary, drops []*roaring.Bitmap) error { var err error @@ -437,7 +437,7 @@ func prepareNewTerm(newSegDocCount uint64, chunkMode uint32, tfEncoder, locEncod lowItrIdxs, lowItrVals := enumerator.GetLowIdxsAndValues() for i, idx := range lowItrIdxs { var pl *PostingsList - pl, err = dicts[idx].postingsListFromOffset(lowItrVals[i], drops[idx], nil) + pl, err = dicts[idx].postingsListFromOffset(lowItrVals[i], term, drops[idx], nil) if err != nil { return err } @@ -556,7 +556,7 @@ func setupActiveForField(segments []*Segment, dropsIn []*roaring.Bitmap, newDocN const numUintsLocation = 4 -func mergeTermFreqNormLocs(fieldsMap map[string]uint16, postItr *PostingsIterator, +func mergeTermFreqNormLocs(term []byte, fieldsMap map[string]uint16, fieldID int, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, bufLoc []uint64, docTracking *roaring.Bitmap) ( lastDocNum, lastFreq, lastNorm uint64, bufLocOut []uint64, err error) { @@ -583,9 +583,36 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, postItr *PostingsIterato if len(locs) > 0 { numBytesLocs := 0 + numFieldBytes := 0 + var locFlags LocFlags + var lastEnd uint64 + var lastPos uint64 for _, loc := range locs { - numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), - uint64(loc.Pos()), uint64(loc.Start()), uint64(loc.End())) + locFieldID := fieldsMap[loc.Field()] - 1 + 
if Version == 1 { + numBytesLocs += totalUvarintBytes(uint64(locFieldID), + uint64(loc.Pos()), uint64(loc.Start()), uint64(loc.End())) + } else { + if locFieldID != uint16(fieldID) { + locFlags |= LocFlagStoredField + } + numBytesLocs += numUvarintBytes((uint64(loc.Pos()) - lastPos) << 1) + numBytesLocs += numUvarintBytes(uint64(loc.Start()) - lastEnd) + if len(term) != loc.End()-loc.Start() { + numBytesLocs += numUvarintBytes(uint64(loc.End() - loc.Start())) + } + numFieldBytes += numUvarintBytes(uint64(locFieldID)) + lastEnd = uint64(loc.End()) + lastPos = uint64(loc.Pos()) + } + } + + if Version == 2 { + if locFlags&LocFlagStoredField != 0 { + numBytesLocs += numFieldBytes + } + + numBytesLocs += numUvarintBytes(uint64(locFlags)) } err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) @@ -593,15 +620,59 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, postItr *PostingsIterato return 0, 0, 0, nil, err } + if Version == 2 { + err = locEncoder.Add(hitNewDocNum, uint64(locFlags)) + if err != nil { + return 0, 0, 0, nil, err + } + } + + lastEnd = 0 + lastPos = 0 for _, loc := range locs { if cap(bufLoc) < numUintsLocation { bufLoc = make([]uint64, 0, numUintsLocation) } - args := bufLoc[0:4] - args[0] = uint64(fieldsMap[loc.Field()] - 1) - args[1] = uint64(loc.Pos()) - args[2] = uint64(loc.Start()) - args[3] = uint64(loc.End()) + var args []uint64 + if Version == 2 { + var endNotStoredFlag uint64 + if len(term) == loc.End()-loc.Start() { + endNotStoredFlag = 1 + } + if (locFlags & LocFlagStoredField) == 0 { + if endNotStoredFlag != 0 { + args = bufLoc[0:2] + args[0] = (uint64(loc.Pos())-lastPos)<<1 | endNotStoredFlag + args[1] = uint64(loc.Start()) - lastEnd + } else { + args = bufLoc[0:3] + args[0] = (uint64(loc.Pos()) - lastPos) << 1 + args[1] = uint64(loc.Start()) - lastEnd + args[2] = uint64(loc.End() - loc.Start()) + } + } else { + if endNotStoredFlag != 0 { + args = bufLoc[0:3] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = (uint64(loc.Pos())-lastPos)<<1 | endNotStoredFlag + args[2] = uint64(loc.Start()) - lastEnd + } else { + args = bufLoc[0:4] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = (uint64(loc.Pos()) - lastPos) << 1 + args[2] = uint64(loc.Start()) - lastEnd + args[3] = uint64(loc.End() - loc.Start()) + } + } + lastEnd = uint64(loc.End()) + lastPos = uint64(loc.Pos()) + } else { + args = bufLoc[0:4] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = uint64(loc.Pos()) + args[2] = uint64(loc.Start()) + args[3] = uint64(loc.End()) + } err = locEncoder.Add(hitNewDocNum, args...) 
if err != nil { return 0, 0, 0, nil, err diff --git a/new.go b/new.go index 5af2b24..900aeee 100644 --- a/new.go +++ b/new.go @@ -17,6 +17,7 @@ package ice import ( "bytes" "encoding/binary" + "log" "math" "sort" "sync" @@ -704,7 +705,7 @@ func (s *interim) writeDictsField(docTermMap [][]byte, fieldID int, terms []stri dict := s.Dicts[fieldID] for _, term := range terms { // terms are already sorted - err2 := s.writeDictsTermField(docTermMap, dict, term, tfEncoder, locEncoder, buf) + err2 := s.writeDictsTermField(docTermMap, dict, fieldID, term, tfEncoder, locEncoder, buf) if err2 != nil { return err2 } @@ -779,7 +780,7 @@ func (s *interim) writeDictsField(docTermMap [][]byte, fieldID int, terms []stri return nil } -func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint64, term string, tfEncoder, +func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint64, fieldID int, term string, tfEncoder, locEncoder *chunkedIntCoder, buf []byte) error { pid := dict[term] - 1 @@ -813,9 +814,39 @@ func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint6 if freqNorm.numLocs > 0 { numBytesLocs := 0 + numFieldBytes := 0 + var lastEnd uint64 + var lastPos uint64 + var locFlags LocFlags for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { - numBytesLocs += totalUvarintBytes( - uint64(loc.fieldID), loc.pos, loc.start, loc.end) + if loc.end < loc.start { + log.Fatal("backwards!!!\n") + } + if Version == 1 { + numBytesLocs += totalUvarintBytes( + uint64(loc.fieldID), loc.pos, loc.start, loc.end) + } else { + if loc.fieldID != uint16(fieldID) { + locFlags |= LocFlagStoredField + } + + numBytesLocs += numUvarintBytes((loc.pos - lastPos) << 1) + numBytesLocs += numUvarintBytes(loc.start - lastEnd) + if uint64(len(term)) != loc.end-loc.start { + numBytesLocs += numUvarintBytes(loc.end - loc.start) + } + numFieldBytes += numUvarintBytes(uint64(loc.fieldID)) + lastEnd = loc.end + lastPos = loc.pos + } + } + + if Version == 2 { + if locFlags&LocFlagStoredField != 0 { + numBytesLocs += numFieldBytes + } + + numBytesLocs += numUvarintBytes(uint64(locFlags)) } err = locEncoder.Add(docNum, uint64(numBytesLocs)) @@ -823,9 +854,50 @@ func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint6 return err } + if Version == 2 { + err = locEncoder.Add(docNum, uint64(locFlags)) + if err != nil { + return err + } + } + + lastEnd = 0 + lastPos = 0 for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { - err = locEncoder.Add(docNum, - uint64(loc.fieldID), loc.pos, loc.start, loc.end) + if loc.start < lastEnd { + log.Fatal("loc backwards\n") + } + + var endNotStoredFlag uint64 + if uint64(len(term)) == loc.end-loc.start { + endNotStoredFlag = 1 + } + + if Version == 1 { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), loc.pos, loc.start, loc.end) + } else { + posDelta := loc.pos - lastPos + if (locFlags & LocFlagStoredField) != 0 { + if endNotStoredFlag != 0 { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), (posDelta<<1)|endNotStoredFlag, loc.start-lastEnd) + } else { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), posDelta<<1, loc.start-lastEnd, loc.end-loc.start) + } + } else { + if endNotStoredFlag != 0 { + err = locEncoder.Add(docNum, + (posDelta<<1)|endNotStoredFlag, loc.start-lastEnd) + } else { + err = locEncoder.Add(docNum, + posDelta<<1, loc.start-lastEnd, loc.end-loc.start) + } + } + } + lastEnd = loc.end + lastPos = loc.pos if err != nil { return err } diff --git a/posting.go b/posting.go 
index c0dc2d4..616e5c5 100644 --- a/posting.go +++ b/posting.go @@ -23,6 +23,12 @@ import ( segment "github.com/blugelabs/bluge_segment_api" ) +type LocFlags uint8 + +const ( + LocFlagStoredField LocFlags = 1 << iota +) + // FST or vellum value (uint64) encoding is determined by the top two // highest-order or most significant bits... // @@ -78,12 +84,15 @@ const docNum1HitFinished = math.MaxUint64 // PostingsList is an in-memory representation of a postings list type PostingsList struct { sb *Segment + dict *Dictionary postingsOffset uint64 freqOffset uint64 locOffset uint64 postings *roaring.Bitmap except *roaring.Bitmap + term []byte + // when normBits1Hit != 0, then this postings list came from a // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply docNum1Hit uint64 @@ -410,32 +419,67 @@ func decodeFreqHasLocs(freqHasLocs uint64) (int, bool) { // readLocation processes all the integers on the stream representing a single // location. -func (i *PostingsIterator) readLocation(l *Location) error { +func (i *PostingsIterator) readLocation(l *Location, locFlags LocFlags, lastEnd uint64, lastPos uint64) error { // read off field - fieldID, err := i.locReader.readUvarint() - if err != nil { - return fmt.Errorf("error reading location field: %v", err) + var savedFieldId uint64 + version := i.postings.sb.Version() + if version == 1 || (locFlags&LocFlagStoredField) != 0 { + var err error + savedFieldId, err = i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location field: %v", err) + } } + // read off pos pos, err := i.locReader.readUvarint() if err != nil { return fmt.Errorf("error reading location pos: %v", err) } + + pos += lastPos + + endNotStored := false + if version == 2 { + if (pos & 1) != 0 { + endNotStored = true + } + pos >>= 1 + } + // read off start start, err := i.locReader.readUvarint() if err != nil { return fmt.Errorf("error reading location start: %v", err) } + // read off end - end, err := i.locReader.readUvarint() - if err != nil { - return fmt.Errorf("error reading location end: %v", err) + var end uint64 + if !endNotStored { + var err error + end, err = i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location end: %v", err) + } } - l.field = i.postings.sb.fieldsInv[fieldID] + if version == 1 || (locFlags&LocFlagStoredField) != 0 { + l.field = i.postings.sb.fieldsInv[savedFieldId] + } else { + l.field = i.postings.sb.fieldsInv[i.postings.dict.fieldID] + } l.pos = int(pos) l.start = int(start) - l.end = int(end) + if version == 2 { + l.start = int(start + lastEnd) + if endNotStored { + l.end = l.start + len(i.postings.term) + } else { + l.end = int(end) + } + } else { + l.end = int(end) + } return nil } @@ -498,14 +542,28 @@ func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, err return nil, fmt.Errorf("error reading location numLocsBytes: %v", err) } + var locFlags LocFlags + if i.postings.sb.Version() == 2 { + locFlags64, err := i.locReader.readUvarint() + if err != nil { + return nil, fmt.Errorf("error reading location locFlags: %v", err) + } + locFlags = LocFlags(locFlags64) + numLocsBytes -= uint64(numUvarintBytes(locFlags64)) + } + j := 0 startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader + var lastEnd uint64 + var lastPos uint64 for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) { - err := i.readLocation(&i.nextLocs[j]) + err := i.readLocation(&i.nextLocs[j], locFlags, lastEnd, lastPos) if err != nil { return nil, err } rv.locs = 
append(rv.locs, &i.nextLocs[j]) + lastEnd = uint64(i.nextLocs[j].end) + lastPos = uint64(i.nextLocs[j].pos) j++ } } diff --git a/segment.go b/segment.go index 5b77791..6cf6a91 100644 --- a/segment.go +++ b/segment.go @@ -20,6 +20,8 @@ import ( "encoding/binary" "fmt" "io" + "os" + "strconv" "sync" "github.com/RoaringBitmap/roaring" @@ -28,7 +30,15 @@ import ( "github.com/golang/snappy" ) -const Version uint32 = 1 +func getVersion() uint32 { + v, e := strconv.ParseUint(os.Getenv("ICEV"), 10, 32) + if e != nil { + return 2 + } + return uint32(v) +} + +var Version uint32 = getVersion() const Type string = "ice"
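
Notes with illustrative sketches (not part of the patch):

The version 2 location layout described in the commit message can be summarised
with the following rough sketch. The helper name encodeLocBlockV2, the loc type
and the concrete numbers are invented for illustration; the real writers are the
loops in new.go's writeDictsTermField and merge.go's mergeTermFreqNormLocs,
which emit the same integers through a chunkedIntCoder rather than building a
byte slice directly.

package main

import (
	"encoding/binary"
	"fmt"
)

const locFlagStoredField = 1 << 0

// loc mirrors the per-location information kept in memory (illustrative type).
type loc struct {
	fieldID    uint16
	pos        uint64
	start, end uint64
}

// encodeLocBlockV2 sketches the version 2 layout for one block of locations:
// a shared flags integer, then per location an optional field id, the position
// delta shifted left one bit (low bit set when the token length equals the term
// length, so the end is omitted), the start delta from the previous end, and
// the token length only when it has to be stored.
func encodeLocBlockV2(term []byte, dictFieldID uint16, locs []loc) []byte {
	var flags uint64
	for _, l := range locs {
		if l.fieldID != dictFieldID {
			flags |= locFlagStoredField
		}
	}

	buf := binary.AppendUvarint(nil, flags)
	var lastPos, lastEnd uint64
	for _, l := range locs {
		if flags&locFlagStoredField != 0 {
			buf = binary.AppendUvarint(buf, uint64(l.fieldID))
		}
		var endOmitted uint64
		if l.end-l.start == uint64(len(term)) {
			endOmitted = 1
		}
		buf = binary.AppendUvarint(buf, (l.pos-lastPos)<<1|endOmitted)
		buf = binary.AppendUvarint(buf, l.start-lastEnd)
		if endOmitted == 0 {
			buf = binary.AppendUvarint(buf, l.end-l.start)
		}
		lastPos, lastEnd = l.pos, l.end
	}
	return buf
}

func main() {
	// Two occurrences of the 6-byte term "search" in the dictionary's own field:
	// version 1 spends four uvarints per location, while this layout costs two
	// bytes per location plus the shared flags byte.
	term := []byte("search")
	locs := []loc{
		{fieldID: 0, pos: 5, start: 20, end: 26},
		{fieldID: 0, pos: 9, start: 40, end: 46},
	}
	fmt.Println(len(encodeLocBlockV2(term, 0, locs))) // prints 5
}

The length of the returned slice corresponds to the per-document byte count that
the patch pre-computes (numBytesLocs) before writing the block.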
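
Reading a block back mirrors the writer: consume the shared flags integer, then
rebuild absolute values from the deltas, taking the field from the dictionary
and the end from the term length whenever they were omitted. This standalone
sketch (names again invented) follows the layout written above; in the patch
itself this work is done by readLocation in posting.go.

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

const locFlagStoredField = 1 << 0

type location struct {
	fieldID    uint16
	pos        uint64
	start, end uint64
}

// decodeLocBlockV2 walks one block written in the layout sketched above.
// n is the number of locations in the block (ice derives it from the freq/norm
// data); dictFieldID and term come from the dictionary entry being iterated.
func decodeLocBlockV2(data []byte, n int, dictFieldID uint16, term []byte) ([]location, error) {
	r := bytes.NewReader(data)
	flags, err := binary.ReadUvarint(r)
	if err != nil {
		return nil, err
	}

	out := make([]location, 0, n)
	var lastPos, lastEnd uint64
	for i := 0; i < n; i++ {
		l := location{fieldID: dictFieldID}
		if flags&locFlagStoredField != 0 {
			f, err := binary.ReadUvarint(r)
			if err != nil {
				return nil, err
			}
			l.fieldID = uint16(f)
		}

		posDelta, err := binary.ReadUvarint(r)
		if err != nil {
			return nil, err
		}
		endOmitted := posDelta&1 != 0
		l.pos = (posDelta >> 1) + lastPos // undo the shift, then apply the delta

		startDelta, err := binary.ReadUvarint(r)
		if err != nil {
			return nil, err
		}
		l.start = startDelta + lastEnd

		if endOmitted {
			l.end = l.start + uint64(len(term))
		} else {
			length, err := binary.ReadUvarint(r)
			if err != nil {
				return nil, err
			}
			l.end = l.start + length
		}

		lastPos, lastEnd = l.pos, l.end
		out = append(out, l)
	}
	return out, nil
}

func main() {
	// flags 0, then pos delta 5<<1|1 = 11 and start delta 20 for one location of
	// the 6-byte term "search": decodes to pos 5, start 20, end 26.
	locs, err := decodeLocBlockV2([]byte{0, 11, 20}, 1, 0, []byte("search"))
	fmt.Println(locs, err) // [{0 5 20 26}] <nil>
}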
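
Finally, the byte count that precedes each document's location block also covers
the flags integer, so a reader peels the flags off first and subtracts their
size before measuring how many location bytes remain - this is what the
numLocsBytes adjustment in nextAtOrAfter is doing. A minimal sketch, assuming a
length-prefixed block like the one produced above (helper names are invented):

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// skipLocBlockV2 shows only the bookkeeping: read the length prefix, read the
// flags, deduct the flags' own encoded size, then consume the remaining bytes.
func skipLocBlockV2(r *bytes.Reader) (flags uint64, err error) {
	numLocsBytes, err := binary.ReadUvarint(r)
	if err != nil {
		return 0, err
	}
	flags, err = binary.ReadUvarint(r)
	if err != nil {
		return 0, err
	}
	numLocsBytes -= uint64(len(binary.AppendUvarint(nil, flags)))

	remaining := r.Len()
	for uint64(remaining-r.Len()) < numLocsBytes {
		// a real reader decodes one full location here (see the sketch above);
		// this stand-in just consumes a single varint per pass
		if _, err := binary.ReadUvarint(r); err != nil {
			return 0, err
		}
	}
	return flags, nil
}

func main() {
	// length prefix 3 = flags byte + two location bytes, matching {0, 11, 20} above
	flags, err := skipLocBlockV2(bytes.NewReader([]byte{3, 0, 11, 20}))
	fmt.Println(flags, err) // 0 <nil>
}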