Skip to content

Commit

Permalink
First go at shrinking segment sizes - all by adjusting how locations …
Browse files Browse the repository at this point in the history
…are stored

Position, start and end offsets are all stored as deltas from the previous location. The end offset is stored as a delta from the start offset, which equals the length of the original token; in many cases, as described below, this value is not stored at all.
Every block of locations associated with a term stores its first integer as a set of flags, which in practice occupies a single byte in the file. There is currently one flag, indicating whether the field IDs in the locations are explicitly stored. If they are not stored, all fields in the locations are the same as the field that the associated dictionary refers to.
The position delta is stored shifted left by one bit, with the bottom bit indicating whether the token's length (which would otherwise be stored to reconstruct the end offset) equals the length of the term associated with the list of tokens. If the lengths are equal, the bit is set and the end field is omitted.
  • Loading branch information
waddyano committed Feb 22, 2022
1 parent 3993755 commit a5ffbee
Show file tree
Hide file tree
Showing 7 changed files with 251 additions and 38 deletions.
2 changes: 1 addition & 1 deletion cmd/ice/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func openFromFile(path string) (segment.Segment, closeFunc, error) {
seg, err := ice.Load(data)
if err != nil {
_ = closeF()
return nil, noCloseFunc, fmt.Errorf("error loading segment: %v", err)
return nil, noCloseFunc, fmt.Errorf("error loading segment %s: %v", path, err)
}

return seg, closeF, nil
Expand Down
14 changes: 8 additions & 6 deletions dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *Posti
if rv == nil || rv == emptyPostingsList {
return emptyPostingsList, nil
}
return d.postingsListInit(rv, except), nil
return d.postingsListInit(rv, term, except), nil
}

postingsOffset, exists, err := d.fstReader.Get(term)
Expand All @@ -61,14 +61,14 @@ func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *Posti
if rv == nil || rv == emptyPostingsList {
return emptyPostingsList, nil
}
return d.postingsListInit(rv, except), nil
return d.postingsListInit(rv, term, except), nil
}

return d.postingsListFromOffset(postingsOffset, except, rv)
return d.postingsListFromOffset(postingsOffset, term, except, rv)
}

func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
rv = d.postingsListInit(rv, except)
func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
rv = d.postingsListInit(rv, term, except)

err := rv.read(postingsOffset, d)
if err != nil {
Expand All @@ -78,7 +78,7 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari
return rv, nil
}

func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
func (d *Dictionary) postingsListInit(rv *PostingsList, term []byte, except *roaring.Bitmap) *PostingsList {
if rv == nil || rv == emptyPostingsList {
rv = &PostingsList{}
} else {
Expand All @@ -92,6 +92,8 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap)
rv.postings = postings
}
rv.sb = d.sb
rv.dict = d
rv.term = term
rv.except = except
return rv
}
Expand Down
2 changes: 1 addition & 1 deletion footer.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func parseFooter(data *segment.Data) (*footer, error) {
return nil, err
}
rv.version = binary.BigEndian.Uint32(verData)
if rv.version != Version {
if rv.version > Version {
return nil, fmt.Errorf("unsupported version %d", rv.version)
}

Expand Down
97 changes: 84 additions & 13 deletions merge.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,15 +281,15 @@ func persistMergedRestField(segments []*Segment, dropsIn []*roaring.Bitmap, fiel
}

if !bytes.Equal(prevTerm, term) || prevTerm == nil {
err = prepareNewTerm(newSegDocCount, chunkMode, tfEncoder, locEncoder, fieldFreqs, fieldID, enumerator,
err = prepareNewTerm(term, newSegDocCount, chunkMode, tfEncoder, locEncoder, fieldFreqs, fieldID, enumerator,
dicts, drops)
if err != nil {
return err
}
}

postings, err = dicts[itrI].postingsListFromOffset(
postingsOffset, drops[itrI], postings)
postingsOffset, term, drops[itrI], postings)
if err != nil {
return err
}
Expand All @@ -301,7 +301,7 @@ func persistMergedRestField(segments []*Segment, dropsIn []*roaring.Bitmap, fiel

// can no longer optimize by copying, since chunk factor could have changed
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
fieldsMap, postItr, newDocNums[itrI], newRoaring,
term, fieldsMap, fieldID, postItr, newDocNums[itrI], newRoaring,
tfEncoder, locEncoder, bufLoc, fieldDocTracking)

if err != nil {
Expand Down Expand Up @@ -427,7 +427,7 @@ func buildMergedDocVals(newSegDocCount uint64, w *countHashWriter, closeCh chan
return nil
}

func prepareNewTerm(newSegDocCount uint64, chunkMode uint32, tfEncoder, locEncoder *chunkedIntCoder,
func prepareNewTerm(term []byte, newSegDocCount uint64, chunkMode uint32, tfEncoder, locEncoder *chunkedIntCoder,
fieldFreqs map[uint16]uint64, fieldID int, enumerator *enumerator, dicts []*Dictionary,
drops []*roaring.Bitmap) error {
var err error
Expand All @@ -437,7 +437,7 @@ func prepareNewTerm(newSegDocCount uint64, chunkMode uint32, tfEncoder, locEncod
lowItrIdxs, lowItrVals := enumerator.GetLowIdxsAndValues()
for i, idx := range lowItrIdxs {
var pl *PostingsList
pl, err = dicts[idx].postingsListFromOffset(lowItrVals[i], drops[idx], nil)
pl, err = dicts[idx].postingsListFromOffset(lowItrVals[i], term, drops[idx], nil)
if err != nil {
return err
}
Expand Down Expand Up @@ -556,7 +556,7 @@ func setupActiveForField(segments []*Segment, dropsIn []*roaring.Bitmap, newDocN

const numUintsLocation = 4

func mergeTermFreqNormLocs(fieldsMap map[string]uint16, postItr *PostingsIterator,
func mergeTermFreqNormLocs(term []byte, fieldsMap map[string]uint16, fieldID int, postItr *PostingsIterator,
newDocNums []uint64, newRoaring *roaring.Bitmap,
tfEncoder, locEncoder *chunkedIntCoder, bufLoc []uint64, docTracking *roaring.Bitmap) (
lastDocNum, lastFreq, lastNorm uint64, bufLocOut []uint64, err error) {
Expand All @@ -583,25 +583,96 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, postItr *PostingsIterato

if len(locs) > 0 {
numBytesLocs := 0
numFieldBytes := 0
var locFlags LocFlags
var lastEnd uint64
var lastPos uint64
for _, loc := range locs {
numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1),
uint64(loc.Pos()), uint64(loc.Start()), uint64(loc.End()))
locFieldID := fieldsMap[loc.Field()] - 1
if Version == 1 {
numBytesLocs += totalUvarintBytes(uint64(locFieldID),
uint64(loc.Pos()), uint64(loc.Start()), uint64(loc.End()))
} else {
if locFieldID != uint16(fieldID) {
locFlags |= LocFlagStoredField
}
numBytesLocs += numUvarintBytes((uint64(loc.Pos()) - lastPos) << 1)
numBytesLocs += numUvarintBytes(uint64(loc.Start()) - lastEnd)
if len(term) != loc.End()-loc.Start() {
numBytesLocs += numUvarintBytes(uint64(loc.End() - loc.Start()))
}
numFieldBytes += numUvarintBytes(uint64(locFieldID))
lastEnd = uint64(loc.End())
lastPos = uint64(loc.Pos())
}
}

if Version == 2 {
if locFlags&LocFlagStoredField != 0 {
numBytesLocs += numFieldBytes
}

numBytesLocs += numUvarintBytes(uint64(locFlags))
}

err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs))
if err != nil {
return 0, 0, 0, nil, err
}

if Version == 2 {
err = locEncoder.Add(hitNewDocNum, uint64(locFlags))
if err != nil {
return 0, 0, 0, nil, err
}
}

lastEnd = 0
lastPos = 0
for _, loc := range locs {
if cap(bufLoc) < numUintsLocation {
bufLoc = make([]uint64, 0, numUintsLocation)
}
args := bufLoc[0:4]
args[0] = uint64(fieldsMap[loc.Field()] - 1)
args[1] = uint64(loc.Pos())
args[2] = uint64(loc.Start())
args[3] = uint64(loc.End())
var args []uint64
if Version == 2 {
var endNotStoredFlag uint64
if len(term) == loc.End()-loc.Start() {
endNotStoredFlag = 1
}
if (locFlags & LocFlagStoredField) == 0 {
if endNotStoredFlag != 0 {
args = bufLoc[0:2]
args[0] = (uint64(loc.Pos())-lastPos)<<1 | endNotStoredFlag
args[1] = uint64(loc.Start()) - lastEnd
} else {
args = bufLoc[0:3]
args[0] = (uint64(loc.Pos()) - lastPos) << 1
args[1] = uint64(loc.Start()) - lastEnd
args[2] = uint64(loc.End() - loc.Start())
}
} else {
if endNotStoredFlag != 0 {
args = bufLoc[0:3]
args[0] = uint64(fieldsMap[loc.Field()] - 1)
args[1] = (uint64(loc.Pos())-lastPos)<<1 | endNotStoredFlag
args[2] = uint64(loc.Start()) - lastEnd
} else {
args = bufLoc[0:4]
args[0] = uint64(fieldsMap[loc.Field()] - 1)
args[1] = (uint64(loc.Pos()) - lastPos) << 1
args[2] = uint64(loc.Start()) - lastEnd
args[3] = uint64(loc.End() - loc.Start())
}
}
lastEnd = uint64(loc.End())
lastPos = uint64(loc.Pos())
} else {
args = bufLoc[0:4]
args[0] = uint64(fieldsMap[loc.Field()] - 1)
args[1] = uint64(loc.Pos())
args[2] = uint64(loc.Start())
args[3] = uint64(loc.End())
}
err = locEncoder.Add(hitNewDocNum, args...)
if err != nil {
return 0, 0, 0, nil, err
Expand Down
84 changes: 78 additions & 6 deletions new.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package ice
import (
"bytes"
"encoding/binary"
"log"
"math"
"sort"
"sync"
Expand Down Expand Up @@ -704,7 +705,7 @@ func (s *interim) writeDictsField(docTermMap [][]byte, fieldID int, terms []stri
dict := s.Dicts[fieldID]

for _, term := range terms { // terms are already sorted
err2 := s.writeDictsTermField(docTermMap, dict, term, tfEncoder, locEncoder, buf)
err2 := s.writeDictsTermField(docTermMap, dict, fieldID, term, tfEncoder, locEncoder, buf)
if err2 != nil {
return err2
}
Expand Down Expand Up @@ -779,7 +780,7 @@ func (s *interim) writeDictsField(docTermMap [][]byte, fieldID int, terms []stri
return nil
}

func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint64, term string, tfEncoder,
func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint64, fieldID int, term string, tfEncoder,
locEncoder *chunkedIntCoder, buf []byte) error {
pid := dict[term] - 1

Expand Down Expand Up @@ -813,19 +814,90 @@ func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint6

if freqNorm.numLocs > 0 {
numBytesLocs := 0
numFieldBytes := 0
var lastEnd uint64
var lastPos uint64
var locFlags LocFlags
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
numBytesLocs += totalUvarintBytes(
uint64(loc.fieldID), loc.pos, loc.start, loc.end)
if loc.end < loc.start {
log.Fatal("backwards!!!\n")
}
if Version == 1 {
numBytesLocs += totalUvarintBytes(
uint64(loc.fieldID), loc.pos, loc.start, loc.end)
} else {
if loc.fieldID != uint16(fieldID) {
locFlags |= LocFlagStoredField
}

numBytesLocs += numUvarintBytes((loc.pos - lastPos) << 1)
numBytesLocs += numUvarintBytes(loc.start - lastEnd)
if uint64(len(term)) != loc.end-loc.start {
numBytesLocs += numUvarintBytes(loc.end - loc.start)
}
numFieldBytes += numUvarintBytes(uint64(loc.fieldID))
lastEnd = loc.end
lastPos = loc.pos
}
}

if Version == 2 {
if locFlags&LocFlagStoredField != 0 {
numBytesLocs += numFieldBytes
}

numBytesLocs += numUvarintBytes(uint64(locFlags))
}

err = locEncoder.Add(docNum, uint64(numBytesLocs))
if err != nil {
return err
}

if Version == 2 {
err = locEncoder.Add(docNum, uint64(locFlags))
if err != nil {
return err
}
}

lastEnd = 0
lastPos = 0
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
err = locEncoder.Add(docNum,
uint64(loc.fieldID), loc.pos, loc.start, loc.end)
if loc.start < lastEnd {
log.Fatal("loc backwards\n")
}

var endNotStoredFlag uint64
if uint64(len(term)) == loc.end-loc.start {
endNotStoredFlag = 1
}

if Version == 1 {
err = locEncoder.Add(docNum,
uint64(loc.fieldID), loc.pos, loc.start, loc.end)
} else {
posDelta := loc.pos - lastPos
if (locFlags & LocFlagStoredField) != 0 {
if endNotStoredFlag != 0 {
err = locEncoder.Add(docNum,
uint64(loc.fieldID), (posDelta<<1)|endNotStoredFlag, loc.start-lastEnd)
} else {
err = locEncoder.Add(docNum,
uint64(loc.fieldID), posDelta<<1, loc.start-lastEnd, loc.end-loc.start)
}
} else {
if endNotStoredFlag != 0 {
err = locEncoder.Add(docNum,
(posDelta<<1)|endNotStoredFlag, loc.start-lastEnd)
} else {
err = locEncoder.Add(docNum,
posDelta<<1, loc.start-lastEnd, loc.end-loc.start)
}
}
}
lastEnd = loc.end
lastPos = loc.pos
if err != nil {
return err
}
Expand Down
Loading

0 comments on commit a5ffbee

Please sign in to comment.