diff --git a/etc/config.sample.toml b/etc/config.sample.toml index 0846a0b2919..fdf5d50d84d 100644 --- a/etc/config.sample.toml +++ b/etc/config.sample.toml @@ -37,7 +37,9 @@ reporting-disabled = false [data] dir = "/var/opt/influxdb/data" - # Controls the engine type for new shards. + # Controls the engine type for new shards. Options are b1, bz1, or tsm1. + # b1 is the 0.9.2 storage engine, bz1 is the 0.9.3 and 0.9.4 engine. + # tsm1 is the 0.9.5 engine # engine ="bz1" # The following WAL settings are for the b1 storage engine used in 0.9.2. They won't diff --git a/models/points.go b/models/points.go index 1ded7234d5b..592780389d4 100644 --- a/models/points.go +++ b/models/points.go @@ -1021,6 +1021,10 @@ func (p *point) Tags() Tags { i, key = scanTo(p.key, i, '=') i, value = scanTagValue(p.key, i+1) + if len(value) == 0 { + continue + } + tags[string(unescapeTag(key))] = string(unescapeTag(value)) i += 1 @@ -1141,7 +1145,10 @@ func (t Tags) HashKey() []byte { for k, v := range t { ek := escapeTag([]byte(k)) ev := escapeTag([]byte(v)) - escaped[string(ek)] = string(ev) + + if len(ev) > 0 { + escaped[string(ek)] = string(ev) + } } // Extract keys and determine final size. diff --git a/models/points_test.go b/models/points_test.go index 1d4d8bf866a..4186b89d970 100644 --- a/models/points_test.go +++ b/models/points_test.go @@ -605,6 +605,18 @@ func TestParsePointUnescape(t *testing.T) { }, time.Unix(0, 0))) + // tag with no value + test(t, `cpu,regions=east value="1"`, + models.NewPoint("cpu", + models.Tags{ + "regions": "east", + "foobar": "", + }, + models.Fields{ + "value": "1", + }, + time.Unix(0, 0))) + // commas in field values test(t, `cpu,regions=east value="1,0"`, models.NewPoint("cpu", diff --git a/services/copier/service_test.go b/services/copier/service_test.go index a5266087d7f..ce1151d3cf8 100644 --- a/services/copier/service_test.go +++ b/services/copier/service_test.go @@ -19,6 +19,7 @@ import ( // Ensure the service can return shard data. 
func TestService_handleConn(t *testing.T) { + t.Skip("not implemented for tsm1 engine") s := MustOpenService() defer s.Close() diff --git a/tsdb/config.go b/tsdb/config.go index 9843541e296..52d182c1172 100644 --- a/tsdb/config.go +++ b/tsdb/config.go @@ -42,7 +42,15 @@ const ( // we'll need to create backpressure, otherwise we'll fill up the memory and die. // This number multiplied by the parition count is roughly the max possible memory // size for the in-memory WAL cache. - DefaultPartitionSizeThreshold = 20 * 1024 * 1024 // 20MB + DefaultPartitionSizeThreshold = 50 * 1024 * 1024 // 50MB + + // Default WAL settings for the TSM1 WAL + DefaultFlushMemorySizeThreshold = 5 * 1024 * 1024 // 5MB + DefaultMaxMemorySizeThreshold = 100 * 1024 * 1024 // 100MB + DefaultIndexCompactionAge = time.Minute + DefaultIndexMinCompactionInterval = time.Minute + DefaultIndexMinCompactionFileCount = 5 + DefaultIndexCompactionFullAge = 5 * time.Minute ) type Config struct { @@ -63,6 +71,28 @@ type Config struct { WALFlushColdInterval toml.Duration `toml:"wal-flush-cold-interval"` WALPartitionSizeThreshold uint64 `toml:"wal-partition-size-threshold"` + // WAL configuration options for tsm1 introduced in 0.9.5 + WALFlushMemorySizeThreshold int `toml:"wal-flush-memory-size-threshold"` + WALMaxMemorySizeThreshold int `toml:"wal-max-memory-size-threshold"` + + // compaction options for tsm1 introduced in 0.9.5 + + // IndexCompactionAge specifies the duration after the data file creation time + // at which it is eligible to be compacted + IndexCompactionAge time.Duration `toml:"index-compaction-age"` + + // IndexMinimumCompactionInterval specifies the minimum amount of time that must + // pass after a compaction before another compaction is run + IndexMinCompactionInterval time.Duration `toml:"index-min-compaction-interval"` + + // IndexCompactionFileCount specifies the minimum number of data files that + // must be eligible for compaction before actually running one + 
IndexMinCompactionFileCount int `toml:"index-compaction-min-file-count"` + + // IndexCompactionFullAge specifies how long after the last write was received + // in the WAL that a full compaction should be performed. + IndexCompactionFullAge time.Duration `toml:"index-compaction-full-age"` + // Query logging QueryLogEnabled bool `toml:"query-log-enabled"` } @@ -74,12 +104,18 @@ func NewConfig() Config { WALFlushInterval: toml.Duration(DefaultWALFlushInterval), WALPartitionFlushDelay: toml.Duration(DefaultWALPartitionFlushDelay), - WALLoggingEnabled: true, - WALReadySeriesSize: DefaultReadySeriesSize, - WALCompactionThreshold: DefaultCompactionThreshold, - WALMaxSeriesSize: DefaultMaxSeriesSize, - WALFlushColdInterval: toml.Duration(DefaultFlushColdInterval), - WALPartitionSizeThreshold: DefaultPartitionSizeThreshold, + WALLoggingEnabled: true, + WALReadySeriesSize: DefaultReadySeriesSize, + WALCompactionThreshold: DefaultCompactionThreshold, + WALMaxSeriesSize: DefaultMaxSeriesSize, + WALFlushColdInterval: toml.Duration(DefaultFlushColdInterval), + WALPartitionSizeThreshold: DefaultPartitionSizeThreshold, + WALFlushMemorySizeThreshold: DefaultFlushMemorySizeThreshold, + WALMaxMemorySizeThreshold: DefaultMaxMemorySizeThreshold, + IndexCompactionAge: DefaultIndexCompactionAge, + IndexMinCompactionFileCount: DefaultIndexMinCompactionFileCount, + IndexCompactionFullAge: DefaultIndexCompactionFullAge, + IndexMinCompactionInterval: DefaultIndexMinCompactionInterval, QueryLogEnabled: true, } diff --git a/tsdb/engine.go b/tsdb/engine.go index c8d5946139f..fb1b2108c5c 100644 --- a/tsdb/engine.go +++ b/tsdb/engine.go @@ -24,7 +24,7 @@ type Engine interface { Close() error SetLogOutput(io.Writer) - LoadMetadataIndex(index *DatabaseIndex, measurementFields map[string]*MeasurementFields) error + LoadMetadataIndex(shard *Shard, index *DatabaseIndex, measurementFields map[string]*MeasurementFields) error Begin(writable bool) (Tx, error) WritePoints(points []models.Point, 
measurementFieldsToSave map[string]*MeasurementFields, seriesToCreate []*SeriesCreate) error @@ -32,9 +32,23 @@ type Engine interface { DeleteMeasurement(name string, seriesKeys []string) error SeriesCount() (n int, err error) + // PerformMaintenance will get called periodically by the store + PerformMaintenance() + + // Format will return the format for the engine + Format() EngineFormat + io.WriterTo } +type EngineFormat int + +const ( + B1Format EngineFormat = iota + BZ1Format + TSM1Format +) + // NewEngineFunc creates a new engine. type NewEngineFunc func(path string, walPath string, options EngineOptions) Engine @@ -57,9 +71,24 @@ func NewEngine(path string, walPath string, options EngineOptions) (Engine, erro return newEngineFuncs[options.EngineVersion](path, walPath, options), nil } - // Only bolt-based backends are currently supported so open it and check the format. + // Only bolt and tsm1 based storage engines are currently supported var format string if err := func() error { + // if it's a dir then it's a tsm1 engine + f, err := os.Open(path) + if err != nil { + return err + } + fi, err := f.Stat() + f.Close() + if err != nil { + return err + } + if fi.Mode().IsDir() { + format = "tsm1" + return nil + } + db, err := bolt.Open(path, 0666, &bolt.Options{Timeout: 1 * time.Second}) if err != nil { return err diff --git a/tsdb/engine/b1/b1.go b/tsdb/engine/b1/b1.go index fdc337b2f7a..a3f63602cdc 100644 --- a/tsdb/engine/b1/b1.go +++ b/tsdb/engine/b1/b1.go @@ -91,6 +91,14 @@ func NewEngine(path string, walPath string, opt tsdb.EngineOptions) tsdb.Engine // Path returns the path the engine was initialized with. func (e *Engine) Path() string { return e.path } +// PerformMaintenance is for periodic maintenance of the store. A no-op for b1 +func (e *Engine) PerformMaintenance() {} + +// Format returns the format type of this engine +func (e *Engine) Format() tsdb.EngineFormat { + return tsdb.B1Format +} + // Open opens and initializes the engine. 
func (e *Engine) Open() error { if err := func() error { @@ -174,7 +182,7 @@ func (e *Engine) close() error { func (e *Engine) SetLogOutput(w io.Writer) { e.LogOutput = w } // LoadMetadataIndex loads the shard metadata into memory. -func (e *Engine) LoadMetadataIndex(index *tsdb.DatabaseIndex, measurementFields map[string]*tsdb.MeasurementFields) error { +func (e *Engine) LoadMetadataIndex(shard *tsdb.Shard, index *tsdb.DatabaseIndex, measurementFields map[string]*tsdb.MeasurementFields) error { return e.db.View(func(tx *bolt.Tx) error { // load measurement metadata meta := tx.Bucket([]byte("fields")) diff --git a/tsdb/engine/b1/b1_test.go b/tsdb/engine/b1/b1_test.go index 5c3c19ee3bc..31b90344c36 100644 --- a/tsdb/engine/b1/b1_test.go +++ b/tsdb/engine/b1/b1_test.go @@ -21,7 +21,7 @@ func TestEngine_WritePoints(t *testing.T) { // Create metadata. mf := &tsdb.MeasurementFields{Fields: make(map[string]*tsdb.Field)} - mf.CreateFieldIfNotExists("value", influxql.Float) + mf.CreateFieldIfNotExists("value", influxql.Float, true) seriesToCreate := []*tsdb.SeriesCreate{ {Series: tsdb.NewSeries(string(models.MakeKey([]byte("temperature"), nil)), nil)}, } @@ -84,7 +84,7 @@ func TestEngine_WritePoints_Reverse(t *testing.T) { // Create metadata. mf := &tsdb.MeasurementFields{Fields: make(map[string]*tsdb.Field)} - mf.CreateFieldIfNotExists("value", influxql.Float) + mf.CreateFieldIfNotExists("value", influxql.Float, true) seriesToCreate := []*tsdb.SeriesCreate{ {Series: tsdb.NewSeries(string(models.MakeKey([]byte("temperature"), nil)), nil)}, } diff --git a/tsdb/engine/bz1/bz1.go b/tsdb/engine/bz1/bz1.go index e4d5682ca47..881b82dc431 100644 --- a/tsdb/engine/bz1/bz1.go +++ b/tsdb/engine/bz1/bz1.go @@ -114,6 +114,14 @@ func NewEngine(path string, walPath string, opt tsdb.EngineOptions) tsdb.Engine // Path returns the path the engine was opened with. func (e *Engine) Path() string { return e.path } +// PerformMaintenance is for periodic maintenance of the store. 
A no-op for bz1 +func (e *Engine) PerformMaintenance() {} + +// Format returns the format type of this engine +func (e *Engine) Format() tsdb.EngineFormat { + return tsdb.BZ1Format +} + // Open opens and initializes the engine. func (e *Engine) Open() error { if err := func() error { @@ -176,7 +184,7 @@ func (e *Engine) close() error { func (e *Engine) SetLogOutput(w io.Writer) {} // LoadMetadataIndex loads the shard metadata into memory. -func (e *Engine) LoadMetadataIndex(index *tsdb.DatabaseIndex, measurementFields map[string]*tsdb.MeasurementFields) error { +func (e *Engine) LoadMetadataIndex(shard *tsdb.Shard, index *tsdb.DatabaseIndex, measurementFields map[string]*tsdb.MeasurementFields) error { if err := e.db.View(func(tx *bolt.Tx) error { // Load measurement metadata fields, err := e.readFields(tx) diff --git a/tsdb/engine/bz1/bz1_test.go b/tsdb/engine/bz1/bz1_test.go index 97873afe3b2..0b0cb1e60cc 100644 --- a/tsdb/engine/bz1/bz1_test.go +++ b/tsdb/engine/bz1/bz1_test.go @@ -38,7 +38,7 @@ func TestEngine_LoadMetadataIndex_Series(t *testing.T) { // Load metadata index. index := tsdb.NewDatabaseIndex() - if err := e.LoadMetadataIndex(index, make(map[string]*tsdb.MeasurementFields)); err != nil { + if err := e.LoadMetadataIndex(nil, index, make(map[string]*tsdb.MeasurementFields)); err != nil { t.Fatal(err) } @@ -80,7 +80,7 @@ func TestEngine_LoadMetadataIndex_Fields(t *testing.T) { // Load metadata index. 
mfs := make(map[string]*tsdb.MeasurementFields) - if err := e.LoadMetadataIndex(tsdb.NewDatabaseIndex(), mfs); err != nil { + if err := e.LoadMetadataIndex(nil, tsdb.NewDatabaseIndex(), mfs); err != nil { t.Fatal(err) } diff --git a/tsdb/engine/engine.go b/tsdb/engine/engine.go index c5565ff06cf..6c8cb51e193 100644 --- a/tsdb/engine/engine.go +++ b/tsdb/engine/engine.go @@ -3,4 +3,5 @@ package engine import ( _ "github.com/influxdb/influxdb/tsdb/engine/b1" _ "github.com/influxdb/influxdb/tsdb/engine/bz1" + _ "github.com/influxdb/influxdb/tsdb/engine/tsm1" ) diff --git a/tsdb/engine/tsm1/bool.go b/tsdb/engine/tsm1/bool.go new file mode 100644 index 00000000000..83f570a2ef2 --- /dev/null +++ b/tsdb/engine/tsm1/bool.go @@ -0,0 +1,135 @@ +package tsm1 + +// bool encoding uses 1 bit per value. Each compressed byte slice contains a 1 byte header +// indicating the compression type, followed by a variable byte encoded length indicating +// how many booleans are packed in the slice. The remaining bytes contains 1 byte for every +// 8 boolean values encoded. 
+ +import "encoding/binary" + +const ( + // boolUncompressed is an uncompressed boolean format + boolUncompressed = 0 + // boolCompressedBitPacked is an bit packed format using 1 bit per boolean + boolCompressedBitPacked = 1 +) + +type BoolEncoder interface { + Write(b bool) + Bytes() ([]byte, error) +} + +type BoolDecoder interface { + Next() bool + Read() bool + Error() error +} + +type boolEncoder struct { + // The encoded bytes + bytes []byte + + // The current byte being encoded + b byte + + // The number of bools packed into b + i int + + // The total number of bools written + n int +} + +func NewBoolEncoder() BoolEncoder { + return &boolEncoder{} +} + +func (e *boolEncoder) Write(b bool) { + // If we have filled the current byte, flush it + if e.i >= 8 { + e.flush() + } + + // Use 1 bit for each boolen value, shift the current byte + // by 1 and set the least signficant bit acordingly + e.b = e.b << 1 + if b { + e.b |= 1 + } + + // Increment the current bool count + e.i += 1 + // Increment the total bool count + e.n += 1 +} + +func (e *boolEncoder) flush() { + // Pad remaining byte w/ 0s + for e.i < 8 { + e.b = e.b << 1 + e.i += 1 + } + + // If we have bits set, append them to the byte slice + if e.i > 0 { + e.bytes = append(e.bytes, e.b) + e.b = 0 + e.i = 0 + } +} + +func (e *boolEncoder) Bytes() ([]byte, error) { + // Ensure the current byte is flushed + e.flush() + b := make([]byte, 10+1) + + // Store the encoding type in the 4 high bits of the first byte + b[0] = byte(boolCompressedBitPacked) << 4 + + i := 1 + // Encode the number of bools written + i += binary.PutUvarint(b[i:], uint64(e.n)) + + // Append the packed booleans + return append(b[:i], e.bytes...), nil +} + +type boolDecoder struct { + b []byte + i int + n int + err error +} + +func NewBoolDecoder(b []byte) BoolDecoder { + // First byte stores the encoding type, only have 1 bit-packet format + // currently ignore for now. 
+ b = b[1:] + count, n := binary.Uvarint(b) + return &boolDecoder{b: b[n:], i: -1, n: int(count)} +} + +func (e *boolDecoder) Next() bool { + e.i += 1 + return e.i < e.n +} + +func (e *boolDecoder) Read() bool { + // Index into the byte slice + idx := e.i / 8 + + // Bit position + pos := (8 - e.i%8) - 1 + + // The mask to select the bit + mask := byte(1 << uint(pos)) + + // The packed byte + v := e.b[idx] + + // Returns true if the bit is set + return v&mask == mask +} + +func (e *boolDecoder) Error() error { + return e.err +} diff --git a/tsdb/engine/tsm1/bool_test.go b/tsdb/engine/tsm1/bool_test.go new file mode 100644 index 00000000000..ed68987afd1 --- /dev/null +++ b/tsdb/engine/tsm1/bool_test.go @@ -0,0 +1,73 @@ +package tsm1_test + +import ( + "testing" + + "github.com/influxdb/influxdb/tsdb/engine/tsm1" +) + +func Test_BoolEncoder_NoValues(t *testing.T) { + enc := tsm1.NewBoolEncoder() + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := tsm1.NewBoolDecoder(b) + if dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } +} + +func Test_BoolEncoder_Single(t *testing.T) { + enc := tsm1.NewBoolEncoder() + v1 := true + enc.Write(v1) + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := tsm1.NewBoolDecoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got false, exp true") + } + + if v1 != dec.Read() { + t.Fatalf("unexpected value: got %v, exp %v", dec.Read(), v1) + } +} + +func Test_BoolEncoder_Multi_Compressed(t *testing.T) { + enc := tsm1.NewBoolEncoder() + + values := make([]bool, 10) + for i := range values { + values[i] = i%2 == 0 + enc.Write(values[i]) + } + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if exp := 4; len(b) != exp { + t.Fatalf("unexpected length: got %v, exp %v", len(b), exp) + } + + dec := tsm1.NewBoolDecoder(b) + + for i, v := range values { + if !dec.Next() { + 
t.Fatalf("unexpected next value: got false, exp true") + } + if v != dec.Read() { + t.Fatalf("unexpected value at pos %d: got %v, exp %v", i, dec.Read(), v) + } + } + + if dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } +} diff --git a/tsdb/engine/tsm1/cursor.go b/tsdb/engine/tsm1/cursor.go new file mode 100644 index 00000000000..0cee157d3a3 --- /dev/null +++ b/tsdb/engine/tsm1/cursor.go @@ -0,0 +1,481 @@ +package tsm1 + +import ( + "math" + + "github.com/influxdb/influxdb/tsdb" +) + +// combinedEngineCursor holds a cursor for the WAL and the index +// and will combine the two together. Any points in the WAL with +// identical timestamps from the index will be preferred over the +// index point +type combinedEngineCursor struct { + walCursor tsdb.Cursor + engineCursor tsdb.Cursor + walKeyBuf int64 + walValueBuf interface{} + engineKeyBuf int64 + engineValueBuf interface{} + ascending bool +} + +func NewCombinedEngineCursor(wc, ec tsdb.Cursor, ascending bool) tsdb.Cursor { + return &combinedEngineCursor{ + walCursor: wc, + engineCursor: ec, + ascending: ascending, + } +} + +// SeekTo will seek both the index and WAL cursor +func (c *combinedEngineCursor) SeekTo(seek int64) (key int64, value interface{}) { + c.walKeyBuf, c.walValueBuf = c.walCursor.SeekTo(seek) + c.engineKeyBuf, c.engineValueBuf = c.engineCursor.SeekTo(seek) + return c.read() +} + +// Next returns the next value in the cursor +func (c *combinedEngineCursor) Next() (int64, interface{}) { + return c.read() +} + +// Ascending returns true if the cursor is time ascending +func (c *combinedEngineCursor) Ascending() bool { + return c.ascending +} + +// read will return the buffer value that is next from either the +// WAL or index cursor and repopulate the buffer value with the +// appropriate cursor's next value +func (c *combinedEngineCursor) read() (key int64, value interface{}) { + if c.walKeyBuf == tsdb.EOF && c.engineKeyBuf == tsdb.EOF { + return tsdb.EOF, nil + } + + // 
handle the case where they have the same point + if c.walKeyBuf == c.engineKeyBuf { + // keep the wal value since it will overwrite the engine value + key = c.walKeyBuf + value = c.walValueBuf + c.walKeyBuf, c.walValueBuf = c.walCursor.Next() + + // overwrite the buffered engine values + c.engineKeyBuf, c.engineValueBuf = c.engineCursor.Next() + return + } + + // ascending order + if c.ascending { + if c.walKeyBuf != tsdb.EOF && (c.walKeyBuf < c.engineKeyBuf || c.engineKeyBuf == tsdb.EOF) { + key = c.walKeyBuf + value = c.walValueBuf + c.walKeyBuf, c.walValueBuf = c.walCursor.Next() + return + } + + key = c.engineKeyBuf + value = c.engineValueBuf + c.engineKeyBuf, c.engineValueBuf = c.engineCursor.Next() + return + } + + // descending order + if c.walKeyBuf != tsdb.EOF && c.walKeyBuf > c.engineKeyBuf { + key = c.walKeyBuf + value = c.walValueBuf + c.walKeyBuf, c.walValueBuf = c.walCursor.Next() + return + } + + key = c.engineKeyBuf + value = c.engineValueBuf + c.engineKeyBuf, c.engineValueBuf = c.engineCursor.Next() + return +} + +// multieFieldCursor wraps cursors for multiple fields on the same series +// key. 
Instead of returning a plain interface value in the call for Next(), +// it returns a map[string]interface{} for the field values +type multiFieldCursor struct { + fields []string + cursors []tsdb.Cursor + ascending bool + keyBuffer []int64 + valueBuffer []interface{} +} + +func NewMultiFieldCursor(fields []string, cursors []tsdb.Cursor, ascending bool) tsdb.Cursor { + return &multiFieldCursor{ + fields: fields, + cursors: cursors, + ascending: ascending, + keyBuffer: make([]int64, len(cursors)), + valueBuffer: make([]interface{}, len(cursors)), + } +} + +func (m *multiFieldCursor) SeekTo(seek int64) (key int64, value interface{}) { + for i, c := range m.cursors { + m.keyBuffer[i], m.valueBuffer[i] = c.SeekTo(seek) + } + return m.read() +} + +func (m *multiFieldCursor) Next() (int64, interface{}) { + return m.read() +} + +func (m *multiFieldCursor) Ascending() bool { + return m.ascending +} + +func (m *multiFieldCursor) read() (int64, interface{}) { + t := int64(math.MaxInt64) + if !m.ascending { + t = int64(math.MinInt64) + } + + // find the time we need to combine all fields + for _, k := range m.keyBuffer { + if k == tsdb.EOF { + continue + } + if m.ascending && t > k { + t = k + } else if !m.ascending && t < k { + t = k + } + } + + // get the value and advance each of the cursors that have the matching time + if t == math.MinInt64 || t == math.MaxInt64 { + return tsdb.EOF, nil + } + + mm := make(map[string]interface{}) + for i, k := range m.keyBuffer { + if k == t { + mm[m.fields[i]] = m.valueBuffer[i] + m.keyBuffer[i], m.valueBuffer[i] = m.cursors[i].Next() + } + } + return t, mm +} + +type emptyCursor struct { + ascending bool +} + +func (c *emptyCursor) Next() (int64, interface{}) { return tsdb.EOF, nil } +func (c *emptyCursor) SeekTo(key int64) (int64, interface{}) { return tsdb.EOF, nil } +func (c *emptyCursor) Ascending() bool { return c.ascending } + +// cursor is a cursor for the data in the index +type cursor struct { + // id for the series key and 
field + id uint64 + + // f is the current data file we're reading from + f *dataFile + + // filesPos is the position in the files index we're reading from + filesPos int // the index in the files slice we're looking at + + // pos is the position in the current data file we're reading + pos uint32 + + // vals is the current decoded block of Values we're iterating from + vals Values + + ascending bool + + // blockPositions is used for descending queries to keep track + // of what positions in the current data file encoded blocks for + // the id exist at + blockPositions []uint32 + + // time acending slice of read only data files + files []*dataFile +} + +func newCursor(id uint64, files []*dataFile, ascending bool) *cursor { + return &cursor{ + id: id, + ascending: ascending, + files: files, + } +} + +func (c *cursor) SeekTo(seek int64) (int64, interface{}) { + if len(c.files) == 0 { + return tsdb.EOF, nil + } + + if c.ascending { + if seek <= c.files[0].MinTime() { + c.filesPos = 0 + c.f = c.files[0] + } else { + for i, f := range c.files { + if seek >= f.MinTime() && seek <= f.MaxTime() { + c.filesPos = i + c.f = f + break + } + } + } + } else { + if seek >= c.files[len(c.files)-1].MaxTime() { + c.filesPos = len(c.files) - 1 + c.f = c.files[c.filesPos] + } else if seek < c.files[0].MinTime() { + return tsdb.EOF, nil + } else { + for i, f := range c.files { + if seek >= f.MinTime() && seek <= f.MaxTime() { + c.filesPos = i + c.f = f + break + } + } + } + } + + if c.f == nil { + return tsdb.EOF, nil + } + + // find the first file we need to check in + for { + if c.filesPos < 0 || c.filesPos >= len(c.files) { + return tsdb.EOF, nil + } + c.f = c.files[c.filesPos] + + c.pos = c.f.StartingPositionForID(c.id) + + // if this id isn't in this file, move to next one or return + if c.pos == 0 { + if c.ascending { + c.filesPos++ + } else { + c.filesPos-- + c.blockPositions = nil + } + continue + } + + // handle seek for correct order + k := tsdb.EOF + var v interface{} + + if 
c.ascending { + k, v = c.seekAscending(seek) + } else { + k, v = c.seekDescending(seek) + } + + if k != tsdb.EOF { + return k, v + } + + if c.ascending { + c.filesPos++ + } else { + c.filesPos-- + c.blockPositions = nil + } + } +} + +func (c *cursor) seekAscending(seek int64) (int64, interface{}) { + // seek to the block and values we're looking for + for { + // if the time is between this block and the next, + // decode this block and go, otherwise seek to next block + length := c.blockLength(c.pos) + + // if the next block has a time less than what we're seeking to, + // skip decoding this block and continue on + nextBlockPos := c.pos + blockHeaderSize + length + if nextBlockPos < c.f.indexPosition() { + nextBlockID := btou64(c.f.mmap[nextBlockPos : nextBlockPos+8]) + if nextBlockID == c.id { + nextBlockTime := c.blockMinTime(nextBlockPos) + if nextBlockTime <= seek { + c.pos = nextBlockPos + continue + } + } + } + + // it must be in this block or not at all + id := btou64((c.f.mmap[c.pos : c.pos+8])) + if id != c.id { + return tsdb.EOF, nil + } + c.decodeBlock(c.pos) + + // see if we can find it in this block + for i, v := range c.vals { + if v.Time().UnixNano() >= seek { + c.vals = c.vals[i+1:] + return v.Time().UnixNano(), v.Value() + } + } + } +} + +func (c *cursor) seekDescending(seek int64) (int64, interface{}) { + c.setBlockPositions() + if len(c.blockPositions) == 0 { + return tsdb.EOF, nil + } + + for i := len(c.blockPositions) - 1; i >= 0; i-- { + pos := c.blockPositions[i] + if c.blockMinTime(pos) > seek { + continue + } + + c.decodeBlock(pos) + c.blockPositions = c.blockPositions[:i] + + for i := len(c.vals) - 1; i >= 0; i-- { + val := c.vals[i] + if seek >= val.UnixNano() { + c.vals = c.vals[:i] + return val.UnixNano(), val.Value() + } + if seek < val.UnixNano() { + // we need to move to the next block + if i == 0 { + break + } + val := c.vals[i-1] + c.vals = c.vals[:i-1] + return val.UnixNano(), val.Value() + } + } + c.blockPositions = 
c.blockPositions[:i] + } + + return tsdb.EOF, nil +} + +// blockMinTime is the minimum time for the block +func (c *cursor) blockMinTime(pos uint32) int64 { + return int64(btou64(c.f.mmap[pos+12 : pos+20])) +} + +// setBlockPositions will read the positions of all +// blocks for the cursor id in the given data file +func (c *cursor) setBlockPositions() { + pos := c.pos + + for { + if pos >= c.f.indexPosition() { + return + } + + length := c.blockLength(pos) + id := btou64(c.f.mmap[pos : pos+8]) + + if id != c.id { + return + } + + c.blockPositions = append(c.blockPositions, pos) + pos += blockHeaderSize + length + } +} + +func (c *cursor) Next() (int64, interface{}) { + if c.ascending { + k, v := c.nextAscending() + return k, v + } + return c.nextDescending() +} + +func (c *cursor) nextAscending() (int64, interface{}) { + if len(c.vals) > 0 { + v := c.vals[0] + c.vals = c.vals[1:] + + return v.Time().UnixNano(), v.Value() + } + + // if we have a file set, see if the next block is for this ID + if c.f != nil && c.pos < c.f.indexPosition() { + nextBlockID := btou64(c.f.mmap[c.pos : c.pos+8]) + if nextBlockID == c.id { + c.decodeBlock(c.pos) + return c.nextAscending() + } + } + + // loop through the files until we hit the next one that has this id + for { + c.filesPos++ + if c.filesPos >= len(c.files) { + return tsdb.EOF, nil + } + c.f = c.files[c.filesPos] + + startingPos := c.f.StartingPositionForID(c.id) + if startingPos == 0 { + // move to next file because it isn't in this one + continue + } + + // we have a block with this id, decode and return + c.decodeBlock(startingPos) + return c.nextAscending() + } +} + +func (c *cursor) nextDescending() (int64, interface{}) { + if len(c.vals) > 0 { + v := c.vals[len(c.vals)-1] + if len(c.vals) >= 1 { + c.vals = c.vals[:len(c.vals)-1] + } else { + c.vals = nil + } + return v.UnixNano(), v.Value() + } + + for i := len(c.blockPositions) - 1; i >= 0; i-- { + c.decodeBlock(c.blockPositions[i]) + c.blockPositions = 
c.blockPositions[:i] + if len(c.vals) == 0 { + continue + } + val := c.vals[len(c.vals)-1] + c.vals = c.vals[:len(c.vals)-1] + return val.UnixNano(), val.Value() + } + + return tsdb.EOF, nil +} + +func (c *cursor) blockLength(pos uint32) uint32 { + return btou32(c.f.mmap[pos+8 : pos+12]) +} + +// decodeBlock will decod the block and set the vals +func (c *cursor) decodeBlock(position uint32) { + length := c.blockLength(position) + block := c.f.mmap[position+blockHeaderSize : position+blockHeaderSize+length] + c.vals, _ = DecodeBlock(block) + + // only adavance the position if we're asceending. + // Descending queries use the blockPositions + if c.ascending { + c.pos = position + blockHeaderSize + length + } +} + +func (c *cursor) Ascending() bool { return c.ascending } diff --git a/tsdb/engine/tsm1/encoding.go b/tsdb/engine/tsm1/encoding.go new file mode 100644 index 00000000000..3de88586320 --- /dev/null +++ b/tsdb/engine/tsm1/encoding.go @@ -0,0 +1,554 @@ +package tsm1 + +import ( + "encoding/binary" + "fmt" + "sort" + "time" + + "github.com/influxdb/influxdb/tsdb" +) + +const ( + // BlockFloat64 designates a block encodes float64 values + BlockFloat64 = 0 + + // BlockInt64 designates a block encodes int64 values + BlockInt64 = 1 + + // BlockBool designates a block encodes bool values + BlockBool = 2 + + // BlockString designates a block encodes string values + BlockString = 3 + + // encodedBlockHeaderSize is the size of the header for an encoded block. The first 8 bytes + // are the minimum timestamp of the block. The next byte is a block encoding type indicator. 
+ encodedBlockHeaderSize = 9 +) + +type Value interface { + Time() time.Time + UnixNano() int64 + Value() interface{} + Size() int +} + +func NewValue(t time.Time, value interface{}) Value { + switch v := value.(type) { + case int64: + return &Int64Value{time: t, value: v} + case float64: + return &FloatValue{time: t, value: v} + case bool: + return &BoolValue{time: t, value: v} + case string: + return &StringValue{time: t, value: v} + } + return &EmptyValue{} +} + +type EmptyValue struct { +} + +func (e *EmptyValue) UnixNano() int64 { return tsdb.EOF } +func (e *EmptyValue) Time() time.Time { return time.Unix(0, tsdb.EOF) } +func (e *EmptyValue) Value() interface{} { return nil } +func (e *EmptyValue) Size() int { return 0 } + +// Values represented a time ascending sorted collection of Value types. +// the underlying type should be the same across all values, but the interface +// makes the code cleaner. +type Values []Value + +func (v Values) MinTime() int64 { + return v[0].Time().UnixNano() +} + +func (v Values) MaxTime() int64 { + return v[len(v)-1].Time().UnixNano() +} + +func (v Values) Encode(buf []byte) ([]byte, error) { + switch v[0].(type) { + case *FloatValue: + return encodeFloatBlock(buf, v) + case *Int64Value: + return encodeInt64Block(buf, v) + case *BoolValue: + return encodeBoolBlock(buf, v) + case *StringValue: + return encodeStringBlock(buf, v) + } + + return nil, fmt.Errorf("unsupported value type %T", v[0]) +} + +func (v Values) DecodeSameTypeBlock(block []byte) Values { + switch v[0].(type) { + case *FloatValue: + a, _ := decodeFloatBlock(block) + return a + case *Int64Value: + a, _ := decodeInt64Block(block) + return a + case *BoolValue: + a, _ := decodeBoolBlock(block) + return a + case *StringValue: + a, _ := decodeStringBlock(block) + return a + } + return nil +} + +// DecodeBlock takes a byte array and will decode into values of the appropriate type +// based on the block +func DecodeBlock(block []byte) (Values, error) { + if len(block) 
<= encodedBlockHeaderSize { + panic(fmt.Sprintf("decode of short block: got %v, exp %v", len(block), encodedBlockHeaderSize)) + } + + blockType := block[8] + switch blockType { + case BlockFloat64: + return decodeFloatBlock(block) + case BlockInt64: + return decodeInt64Block(block) + case BlockBool: + return decodeBoolBlock(block) + case BlockString: + return decodeStringBlock(block) + default: + panic(fmt.Sprintf("unknown block type: %d", blockType)) + } +} + +// Deduplicate returns a new Values slice with any values +// that have the same timestamp removed. The Value that appears +// last in the slice is the one that is kept. The returned slice is in ascending order +func (v Values) Deduplicate() Values { + m := make(map[int64]Value) + for _, val := range v { + m[val.UnixNano()] = val + } + + a := make([]Value, 0, len(m)) + for _, val := range m { + a = append(a, val) + } + sort.Sort(Values(a)) + + return a +} + +// Sort methods +func (a Values) Len() int { return len(a) } +func (a Values) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a Values) Less(i, j int) bool { return a[i].Time().UnixNano() < a[j].Time().UnixNano() } + +type FloatValue struct { + time time.Time + value float64 +} + +func (f *FloatValue) Time() time.Time { + return f.time +} + +func (f *FloatValue) UnixNano() int64 { + return f.time.UnixNano() +} + +func (f *FloatValue) Value() interface{} { + return f.value +} + +func (f *FloatValue) Size() int { + return 16 +} + +func encodeFloatBlock(buf []byte, values []Value) ([]byte, error) { + if len(values) == 0 { + return nil, nil + } + + // A float block is encoded using different compression strategies + // for timestamps and values. + + // Encode values using Gorilla float compression + venc := NewFloatEncoder() + + // Encode timestamps using an adaptive encoder that uses delta-encoding, + // frame-or-reference and run length encoding. 
+ tsenc := NewTimeEncoder() + + for _, v := range values { + tsenc.Write(v.Time()) + venc.Push(v.(*FloatValue).value) + } + venc.Finish() + + // Encoded timestamp values + tb, err := tsenc.Bytes() + if err != nil { + return nil, err + } + // Encoded float values + vb := venc.Bytes() + + // Prepend the first timestamp of the block in the first 8 bytes and the block + // in the next byte, followed by the block + block := packBlockHeader(values[0].Time(), BlockFloat64) + block = append(block, packBlock(tb, vb)...) + return block, nil +} + +func decodeFloatBlock(block []byte) ([]Value, error) { + // The first 8 bytes is the minimum timestamp of the block + block = block[8:] + + // Block type is the next block, make sure we actually have a float block + blockType := block[0] + if blockType != BlockFloat64 { + return nil, fmt.Errorf("invalid block type: exp %d, got %d", BlockFloat64, blockType) + } + block = block[1:] + + tb, vb := unpackBlock(block) + + // Setup our timestamp and value decoders + dec := NewTimeDecoder(tb) + iter, err := NewFloatDecoder(vb) + if err != nil { + return nil, err + } + + // Decode both a timestamp and value + var a []Value + for dec.Next() && iter.Next() { + ts := dec.Read() + v := iter.Values() + a = append(a, &FloatValue{ts, v}) + } + + // Did timestamp decoding have an error? + if dec.Error() != nil { + return nil, dec.Error() + } + // Did float decoding have an error? 
+ if iter.Error() != nil { + return nil, iter.Error() + } + + return a, nil +} + +type BoolValue struct { + time time.Time + value bool +} + +func (b *BoolValue) Time() time.Time { + return b.time +} + +func (b *BoolValue) Size() int { + return 9 +} + +func (b *BoolValue) UnixNano() int64 { + return b.time.UnixNano() +} + +func (b *BoolValue) Value() interface{} { + return b.value +} + +func encodeBoolBlock(buf []byte, values []Value) ([]byte, error) { + if len(values) == 0 { + return nil, nil + } + + // A bool block is encoded using different compression strategies + // for timestamps and values. + + // Encode values using Gorilla float compression + venc := NewBoolEncoder() + + // Encode timestamps using an adaptive encoder + tsenc := NewTimeEncoder() + + for _, v := range values { + tsenc.Write(v.Time()) + venc.Write(v.(*BoolValue).value) + } + + // Encoded timestamp values + tb, err := tsenc.Bytes() + if err != nil { + return nil, err + } + // Encoded float values + vb, err := venc.Bytes() + if err != nil { + return nil, err + } + + // Prepend the first timestamp of the block in the first 8 bytes and the block + // in the next byte, followed by the block + block := packBlockHeader(values[0].Time(), BlockBool) + block = append(block, packBlock(tb, vb)...) 
+ return block, nil +} + +func decodeBoolBlock(block []byte) ([]Value, error) { + // The first 8 bytes is the minimum timestamp of the block + block = block[8:] + + // Block type is the next block, make sure we actually have a float block + blockType := block[0] + if blockType != BlockBool { + return nil, fmt.Errorf("invalid block type: exp %d, got %d", BlockBool, blockType) + } + block = block[1:] + + tb, vb := unpackBlock(block) + + // Setup our timestamp and value decoders + dec := NewTimeDecoder(tb) + vdec := NewBoolDecoder(vb) + + // Decode both a timestamp and value + var a []Value + for dec.Next() && vdec.Next() { + ts := dec.Read() + v := vdec.Read() + a = append(a, &BoolValue{ts, v}) + } + + // Did timestamp decoding have an error? + if dec.Error() != nil { + return nil, dec.Error() + } + // Did bool decoding have an error? + if vdec.Error() != nil { + return nil, vdec.Error() + } + + return a, nil +} + +type Int64Value struct { + time time.Time + value int64 +} + +func (v *Int64Value) Time() time.Time { + return v.time +} + +func (v *Int64Value) Value() interface{} { + return v.value +} + +func (f *Int64Value) UnixNano() int64 { + return f.time.UnixNano() +} + +func (v *Int64Value) Size() int { + return 16 +} + +func (v *Int64Value) String() string { return fmt.Sprintf("%v", v.value) } + +func encodeInt64Block(buf []byte, values []Value) ([]byte, error) { + tsEnc := NewTimeEncoder() + vEnc := NewInt64Encoder() + for _, v := range values { + tsEnc.Write(v.Time()) + vEnc.Write(v.(*Int64Value).value) + } + + // Encoded timestamp values + tb, err := tsEnc.Bytes() + if err != nil { + return nil, err + } + // Encoded int64 values + vb, err := vEnc.Bytes() + if err != nil { + return nil, err + } + + // Prepend the first timestamp of the block in the first 8 bytes + block := packBlockHeader(values[0].Time(), BlockInt64) + return append(block, packBlock(tb, vb)...), nil +} + +func decodeInt64Block(block []byte) ([]Value, error) { + // slice off the first 8 bytes 
(min timestmap for the block) + block = block[8:] + + blockType := block[0] + if blockType != BlockInt64 { + return nil, fmt.Errorf("invalid block type: exp %d, got %d", BlockInt64, blockType) + } + + block = block[1:] + + // The first 8 bytes is the minimum timestamp of the block + tb, vb := unpackBlock(block) + + // Setup our timestamp and value decoders + tsDec := NewTimeDecoder(tb) + vDec := NewInt64Decoder(vb) + + // Decode both a timestamp and value + var a []Value + for tsDec.Next() && vDec.Next() { + ts := tsDec.Read() + v := vDec.Read() + a = append(a, &Int64Value{ts, v}) + } + + // Did timestamp decoding have an error? + if tsDec.Error() != nil { + return nil, tsDec.Error() + } + // Did int64 decoding have an error? + if vDec.Error() != nil { + return nil, vDec.Error() + } + + return a, nil +} + +type StringValue struct { + time time.Time + value string +} + +func (v *StringValue) Time() time.Time { + return v.time +} + +func (v *StringValue) Value() interface{} { + return v.value +} + +func (v *StringValue) UnixNano() int64 { + return v.time.UnixNano() +} + +func (v *StringValue) Size() int { + return 8 + len(v.value) +} + +func (v *StringValue) String() string { return v.value } + +func encodeStringBlock(buf []byte, values []Value) ([]byte, error) { + tsEnc := NewTimeEncoder() + vEnc := NewStringEncoder() + for _, v := range values { + tsEnc.Write(v.Time()) + vEnc.Write(v.(*StringValue).value) + } + + // Encoded timestamp values + tb, err := tsEnc.Bytes() + if err != nil { + return nil, err + } + // Encoded string values + vb, err := vEnc.Bytes() + if err != nil { + return nil, err + } + + // Prepend the first timestamp of the block in the first 8 bytes + block := packBlockHeader(values[0].Time(), BlockString) + return append(block, packBlock(tb, vb)...), nil +} + +func decodeStringBlock(block []byte) ([]Value, error) { + // slice off the first 8 bytes (min timestmap for the block) + block = block[8:] + + blockType := block[0] + if blockType != 
BlockString { + return nil, fmt.Errorf("invalid block type: exp %d, got %d", BlockString, blockType) + } + + block = block[1:] + + // The first 8 bytes is the minimum timestamp of the block + tb, vb := unpackBlock(block) + + // Setup our timestamp and value decoders + tsDec := NewTimeDecoder(tb) + vDec, err := NewStringDecoder(vb) + if err != nil { + return nil, err + } + + // Decode both a timestamp and value + var a []Value + for tsDec.Next() && vDec.Next() { + ts := tsDec.Read() + v := vDec.Read() + a = append(a, &StringValue{ts, v}) + } + + // Did timestamp decoding have an error? + if tsDec.Error() != nil { + return nil, tsDec.Error() + } + // Did string decoding have an error? + if vDec.Error() != nil { + return nil, vDec.Error() + } + + return a, nil +} + +func packBlockHeader(firstTime time.Time, blockType byte) []byte { + return append(u64tob(uint64(firstTime.UnixNano())), blockType) +} + +func packBlock(ts []byte, values []byte) []byte { + // We encode the length of the timestamp block using a variable byte encoding. + // This allows small byte slices to take up 1 byte while larger ones use 2 or more. + b := make([]byte, 10) + i := binary.PutUvarint(b, uint64(len(ts))) + + // block is , , + block := append(b[:i], ts...) + + // We don't encode the value length because we know it's the rest of the block after + // the timestamp block. + return append(block, values...) +} + +func unpackBlock(buf []byte) (ts, values []byte) { + // Unpack the timestamp block length + tsLen, i := binary.Uvarint(buf) + + // Unpack the timestamp bytes + ts = buf[int(i) : int(i)+int(tsLen)] + + // Unpack the value bytes + values = buf[int(i)+int(tsLen):] + return +} + +// ZigZagEncode converts a int64 to a uint64 by zig zagging negative and positive values +// across even and odd numbers. Eg. 
[0,-1,1,-2] becomes [0, 1, 2, 3] +func ZigZagEncode(x int64) uint64 { + return uint64(uint64(x<<1) ^ uint64((int64(x) >> 63))) +} + +// ZigZagDecode converts a previously zigzag encoded uint64 back to a int64 +func ZigZagDecode(v uint64) int64 { + return int64((v >> 1) ^ uint64((int64(v&1)<<63)>>63)) +} diff --git a/tsdb/engine/tsm1/encoding_test.go b/tsdb/engine/tsm1/encoding_test.go new file mode 100644 index 00000000000..309b947eb60 --- /dev/null +++ b/tsdb/engine/tsm1/encoding_test.go @@ -0,0 +1,158 @@ +package tsm1_test + +import ( + // "math/rand" + + "fmt" + "reflect" + "testing" + "time" + + "github.com/influxdb/influxdb/tsdb/engine/tsm1" +) + +func TestEncoding_FloatBlock(t *testing.T) { + valueCount := 1000 + times := getTimes(valueCount, 60, time.Second) + values := make(tsm1.Values, len(times)) + for i, t := range times { + values[i] = tsm1.NewValue(t, float64(i)) + } + + b, err := values.Encode(nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + decodedValues := values.DecodeSameTypeBlock(b) + + if !reflect.DeepEqual(decodedValues, values) { + t.Fatalf("unexpected results:\n\tgot: %v\n\texp: %v\n", decodedValues, values) + } +} + +func TestEncoding_FloatBlock_ZeroTime(t *testing.T) { + values := make(tsm1.Values, 3) + for i := 0; i < 3; i++ { + values[i] = tsm1.NewValue(time.Unix(0, 0), float64(i)) + } + + b, err := values.Encode(nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + decodedValues := values.DecodeSameTypeBlock(b) + + if !reflect.DeepEqual(decodedValues, values) { + t.Fatalf("unexpected results:\n\tgot: %v\n\texp: %v\n", decodedValues, values) + } +} + +func TestEncoding_IntBlock_Basic(t *testing.T) { + valueCount := 1000 + times := getTimes(valueCount, 60, time.Second) + values := make(tsm1.Values, len(times)) + for i, t := range times { + values[i] = tsm1.NewValue(t, int64(i)) + } + + b, err := values.Encode(nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + decodedValues := 
values.DecodeSameTypeBlock(b) + + if len(decodedValues) != len(values) { + t.Fatalf("unexpected results length:\n\tgot: %v\n\texp: %v\n", len(decodedValues), len(values)) + } + + for i := 0; i < len(decodedValues); i++ { + + if decodedValues[i].Time() != values[i].Time() { + t.Fatalf("unexpected results:\n\tgot: %v\n\texp: %v\n", decodedValues[i].Time(), values[i].Time()) + } + + if decodedValues[i].Value() != values[i].Value() { + t.Fatalf("unexpected results:\n\tgot: %v\n\texp: %v\n", decodedValues[i].Value(), values[i].Value()) + } + } +} + +func TestEncoding_IntBlock_Negatives(t *testing.T) { + valueCount := 1000 + times := getTimes(valueCount, 60, time.Second) + values := make(tsm1.Values, len(times)) + for i, t := range times { + v := int64(i) + if i%2 == 0 { + v = -v + } + values[i] = tsm1.NewValue(t, int64(v)) + } + + b, err := values.Encode(nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + decodedValues := values.DecodeSameTypeBlock(b) + + if !reflect.DeepEqual(decodedValues, values) { + t.Fatalf("unexpected results:\n\tgot: %v\n\texp: %v\n", decodedValues, values) + } +} + +func TestEncoding_BoolBlock_Basic(t *testing.T) { + valueCount := 1000 + times := getTimes(valueCount, 60, time.Second) + values := make(tsm1.Values, len(times)) + for i, t := range times { + v := true + if i%2 == 0 { + v = false + } + values[i] = tsm1.NewValue(t, v) + } + + b, err := values.Encode(nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + decodedValues := values.DecodeSameTypeBlock(b) + + if !reflect.DeepEqual(decodedValues, values) { + t.Fatalf("unexpected results:\n\tgot: %v\n\texp: %v\n", decodedValues, values) + } +} + +func TestEncoding_StringBlock_Basic(t *testing.T) { + valueCount := 1000 + times := getTimes(valueCount, 60, time.Second) + values := make(tsm1.Values, len(times)) + for i, t := range times { + values[i] = tsm1.NewValue(t, fmt.Sprintf("value %d", i)) + } + + b, err := values.Encode(nil) + if err != nil { + 
t.Fatalf("unexpected error: %v", err) + } + + decodedValues := values.DecodeSameTypeBlock(b) + + if !reflect.DeepEqual(decodedValues, values) { + t.Fatalf("unexpected results:\n\tgot: %v\n\texp: %v\n", decodedValues, values) + } +} + +func getTimes(n, step int, precision time.Duration) []time.Time { + t := time.Now().Round(precision) + a := make([]time.Time, n) + for i := 0; i < n; i++ { + a[i] = t.Add(time.Duration(i*60) * precision) + } + return a +} diff --git a/tsdb/engine/tsm1/float.go b/tsdb/engine/tsm1/float.go new file mode 100644 index 00000000000..8961c70f4ff --- /dev/null +++ b/tsdb/engine/tsm1/float.go @@ -0,0 +1,210 @@ +package tsm1 + +/* +This code is originally from: https://github.com/dgryski/go-tsz and has been modified to remove +the timestamp compression fuctionality. + +It implements the float compression as presented in: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf. +This implementation uses a sentinel value of NaN which means that float64 NaN cannot be stored using +this version. +*/ + +import ( + "bytes" + "math" + + "github.com/dgryski/go-bits" + "github.com/dgryski/go-bitstream" +) + +const ( + // floatUncompressed is an uncompressed format using 8 bytes per value + floatUncompressed = 0 + // floatCompressedGorilla is a compressed format using the gorilla paper encoding + floatCompressedGorilla = 1 +) + +// FloatEncoder encodes multiple float64s into a byte slice +type FloatEncoder struct { + val float64 + + leading uint64 + trailing uint64 + + buf bytes.Buffer + bw *bitstream.BitWriter + + first bool + finished bool +} + +func NewFloatEncoder() *FloatEncoder { + s := FloatEncoder{ + first: true, + leading: ^uint64(0), + } + + s.bw = bitstream.NewWriter(&s.buf) + + return &s + +} + +func (s *FloatEncoder) Bytes() []byte { + return append([]byte{floatCompressedGorilla << 4}, s.buf.Bytes()...) 
+} + +func (s *FloatEncoder) Finish() { + if !s.finished { + // // write an end-of-stream record + s.Push(math.NaN()) + s.bw.Flush(bitstream.Zero) + s.finished = true + } +} + +func (s *FloatEncoder) Push(v float64) { + if s.first { + // first point + s.val = v + s.first = false + s.bw.WriteBits(math.Float64bits(v), 64) + return + } + + vDelta := math.Float64bits(v) ^ math.Float64bits(s.val) + + if vDelta == 0 { + s.bw.WriteBit(bitstream.Zero) + } else { + s.bw.WriteBit(bitstream.One) + + leading := bits.Clz(vDelta) + trailing := bits.Ctz(vDelta) + + // TODO(dgryski): check if it's 'cheaper' to reset the leading/trailing bits instead + if s.leading != ^uint64(0) && leading >= s.leading && trailing >= s.trailing { + s.bw.WriteBit(bitstream.Zero) + s.bw.WriteBits(vDelta>>s.trailing, 64-int(s.leading)-int(s.trailing)) + } else { + s.leading, s.trailing = leading, trailing + + s.bw.WriteBit(bitstream.One) + s.bw.WriteBits(leading, 5) + + sigbits := 64 - leading - trailing + s.bw.WriteBits(sigbits, 6) + s.bw.WriteBits(vDelta>>trailing, int(sigbits)) + } + } + + s.val = v +} + +// FloatDecoder decodes a byte slice into multipe float64 values +type FloatDecoder struct { + val float64 + + leading uint64 + trailing uint64 + + br *bitstream.BitReader + + b []byte + + first bool + finished bool + + err error +} + +func NewFloatDecoder(b []byte) (*FloatDecoder, error) { + // first byte is the compression type but we currently just have gorilla + // compression + br := bitstream.NewReader(bytes.NewReader(b[1:])) + + v, err := br.ReadBits(64) + if err != nil { + return nil, err + } + + return &FloatDecoder{ + val: math.Float64frombits(v), + first: true, + br: br, + b: b, + }, nil +} + +func (it *FloatDecoder) Next() bool { + if it.err != nil || it.finished { + return false + } + + if it.first { + it.first = false + return true + } + + // read compressed value + bit, err := it.br.ReadBit() + if err != nil { + it.err = err + return false + } + + if bit == bitstream.Zero { + // 
it.val = it.val + } else { + bit, err := it.br.ReadBit() + if err != nil { + it.err = err + return false + } + if bit == bitstream.Zero { + // reuse leading/trailing zero bits + // it.leading, it.trailing = it.leading, it.trailing + } else { + bits, err := it.br.ReadBits(5) + if err != nil { + it.err = err + return false + } + it.leading = bits + + bits, err = it.br.ReadBits(6) + if err != nil { + it.err = err + return false + } + mbits := bits + it.trailing = 64 - it.leading - mbits + } + + mbits := int(64 - it.leading - it.trailing) + bits, err := it.br.ReadBits(mbits) + if err != nil { + it.err = err + return false + } + vbits := math.Float64bits(it.val) + vbits ^= (bits << it.trailing) + + val := math.Float64frombits(vbits) + if math.IsNaN(val) { + it.finished = true + return false + } + it.val = val + } + + return true +} + +func (it *FloatDecoder) Values() float64 { + return it.val +} + +func (it *FloatDecoder) Error() error { + return it.err +} diff --git a/tsdb/engine/tsm1/float_test.go b/tsdb/engine/tsm1/float_test.go new file mode 100644 index 00000000000..794d62e5b7b --- /dev/null +++ b/tsdb/engine/tsm1/float_test.go @@ -0,0 +1,165 @@ +package tsm1_test + +import ( + "testing" + + "github.com/influxdb/influxdb/tsdb/engine/tsm1" +) + +func TestFloatEncoder_Simple(t *testing.T) { + + // Example from the paper + s := tsm1.NewFloatEncoder() + + s.Push(12) + s.Push(12) + s.Push(24) + + // extra tests + + // floating point masking/shifting bug + s.Push(13) + s.Push(24) + + // delta-of-delta sizes + s.Push(24) + s.Push(24) + s.Push(24) + + s.Finish() + + b := s.Bytes() + + it, err := tsm1.NewFloatDecoder(b) + if err != nil { + t.Fatalf("unexpected error creating float decoder: %v", err) + } + + want := []float64{ + 12, + 12, + 24, + + 13, + 24, + + 24, + 24, + 24, + } + + for _, w := range want { + if !it.Next() { + t.Fatalf("Next()=false, want true") + } + vv := it.Values() + if w != vv { + t.Errorf("Values()=(%v), want (%v)\n", vv, w) + } + } + + if it.Next() 
{ + t.Fatalf("Next()=true, want false") + } + + if err := it.Error(); err != nil { + t.Errorf("it.Error()=%v, want nil", err) + } +} + +var TwoHoursData = []struct { + v float64 +}{ + // 2h of data + {761}, {727}, {763}, {706}, {700}, + {679}, {757}, {708}, {739}, {707}, + {699}, {740}, {729}, {766}, {730}, + {715}, {705}, {693}, {765}, {724}, + {799}, {761}, {737}, {766}, {756}, + {719}, {722}, {801}, {747}, {731}, + {742}, {744}, {791}, {750}, {759}, + {809}, {751}, {705}, {770}, {792}, + {727}, {762}, {772}, {721}, {748}, + {753}, {744}, {716}, {776}, {659}, + {789}, {766}, {758}, {690}, {795}, + {770}, {758}, {723}, {767}, {765}, + {693}, {706}, {681}, {727}, {724}, + {780}, {678}, {696}, {758}, {740}, + {735}, {700}, {742}, {747}, {752}, + {734}, {743}, {732}, {746}, {770}, + {780}, {710}, {731}, {712}, {712}, + {741}, {770}, {770}, {754}, {718}, + {670}, {775}, {749}, {795}, {756}, + {741}, {787}, {721}, {745}, {782}, + {765}, {780}, {811}, {790}, {836}, + {743}, {858}, {739}, {762}, {770}, + {752}, {763}, {795}, {792}, {746}, + {786}, {785}, {774}, {786}, {718}, +} + +func TestFloatEncoder_Roundtrip(t *testing.T) { + + s := tsm1.NewFloatEncoder() + for _, p := range TwoHoursData { + s.Push(p.v) + } + s.Finish() + + b := s.Bytes() + + it, err := tsm1.NewFloatDecoder(b) + if err != nil { + t.Fatalf("unexpected error creating float decoder: %v", err) + } + + for _, w := range TwoHoursData { + if !it.Next() { + t.Fatalf("Next()=false, want true") + } + vv := it.Values() + // t.Logf("it.Values()=(%+v, %+v)\n", time.Unix(int64(tt), 0), vv) + if w.v != vv { + t.Errorf("Values()=(%v), want (%v)\n", vv, w.v) + } + } + + if it.Next() { + t.Fatalf("Next()=true, want false") + } + + if err := it.Error(); err != nil { + t.Errorf("it.Error()=%v, want nil", err) + } +} + +func BenchmarkFloatEncoder(b *testing.B) { + for i := 0; i < b.N; i++ { + s := tsm1.NewFloatEncoder() + for _, tt := range TwoHoursData { + s.Push(tt.v) + } + s.Finish() + } +} + +func 
BenchmarkFloatDecoder(b *testing.B) {
+	s := tsm1.NewFloatEncoder()
+	for _, tt := range TwoHoursData {
+		s.Push(tt.v)
+	}
+	s.Finish()
+	bytes := s.Bytes()
+
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		it, err := tsm1.NewFloatDecoder(bytes)
+		if err != nil {
+			b.Fatalf("unexpected error creating float decoder: %v", err)
+		}
+
+		for j := 0; j < len(TwoHoursData); it.Next() {
+			j++
+		}
+	}
+}
diff --git a/tsdb/engine/tsm1/int.go b/tsdb/engine/tsm1/int.go
new file mode 100644
index 00000000000..9ce18fe96e3
--- /dev/null
+++ b/tsdb/engine/tsm1/int.go
@@ -0,0 +1,180 @@
+package tsm1
+
+// Int64 encoding uses two different strategies depending on the range of values in
+// the uncompressed data. Encoded values are first encoded using zig zag encoding.
+// This interleaves positive and negative integers across a range of positive integers.
+//
+// For example, [-2,-1,0,1] becomes [3,1,0,2]. See
+// https://developers.google.com/protocol-buffers/docs/encoding?hl=en#signed-integers
+// for more information.
+//
+// If all the zig zag encoded values are less than 1 << 60 - 1, they are compressed using
+// simple8b encoding. If any value is larger than 1 << 60 - 1, the values are stored uncompressed.
+//
+// Each encoded byte slice contains a 1 byte header followed by multiple 8 byte packed integers
+// or 8 byte uncompressed integers. The 4 high bits of the first byte indicate the encoding type
+// for the remaining bytes.
+//
+// There are currently two encoding types that can be used with room for 16 total. These additional
+// encoding slots are reserved for future use. One improvement to be made is to use a patched
+// encoding such as PFOR if only a small number of values exceed the max compressed value range. This
+// should improve compression ratios with very large integers near the ends of the int64 range.
+ +import ( + "encoding/binary" + "fmt" + + "github.com/jwilder/encoding/simple8b" +) + +const ( + // intUncompressed is an uncompressed format using 8 bytes per point + intUncompressed = 0 + // intCompressedSimple is a bit-packed format using simple8b encoding + intCompressedSimple = 1 +) + +// Int64Encoder encoders int64 into byte slices +type Int64Encoder interface { + Write(v int64) + Bytes() ([]byte, error) +} + +// Int64Decoder decodes a byte slice into int64s +type Int64Decoder interface { + Next() bool + Read() int64 + Error() error +} + +type int64Encoder struct { + values []uint64 +} + +func NewInt64Encoder() Int64Encoder { + return &int64Encoder{} +} + +func (e *int64Encoder) Write(v int64) { + e.values = append(e.values, ZigZagEncode(v)) +} + +func (e *int64Encoder) Bytes() ([]byte, error) { + for _, v := range e.values { + // Value is too large to encode using packed format + if v > simple8b.MaxValue { + return e.encodeUncompressed() + } + } + + return e.encodePacked() +} + +func (e *int64Encoder) encodePacked() ([]byte, error) { + encoded, err := simple8b.EncodeAll(e.values) + if err != nil { + return nil, err + } + + b := make([]byte, 1+len(encoded)*8) + // 4 high bits of first byte store the encoding type for the block + b[0] = byte(intCompressedSimple) << 4 + + for i, v := range encoded { + binary.BigEndian.PutUint64(b[1+i*8:1+i*8+8], v) + } + return b, nil +} + +func (e *int64Encoder) encodeUncompressed() ([]byte, error) { + b := make([]byte, 1+len(e.values)*8) + // 4 high bits of first byte store the encoding type for the block + b[0] = byte(intUncompressed) << 4 + + for i, v := range e.values { + binary.BigEndian.PutUint64(b[1+i*8:1+i*8+8], v) + } + return b, nil +} + +type int64Decoder struct { + values []uint64 + bytes []byte + i int + n int + + encoding byte + err error +} + +func NewInt64Decoder(b []byte) Int64Decoder { + d := &int64Decoder{ + // 240 is the maximum number of values that can be encoded into a single uint64 using simple8b + 
values: make([]uint64, 240), + } + + d.SetBytes(b) + return d +} + +func (d *int64Decoder) SetBytes(b []byte) { + if len(b) > 0 { + d.encoding = b[0] >> 4 + d.bytes = b[1:] + } + d.i = 0 + d.n = 0 +} + +func (d *int64Decoder) Next() bool { + if d.i >= d.n && len(d.bytes) == 0 { + return false + } + + d.i += 1 + + if d.i >= d.n { + switch d.encoding { + case intUncompressed: + d.decodeUncompressed() + case intCompressedSimple: + d.decodePacked() + default: + d.err = fmt.Errorf("unknown encoding %v", d.encoding) + } + } + return d.i < d.n +} + +func (d *int64Decoder) Error() error { + return d.err +} + +func (d *int64Decoder) Read() int64 { + return ZigZagDecode(d.values[d.i]) +} + +func (d *int64Decoder) decodePacked() { + if len(d.bytes) == 0 { + return + } + + v := binary.BigEndian.Uint64(d.bytes[0:8]) + n, err := simple8b.Decode(d.values, v) + if err != nil { + // Should never happen, only error that could be returned is if the the value to be decoded was not + // actually encoded by simple8b encoder. 
+ d.err = fmt.Errorf("failed to decode value %v: %v", v, err) + } + + d.n = n + d.i = 0 + d.bytes = d.bytes[8:] +} + +func (d *int64Decoder) decodeUncompressed() { + d.values[0] = binary.BigEndian.Uint64(d.bytes[0:8]) + d.i = 0 + d.n = 1 + d.bytes = d.bytes[8:] +} diff --git a/tsdb/engine/tsm1/int_test.go b/tsdb/engine/tsm1/int_test.go new file mode 100644 index 00000000000..279b55e49bf --- /dev/null +++ b/tsdb/engine/tsm1/int_test.go @@ -0,0 +1,249 @@ +package tsm1_test + +import ( + "math" + "testing" + + "github.com/influxdb/influxdb/tsdb/engine/tsm1" +) + +func Test_Int64Encoder_NoValues(t *testing.T) { + enc := tsm1.NewInt64Encoder() + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := tsm1.NewInt64Decoder(b) + if dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } +} + +func Test_Int64Encoder_One(t *testing.T) { + enc := tsm1.NewInt64Encoder() + v1 := int64(1) + + enc.Write(1) + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := tsm1.NewInt64Decoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1) + } +} + +func Test_Int64Encoder_Two(t *testing.T) { + enc := tsm1.NewInt64Encoder() + var v1, v2 int64 = 1, 2 + + enc.Write(v1) + enc.Write(v2) + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := tsm1.NewInt64Decoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v2 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v2) + } +} + +func Test_Int64Encoder_Negative(t *testing.T) { + enc := tsm1.NewInt64Encoder() + var v1, v2, v3 int64 = -2, 0, 1 + 
+ enc.Write(v1) + enc.Write(v2) + enc.Write(v3) + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := tsm1.NewInt64Decoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v2 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v2) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v3 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v3) + } +} + +func Test_Int64Encoder_Large_Range(t *testing.T) { + enc := tsm1.NewInt64Encoder() + var v1, v2 int64 = math.MinInt64, math.MaxInt64 + enc.Write(v1) + enc.Write(v2) + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := tsm1.NewInt64Decoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v2 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v2) + } +} + +func Test_Int64Encoder_Uncompressed(t *testing.T) { + enc := tsm1.NewInt64Encoder() + var v1, v2, v3 int64 = 0, 1, 1 << 60 + + enc.Write(v1) + enc.Write(v2) + enc.Write(v3) + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("expected error: %v", err) + } + + // 1 byte header + 3 * 8 byte values + if exp := 25; len(b) != exp { + t.Fatalf("length mismatch: got %v, exp %v", len(b), exp) + } + + dec := tsm1.NewInt64Decoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1) + } + + if !dec.Next() { + 
t.Fatalf("unexpected next value: got true, exp false") + } + + if v2 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v2) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if v3 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v3) + } +} + +func Test_Int64Encoder_AllNegative(t *testing.T) { + enc := tsm1.NewInt64Encoder() + values := []int64{ + -10, -5, -1, + } + + for _, v := range values { + enc.Write(v) + } + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := tsm1.NewInt64Decoder(b) + i := 0 + for dec.Next() { + if i > len(values) { + t.Fatalf("read too many values: got %v, exp %v", i, len(values)) + } + + if values[i] != dec.Read() { + t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), values[i]) + } + i += 1 + } +} + +func BenchmarkInt64Encoder(b *testing.B) { + enc := tsm1.NewInt64Encoder() + x := make([]int64, 1024) + for i := 0; i < len(x); i++ { + x[i] = int64(i) + enc.Write(x[i]) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + enc.Bytes() + } +} + +type byteSetter interface { + SetBytes(b []byte) +} + +func BenchmarkInt64Decoder(b *testing.B) { + x := make([]int64, 1024) + enc := tsm1.NewInt64Encoder() + for i := 0; i < len(x); i++ { + x[i] = int64(i) + enc.Write(x[i]) + } + bytes, _ := enc.Bytes() + + b.ResetTimer() + + dec := tsm1.NewInt64Decoder(bytes) + + for i := 0; i < b.N; i++ { + dec.(byteSetter).SetBytes(bytes) + for dec.Next() { + } + } +} diff --git a/tsdb/engine/tsm1/string.go b/tsdb/engine/tsm1/string.go new file mode 100644 index 00000000000..da06bc53599 --- /dev/null +++ b/tsdb/engine/tsm1/string.go @@ -0,0 +1,94 @@ +package tsm1 + +// String encoding uses snappy compression to compress each string. Each string is +// appended to byte slice prefixed with a variable byte length followed by the string +// bytes. 
The bytes are compressed using snappy compressor and a 1 byte header is used
+// to indicate the type of encoding.
+
+import (
+	"encoding/binary"
+	"fmt"
+
+	"github.com/golang/snappy"
+)
+
+const (
+	// stringUncompressed is an uncompressed format encoding strings as raw bytes
+	stringUncompressed = 0
+	// stringCompressedSnappy is a compressed encoding using Snappy compression
+	stringCompressedSnappy = 1
+)
+
+type StringEncoder interface {
+	Write(s string)
+	Bytes() ([]byte, error)
+}
+
+type StringDecoder interface {
+	Next() bool
+	Read() string
+	Error() error
+}
+
+type stringEncoder struct {
+	// The encoded bytes
+	bytes []byte
+}
+
+func NewStringEncoder() StringEncoder {
+	return &stringEncoder{}
+}
+
+func (e *stringEncoder) Write(s string) {
+	b := make([]byte, 10)
+	// Append the length of the string using variable byte encoding
+	i := binary.PutUvarint(b, uint64(len(s)))
+	e.bytes = append(e.bytes, b[:i]...)
+
+	// Append the string bytes
+	e.bytes = append(e.bytes, s...)
+}
+
+func (e *stringEncoder) Bytes() ([]byte, error) {
+	// Compress the currently appended bytes using snappy and prefix with
+	// a 1 byte header for future extension
+	data := snappy.Encode(nil, e.bytes)
+	return append([]byte{stringCompressedSnappy << 4}, data...), nil
+}
+
+type stringDecoder struct {
+	b   []byte
+	l   int
+	i   int
+	err error
+}
+
+func NewStringDecoder(b []byte) (StringDecoder, error) {
+	// First byte stores the encoding type, only have snappy format
+	// currently so ignore for now.
+ data, err := snappy.Decode(nil, b[1:]) + if err != nil { + return nil, fmt.Errorf("failed to decode string block: %v", err.Error()) + } + + return &stringDecoder{b: data}, nil +} + +func (e *stringDecoder) Next() bool { + e.i += e.l + return e.i < len(e.b) +} + +func (e *stringDecoder) Read() string { + // Read the length of the string + length, n := binary.Uvarint(e.b[e.i:]) + + // The length of this string plus the length of the variable byte encoded length + e.l = int(length) + n + + return string(e.b[e.i+n : e.i+n+int(length)]) +} + +func (e *stringDecoder) Error() error { + return e.err +} diff --git a/tsdb/engine/tsm1/string_test.go b/tsdb/engine/tsm1/string_test.go new file mode 100644 index 00000000000..f5143514ecc --- /dev/null +++ b/tsdb/engine/tsm1/string_test.go @@ -0,0 +1,85 @@ +package tsm1 + +import ( + "fmt" + "testing" +) + +func Test_StringEncoder_NoValues(t *testing.T) { + enc := NewStringEncoder() + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec, err := NewStringDecoder(b) + if err != nil { + t.Fatalf("unexpected erorr creating string decoder: %v", err) + } + if dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } +} + +func Test_StringEncoder_Single(t *testing.T) { + enc := NewStringEncoder() + v1 := "v1" + enc.Write(v1) + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec, err := NewStringDecoder(b) + if err != nil { + t.Fatalf("unexpected erorr creating string decoder: %v", err) + } + if !dec.Next() { + t.Fatalf("unexpected next value: got false, exp true") + } + + if v1 != dec.Read() { + t.Fatalf("unexpected value: got %v, exp %v", dec.Read(), v1) + } +} + +func Test_StringEncoder_Multi_Compressed(t *testing.T) { + enc := NewStringEncoder() + + values := make([]string, 10) + for i := range values { + values[i] = fmt.Sprintf("value %d", i) + enc.Write(values[i]) + } + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected 
error: %v", err) + } + + if b[0]>>4 != stringCompressedSnappy { + t.Fatalf("unexpected encoding: got %v, exp %v", b[0], stringCompressedSnappy) + } + + if exp := 47; len(b) != exp { + t.Fatalf("unexpected length: got %v, exp %v", len(b), exp) + } + + dec, err := NewStringDecoder(b) + if err != nil { + t.Fatalf("unexpected erorr creating string decoder: %v", err) + } + + for i, v := range values { + if !dec.Next() { + t.Fatalf("unexpected next value: got false, exp true") + } + if v != dec.Read() { + t.Fatalf("unexpected value at pos %d: got %v, exp %v", i, dec.Read(), v) + } + } + + if dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } +} diff --git a/tsdb/engine/tsm1/timestamp.go b/tsdb/engine/tsm1/timestamp.go new file mode 100644 index 00000000000..ad7ed644196 --- /dev/null +++ b/tsdb/engine/tsm1/timestamp.go @@ -0,0 +1,309 @@ +package tsm1 + +// Timestamp encoding is adaptive and based on structure of the timestamps that are encoded. It +// uses a combination of delta encoding, scaling and compression using simple8b, run length encoding +// as well as falling back to no compression if needed. +// +// Timestamp values to be encoded should be sorted before encoding. When encoded, the values are +// first delta-encoded. The first value is the starting timestamp, subsequent values are the difference. +// from the prior value. +// +// Timestamp resolution can also be in the nanosecond. Many timestamps are monotonically increasing +// and fall on even boundaries of time such as every 10s. When the timestamps have this structure, +// they are scaled by the largest common divisor that is also a factor of 10. This has the effect +// of converting very large integer deltas into very small one that can be reversed by multiplying them +// by the scaling factor. +// +// Using these adjusted values, if all the deltas are the same, the time range is stored using run +// length encoding. 
If run length encoding is not possible and all values are less than 1 << 60 - 1 +// (~36.5 yrs in nanosecond resolution), then the timestamps are encoded using simple8b encoding. If +// any value exceeds the maximum values, the deltas are stored uncompressed using 8b each. +// +// Each compressed byte slice has a 1 byte header indicating the compression type. The 4 high bits +// indicated the encoding type. The 4 low bits are used by the encoding type. +// +// For run-length encoding, the 4 low bits store the log10 of the scaling factor. The next 8 bytes are +// the starting timestamp, next 1-10 bytes is the delta value using variable-length encoding, finally the +// next 1-10 bytes is the count of values. +// +// For simple8b encoding, the 4 low bits store the log10 of the scaling factor. The next 8 bytes is the +// first delta value stored uncompressed, the remaining bytes are 64bit words containg compressed delta +// values. +// +// For uncompressed encoding, the delta values are stored using 8 bytes each. + +import ( + "encoding/binary" + "fmt" + "math" + "time" + + "github.com/jwilder/encoding/simple8b" +) + +const ( + // timeUncompressed is a an uncompressed format using 8 bytes per timestamp + timeUncompressed = 0 + // timeCompressedPackedSimple is a bit-packed format using simple8b encoding + timeCompressedPackedSimple = 1 + // timeCompressedRLE is a run-length encoding format + timeCompressedRLE = 2 +) + +// TimeEncoder encodes time.Time to byte slices. +type TimeEncoder interface { + Write(t time.Time) + Bytes() ([]byte, error) +} + +// TimeEncoder decodes byte slices to time.Time values. +type TimeDecoder interface { + Next() bool + Read() time.Time + Error() error +} + +type encoder struct { + ts []uint64 +} + +// NewTimeEncoder returns a TimeEncoder +func NewTimeEncoder() TimeEncoder { + return &encoder{} +} + +// Write adds a time.Time to the compressed stream. 
+func (e *encoder) Write(t time.Time) { + e.ts = append(e.ts, uint64(t.UnixNano())) +} + +func (e *encoder) reduce() (max, divisor uint64, rle bool, deltas []uint64) { + // Compute the deltas in place to avoid allocating another slice + deltas = e.ts + // Starting values for a max and divisor + max, divisor = 0, 1e12 + + // Indicates whether the the deltas can be run-length encoded + rle = true + + // Iterate in reverse so we can apply deltas in place + for i := len(deltas) - 1; i > 0; i-- { + + // First differential encode the values + deltas[i] = deltas[i] - deltas[i-1] + + // We also need to keep track of the max value and largest common divisor + v := deltas[i] + + if v > max { + max = v + } + + for { + // If our value is divisible by 10, break. Otherwise, try the next smallest divisor. + if v%divisor == 0 { + break + } + divisor /= 10 + } + + // Skip the first value || see if prev = curr. The deltas can be RLE if the are all equal. + rle = i == len(deltas)-1 || rle && (deltas[i+1] == deltas[i]) + } + return +} + +// Bytes returns the encoded bytes of all written times. +func (e *encoder) Bytes() ([]byte, error) { + if len(e.ts) == 0 { + return []byte{}, nil + } + + // Maximum and largest common divisor. rle is true if dts (the delta timestamps), + // are all the same. 
+ max, div, rle, dts := e.reduce() + + // The deltas are all the same, so we can run-length encode them + if rle && len(e.ts) > 60 { + return e.encodeRLE(e.ts[0], e.ts[1], div, len(e.ts)) + } + + // We can't compress this time-range, the deltas exceed 1 << 60 + if max > simple8b.MaxValue { + return e.encodeRaw() + } + + return e.encodePacked(div, dts) +} + +func (e *encoder) encodePacked(div uint64, dts []uint64) ([]byte, error) { + enc := simple8b.NewEncoder() + for _, v := range dts[1:] { + enc.Write(uint64(v) / div) + } + + b := make([]byte, 8+1) + + // 4 high bits used for the encoding type + b[0] = byte(timeCompressedPackedSimple) << 4 + // 4 low bits are the log10 divisor + b[0] |= byte(math.Log10(float64(div))) + + // The first delta value + binary.BigEndian.PutUint64(b[1:9], uint64(dts[0])) + + // The compressed deltas + deltas, err := enc.Bytes() + if err != nil { + return nil, err + } + + return append(b, deltas...), nil +} + +func (e *encoder) encodeRaw() ([]byte, error) { + b := make([]byte, 1+len(e.ts)*8) + b[0] = byte(timeUncompressed) << 4 + for i, v := range e.ts { + binary.BigEndian.PutUint64(b[1+i*8:1+i*8+8], uint64(v)) + } + return b, nil +} + +func (e *encoder) encodeRLE(first, delta, div uint64, n int) ([]byte, error) { + // Large varints can take up to 10 bytes + b := make([]byte, 1+10*3) + + // 4 high bits used for the encoding type + b[0] = byte(timeCompressedRLE) << 4 + // 4 low bits are the log10 divisor + b[0] |= byte(math.Log10(float64(div))) + + i := 1 + // The first timestamp + binary.BigEndian.PutUint64(b[i:], uint64(first)) + i += 8 + // The first delta + i += binary.PutUvarint(b[i:], uint64(delta/div)) + // The number of times the delta is repeated + i += binary.PutUvarint(b[i:], uint64(n)) + + return b[:i], nil +} + +type decoder struct { + v time.Time + ts []uint64 + err error +} + +func NewTimeDecoder(b []byte) TimeDecoder { + d := &decoder{} + d.decode(b) + return d +} + +func (d *decoder) Next() bool { + if len(d.ts) == 0 { + 
return false + } + d.v = time.Unix(0, int64(d.ts[0])) + d.ts = d.ts[1:] + return true +} + +func (d *decoder) Read() time.Time { + return d.v +} + +func (d *decoder) Error() error { + return d.err +} + +func (d *decoder) decode(b []byte) { + if len(b) == 0 { + return + } + + // Encoding type is stored in the 4 high bits of the first byte + encoding := b[0] >> 4 + switch encoding { + case timeUncompressed: + d.decodeRaw(b[1:]) + case timeCompressedRLE: + d.decodeRLE(b) + case timeCompressedPackedSimple: + d.decodePacked(b) + default: + d.err = fmt.Errorf("unknown encoding: %v", encoding) + } +} + +func (d *decoder) decodePacked(b []byte) { + div := uint64(math.Pow10(int(b[0] & 0xF))) + first := uint64(binary.BigEndian.Uint64(b[1:9])) + + enc := simple8b.NewDecoder(b[9:]) + + deltas := []uint64{first} + for enc.Next() { + deltas = append(deltas, enc.Read()) + } + + // Compute the prefix sum and scale the deltas back up + for i := 1; i < len(deltas); i++ { + dgap := deltas[i] * div + deltas[i] = deltas[i-1] + dgap + } + + d.ts = deltas +} + +func (d *decoder) decodeRLE(b []byte) { + var i, n int + + // Lower 4 bits hold the 10 based exponent so we can scale the values back up + mod := int64(math.Pow10(int(b[i] & 0xF))) + i += 1 + + // Next 8 bytes is the starting timestamp + first := binary.BigEndian.Uint64(b[i : i+8]) + i += 8 + + // Next 1-10 bytes is our (scaled down by factor of 10) run length values + value, n := binary.Uvarint(b[i:]) + + // Scale the value back up + value *= uint64(mod) + i += n + + // Last 1-10 bytes is how many times the value repeats + count, n := binary.Uvarint(b[i:]) + + // Rebuild construct the original values now + deltas := make([]uint64, count) + for i := range deltas { + deltas[i] = value + } + + // Reverse the delta-encoding + deltas[0] = first + for i := 1; i < len(deltas); i++ { + deltas[i] = deltas[i-1] + deltas[i] + } + + d.ts = deltas +} + +func (d *decoder) decodeRaw(b []byte) { + d.ts = make([]uint64, len(b)/8) + for i := range 
d.ts { + d.ts[i] = binary.BigEndian.Uint64(b[i*8 : i*8+8]) + + delta := d.ts[i] + // Compute the prefix sum and scale the deltas back up + if i > 0 { + d.ts[i] = d.ts[i-1] + delta + } + } +} diff --git a/tsdb/engine/tsm1/timestamp_test.go b/tsdb/engine/tsm1/timestamp_test.go new file mode 100644 index 00000000000..402a6578a11 --- /dev/null +++ b/tsdb/engine/tsm1/timestamp_test.go @@ -0,0 +1,388 @@ +package tsm1 + +import ( + "testing" + "time" +) + +func Test_TimeEncoder(t *testing.T) { + enc := NewTimeEncoder() + + x := []time.Time{} + now := time.Unix(0, 0) + x = append(x, now) + enc.Write(now) + for i := 1; i < 4; i++ { + x = append(x, now.Add(time.Duration(i)*time.Second)) + enc.Write(x[i]) + } + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got := b[0] >> 4; got != timeCompressedPackedSimple { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + dec := NewTimeDecoder(b) + for i, v := range x { + if !dec.Next() { + t.Fatalf("Next == false, expected true") + } + + if v != dec.Read() { + t.Fatalf("Item %d mismatch, got %v, exp %v", i, dec.Read(), v) + } + } +} + +func Test_TimeEncoder_NoValues(t *testing.T) { + enc := NewTimeEncoder() + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := NewTimeDecoder(b) + if dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } +} + +func Test_TimeEncoder_One(t *testing.T) { + enc := NewTimeEncoder() + tm := time.Unix(0, 0) + + enc.Write(tm) + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got := b[0] >> 4; got != timeCompressedPackedSimple { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + dec := NewTimeDecoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if tm != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), tm) + } +} + +func Test_TimeEncoder_Two(t 
*testing.T) { + enc := NewTimeEncoder() + t1 := time.Unix(0, 0) + t2 := time.Unix(0, 1) + enc.Write(t1) + enc.Write(t2) + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got := b[0] >> 4; got != timeCompressedPackedSimple { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + dec := NewTimeDecoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t1) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t2 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t2) + } +} + +func Test_TimeEncoder_Three(t *testing.T) { + enc := NewTimeEncoder() + t1 := time.Unix(0, 0) + t2 := time.Unix(0, 1) + t3 := time.Unix(0, 2) + + enc.Write(t1) + enc.Write(t2) + enc.Write(t3) + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got := b[0] >> 4; got != timeCompressedPackedSimple { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + dec := NewTimeDecoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t1) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t2 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t2) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t3 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t3) + } +} + +func Test_TimeEncoder_Large_Range(t *testing.T) { + enc := NewTimeEncoder() + t1 := time.Unix(0, 1442369134000000000) + t2 := time.Unix(0, 1442369135000000000) + enc.Write(t1) + enc.Write(t2) + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if 
got := b[0] >> 4; got != timeCompressedPackedSimple { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + dec := NewTimeDecoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t1) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t2 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t2) + } +} + +func Test_TimeEncoder_Uncompressed(t *testing.T) { + enc := NewTimeEncoder() + t1 := time.Unix(0, 0) + t2 := time.Unix(1, 0) + + // about 36.5yrs in NS resolution is max range for compressed format + // This should cause the encoding to fallback to raw points + t3 := time.Unix(2, (2 << 59)) + enc.Write(t1) + enc.Write(t2) + enc.Write(t3) + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("expected error: %v", err) + } + + if exp := 25; len(b) != exp { + t.Fatalf("length mismatch: got %v, exp %v", len(b), exp) + } + + if got := b[0] >> 4; got != timeUncompressed { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + dec := NewTimeDecoder(b) + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t1 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t1) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t2 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t2) + } + + if !dec.Next() { + t.Fatalf("unexpected next value: got true, exp false") + } + + if t3 != dec.Read() { + t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t3) + } +} + +func Test_TimeEncoder_RLE(t *testing.T) { + enc := NewTimeEncoder() + var ts []time.Time + for i := 0; i < 500; i++ { + ts = append(ts, time.Unix(int64(i), 0)) + } + + for _, v := range ts { + enc.Write(v) + } + + b, err := enc.Bytes() + if exp := 12; len(b) != exp { 
+ t.Fatalf("length mismatch: got %v, exp %v", len(b), exp) + } + + if got := b[0] >> 4; got != timeCompressedRLE { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + dec := NewTimeDecoder(b) + for i, v := range ts { + if !dec.Next() { + t.Fatalf("Next == false, expected true") + } + + if v != dec.Read() { + t.Fatalf("Item %d mismatch, got %v, exp %v", i, dec.Read(), v) + } + } + + if dec.Next() { + t.Fatalf("unexpected extra values") + } +} + +func Test_TimeEncoder_Reverse(t *testing.T) { + enc := NewTimeEncoder() + ts := []time.Time{ + time.Unix(0, 3), + time.Unix(0, 2), + time.Unix(0, 1), + } + + for _, v := range ts { + enc.Write(v) + } + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got := b[0] >> 4; got != timeUncompressed { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + dec := NewTimeDecoder(b) + i := 0 + for dec.Next() { + if ts[i] != dec.Read() { + t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), ts[i]) + } + i += 1 + } +} + +func Test_TimeEncoder_220SecondDelta(t *testing.T) { + enc := NewTimeEncoder() + var ts []time.Time + now := time.Now() + for i := 0; i < 220; i++ { + ts = append(ts, now.Add(time.Duration(i*60)*time.Second)) + } + + for _, v := range ts { + enc.Write(v) + } + + b, err := enc.Bytes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Using RLE, should get 12 bytes + if exp := 12; len(b) != exp { + t.Fatalf("unexpected length: got %v, exp %v", len(b), exp) + } + + if got := b[0] >> 4; got != timeCompressedRLE { + t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got) + } + + dec := NewTimeDecoder(b) + i := 0 + for dec.Next() { + if ts[i] != dec.Read() { + t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), ts[i]) + } + i += 1 + } + + if i != len(ts) { + t.Fatalf("Read too few values: exp %d, got %d", len(ts), 
i) + } + + if dec.Next() { + t.Fatalf("expecte Next() = false, got true") + } +} + +func BenchmarkTimeEncoder(b *testing.B) { + enc := NewTimeEncoder() + x := make([]time.Time, 1024) + for i := 0; i < len(x); i++ { + x[i] = time.Now() + enc.Write(x[i]) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + enc.Bytes() + } +} + +func BenchmarkTimeDecoder(b *testing.B) { + x := make([]time.Time, 1024) + enc := NewTimeEncoder() + for i := 0; i < len(x); i++ { + x[i] = time.Now() + enc.Write(x[i]) + } + bytes, _ := enc.Bytes() + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + b.StopTimer() + dec := NewTimeDecoder(bytes) + b.StartTimer() + for dec.Next() { + } + } +} diff --git a/tsdb/engine/tsm1/tsm1.go b/tsdb/engine/tsm1/tsm1.go new file mode 100644 index 00000000000..59a0f3fe051 --- /dev/null +++ b/tsdb/engine/tsm1/tsm1.go @@ -0,0 +1,1974 @@ +package tsm1 + +import ( + "encoding/binary" + "encoding/json" + "fmt" + "hash/fnv" + "io" + "io/ioutil" + "log" + "math" + "os" + "path/filepath" + "reflect" + "sort" + "strings" + "sync" + "syscall" + "time" + + "github.com/golang/snappy" + "github.com/influxdb/influxdb/models" + "github.com/influxdb/influxdb/tsdb" +) + +const ( + // Format is the file format name of this engine. + Format = "tsm1" + + //IDsFileExtension is the extension for the file that keeps the compressed map + // of keys to uint64 IDs. + IDsFileExtension = "ids" + + // FieldsFileExtension is the extension for the file that stores compressed field + // encoding data for this db + FieldsFileExtension = "fields" + + // SeriesFileExtension is the extension for the file that stores the compressed + // series metadata for series in this db + SeriesFileExtension = "series" + + // CollisionsFileExtension is the extension for the file that keeps a map of which + // keys have hash collisions and what their actual IDs are + CollisionsFileExtension = "collisions" + + //CheckpointExtension is the extension given to files that checkpoint. 
+ // The checkpoint files are created when a new file is first created. They + // are removed after the file has been synced and is safe for use. If a file + // has an associated checkpoint file, it wasn't safely written and both should be removed + CheckpointExtension = "check" + + // keyFieldSeparator separates the series key from the field name in the composite key + // that identifies a specific field in series + keyFieldSeparator = "#!~#" +) + +type TimePrecision uint8 + +const ( + Seconds TimePrecision = iota + Milliseconds + Microseconds + Nanoseconds +) + +func init() { + tsdb.RegisterEngine(Format, NewEngine) +} + +const ( + MaxDataFileSize = 1024 * 1024 * 1024 // 1GB + + // DefaultRotateBlockSize is the default size to rotate to a new compressed block + DefaultRotateBlockSize = 512 * 1024 // 512KB + + DefaultRotateFileSize = 5 * 1024 * 1024 // 5MB + + DefaultMaxPointsPerBlock = 1000 + + // MAP_POPULATE is for the mmap syscall. For some reason this isn't defined in golang's syscall + MAP_POPULATE = 0x8000 + + // magicNumber is written as the first 4 bytes of a data file to + // identify the file as a tsm1 formatted file + magicNumber uint32 = 0x16D116D1 +) + +// Ensure Engine implements the interface. +var _ tsdb.Engine = &Engine{} + +// Engine represents a storage engine with compressed blocks. +type Engine struct { + writeLock *WriteLock + metaLock sync.Mutex + path string + logger *log.Logger + + // deletesPending mark how many old data files are waiting to be deleted. This will + // keep a close from returning until all deletes finish + deletesPending sync.WaitGroup + + // HashSeriesField is a function that takes a series key and a field name + // and returns a hash identifier. It's not guaranteed to be unique. 
+ HashSeriesField func(key string) uint64 + + WAL *Log + + RotateFileSize uint32 + SkipCompaction bool + CompactionAge time.Duration + MinCompactionFileCount int + IndexCompactionFullAge time.Duration + IndexMinCompactionInterval time.Duration + MaxPointsPerBlock int + RotateBlockSize int + + // filesLock is only for modifying and accessing the files slice + filesLock sync.RWMutex + files dataFiles + currentFileID int + compactionRunning bool + lastCompactionTime time.Time + + // deletes is a map of keys that are deleted, but haven't yet been + // compacted and flushed. They map the ID to the corresponding key + deletes map[uint64]string + + // deleteMeasurements is a map of the measurements that are deleted + // but haven't yet been compacted and flushed + deleteMeasurements map[string]bool + + collisionsLock sync.RWMutex + collisions map[string]uint64 + + // queryLock keeps data files from being deleted or the store from + // being closed while queries are running + queryLock sync.RWMutex +} + +// NewEngine returns a new instance of Engine. 
+func NewEngine(path string, walPath string, opt tsdb.EngineOptions) tsdb.Engine { + w := NewLog(path) + w.FlushColdInterval = time.Duration(opt.Config.WALFlushColdInterval) + w.FlushMemorySizeThreshold = opt.Config.WALFlushMemorySizeThreshold + w.MaxMemorySizeThreshold = opt.Config.WALMaxMemorySizeThreshold + w.LoggingEnabled = opt.Config.WALLoggingEnabled + + e := &Engine{ + path: path, + writeLock: &WriteLock{}, + logger: log.New(os.Stderr, "[tsm1] ", log.LstdFlags), + + // TODO: this is the function where we can inject a check against the in memory collisions + HashSeriesField: hashSeriesField, + WAL: w, + RotateFileSize: DefaultRotateFileSize, + CompactionAge: opt.Config.IndexCompactionAge, + MinCompactionFileCount: opt.Config.IndexMinCompactionFileCount, + IndexCompactionFullAge: opt.Config.IndexCompactionFullAge, + IndexMinCompactionInterval: opt.Config.IndexMinCompactionInterval, + MaxPointsPerBlock: DefaultMaxPointsPerBlock, + RotateBlockSize: DefaultRotateBlockSize, + } + e.WAL.Index = e + + return e +} + +// Path returns the path the engine was opened with. +func (e *Engine) Path() string { return e.path } + +// PerformMaintenance is for periodic maintenance of the store. 
A no-op for b1 +func (e *Engine) PerformMaintenance() { + if f := e.WAL.shouldFlush(); f != noFlush { + go func() { + e.WAL.flush(f) + }() + return + } + + // don't do a full compaction if the WAL received writes in the time window + if time.Since(e.WAL.LastWriteTime()) < e.IndexCompactionFullAge { + return + } + + e.filesLock.RLock() + running := e.compactionRunning + deletesPending := len(e.deletes) > 0 + e.filesLock.RUnlock() + if running || deletesPending { + return + } + + // do a full compaction if all the index files are older than the compaction time + for _, f := range e.copyFilesCollection() { + if time.Since(f.modTime) < e.IndexCompactionFullAge { + return + } + } + + go e.Compact(true) +} + +// Format returns the format type of this engine +func (e *Engine) Format() tsdb.EngineFormat { + return tsdb.TSM1Format +} + +// Open opens and initializes the engine. +func (e *Engine) Open() error { + if err := os.MkdirAll(e.path, 0777); err != nil { + return err + } + + // perform any cleanup on metafiles that were halfway written + e.cleanupMetafile(SeriesFileExtension) + e.cleanupMetafile(FieldsFileExtension) + e.cleanupMetafile(IDsFileExtension) + e.cleanupMetafile(CollisionsFileExtension) + + files, err := filepath.Glob(filepath.Join(e.path, fmt.Sprintf("*.%s", Format))) + if err != nil { + return err + } + for _, fn := range files { + // if the file has a checkpoint it's not valid, so remove it + if removed := e.removeFileIfCheckpointExists(fn); removed { + continue + } + + id, err := idFromFileName(fn) + if err != nil { + return err + } + if id >= e.currentFileID { + e.currentFileID = id + 1 + } + f, err := os.OpenFile(fn, os.O_RDONLY, 0666) + if err != nil { + return fmt.Errorf("error opening file %s: %s", fn, err.Error()) + } + df, err := NewDataFile(f) + if err != nil { + return fmt.Errorf("error opening memory map for file %s: %s", fn, err.Error()) + } + e.files = append(e.files, df) + } + sort.Sort(e.files) + + if err := e.readCollisions(); err != nil 
{ + return err + } + + e.deletes = make(map[uint64]string) + e.deleteMeasurements = make(map[string]bool) + + // mark the last compaction as now so it doesn't try to compact while + // flushing the WAL on load + e.lastCompactionTime = time.Now() + + if err := e.WAL.Open(); err != nil { + return err + } + + e.lastCompactionTime = time.Now() + + return nil +} + +// Close closes the engine. +func (e *Engine) Close() error { + // get all the locks so queries, writes, and compactions stop before closing + e.queryLock.Lock() + defer e.queryLock.Unlock() + e.metaLock.Lock() + defer e.metaLock.Unlock() + min, max := int64(math.MinInt64), int64(math.MaxInt64) + e.writeLock.LockRange(min, max) + defer e.writeLock.UnlockRange(min, max) + e.filesLock.Lock() + defer e.filesLock.Unlock() + + // ensure all deletes have been processed + e.deletesPending.Wait() + + for _, df := range e.files { + _ = df.Close() + } + e.files = nil + e.currentFileID = 0 + e.collisions = nil + e.deletes = nil + e.deleteMeasurements = nil + return nil +} + +// DataFileCount returns the number of data files in the database +func (e *Engine) DataFileCount() int { + e.filesLock.RLock() + defer e.filesLock.RUnlock() + return len(e.files) +} + +// SetLogOutput is a no-op. +func (e *Engine) SetLogOutput(w io.Writer) {} + +// LoadMetadataIndex loads the shard metadata into memory. 
+func (e *Engine) LoadMetadataIndex(shard *tsdb.Shard, index *tsdb.DatabaseIndex, measurementFields map[string]*tsdb.MeasurementFields) error { + // Load measurement metadata + fields, err := e.readFields() + if err != nil { + return err + } + for k, mf := range fields { + m := index.CreateMeasurementIndexIfNotExists(string(k)) + for name, _ := range mf.Fields { + m.SetFieldName(name) + } + mf.Codec = tsdb.NewFieldCodec(mf.Fields) + measurementFields[m.Name] = mf + } + + // Load series metadata + series, err := e.readSeries() + if err != nil { + return err + } + + // Load the series into the in-memory index in sorted order to ensure + // it's always consistent for testing purposes + a := make([]string, 0, len(series)) + for k, _ := range series { + a = append(a, k) + } + sort.Strings(a) + for _, key := range a { + s := series[key] + s.InitializeShards() + index.CreateSeriesIndexIfNotExists(tsdb.MeasurementFromSeriesKey(string(key)), s) + } + + return nil +} + +// WritePoints writes metadata and point data into the engine. +// Returns an error if new points are added to an existing key. 
+func (e *Engine) WritePoints(points []models.Point, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error { + return e.WAL.WritePoints(points, measurementFieldsToSave, seriesToCreate) +} + +func (e *Engine) Write(pointsByKey map[string]Values, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error { + // Flush any deletes before writing new data from the WAL + e.filesLock.RLock() + hasDeletes := len(e.deletes) > 0 + e.filesLock.RUnlock() + if hasDeletes { + e.flushDeletes() + } + + err, startTime, endTime, valuesByID := e.convertKeysAndWriteMetadata(pointsByKey, measurementFieldsToSave, seriesToCreate) + if err != nil { + return err + } + if len(valuesByID) == 0 { + return nil + } + + files, lockStart, lockEnd := e.filesAndLock(startTime, endTime) + defer e.writeLock.UnlockRange(lockStart, lockEnd) + + if len(files) == 0 { + return e.rewriteFile(nil, valuesByID) + } + + maxTime := int64(math.MaxInt64) + + // do the file rewrites in parallel + var mu sync.Mutex + var writes sync.WaitGroup + var errors []error + + // reverse through the data files and write in the data + for i := len(files) - 1; i >= 0; i-- { + f := files[i] + // max times are exclusive, so add 1 to it + fileMax := f.MaxTime() + 1 + fileMin := f.MinTime() + // if the file is < rotate, write all data between fileMin and maxTime + if f.size < e.RotateFileSize { + writes.Add(1) + go func(df *dataFile, vals map[uint64]Values) { + if err := e.rewriteFile(df, vals); err != nil { + mu.Lock() + errors = append(errors, err) + mu.Unlock() + } + writes.Done() + }(f, e.filterDataBetweenTimes(valuesByID, fileMin, maxTime)) + continue + } + // if the file is > rotate: + // write all data between fileMax and maxTime into new file + // write all data between fileMin and fileMax into old file + writes.Add(1) + go func(vals map[uint64]Values) { + if err := e.rewriteFile(nil, vals); err != nil { + mu.Lock() + errors = 
append(errors, err) + mu.Unlock() + } + writes.Done() + }(e.filterDataBetweenTimes(valuesByID, fileMax, maxTime)) + writes.Add(1) + go func(df *dataFile, vals map[uint64]Values) { + if err := e.rewriteFile(df, vals); err != nil { + mu.Lock() + errors = append(errors, err) + mu.Unlock() + } + writes.Done() + }(f, e.filterDataBetweenTimes(valuesByID, fileMin, fileMax)) + maxTime = fileMin + } + // for any data leftover, write into a new file since it's all older + // than any file we currently have + writes.Add(1) + go func() { + if err := e.rewriteFile(nil, valuesByID); err != nil { + mu.Lock() + errors = append(errors, err) + mu.Unlock() + } + writes.Done() + }() + + writes.Wait() + + if len(errors) > 0 { + // TODO: log errors + return errors[0] + } + + if !e.SkipCompaction && e.shouldCompact() { + go e.Compact(false) + } + + return nil +} + +// MarkDeletes will mark the given keys for deletion in memory. They will be deleted from data +// files on the next flush. This mainly for the WAL to use on startup +func (e *Engine) MarkDeletes(keys []string) { + e.filesLock.Lock() + defer e.filesLock.Unlock() + for _, k := range keys { + e.deletes[e.keyToID(k)] = k + } +} + +func (e *Engine) MarkMeasurementDelete(name string) { + e.filesLock.Lock() + defer e.filesLock.Unlock() + e.deleteMeasurements[name] = true +} + +// filesAndLock returns the data files that match the given range and +// ensures that the write lock will hold for the entire range +func (e *Engine) filesAndLock(min, max int64) (a dataFiles, lockStart, lockEnd int64) { + for { + a = make([]*dataFile, 0) + files := e.copyFilesCollection() + + for _, f := range e.files { + fmin, fmax := f.MinTime(), f.MaxTime() + if min < fmax && fmin >= fmin { + a = append(a, f) + } else if max >= fmin && max < fmax { + a = append(a, f) + } + } + + if len(a) > 0 { + lockStart = a[0].MinTime() + lockEnd = a[len(a)-1].MaxTime() + if max > lockEnd { + lockEnd = max + } + } else { + lockStart = min + lockEnd = max + } + + 
e.writeLock.LockRange(lockStart, lockEnd) + + // it's possible for compaction to change the files collection while we + // were waiting for a write lock on the range. Make sure the files are still the + // same after we got the lock, otherwise try again. This shouldn't happen often. + filesAfterLock := e.copyFilesCollection() + if reflect.DeepEqual(files, filesAfterLock) { + return + } + + e.writeLock.UnlockRange(lockStart, lockEnd) + } +} + +func (e *Engine) Compact(fullCompaction bool) error { + // we're looping here to ensure that the files we've marked to compact are + // still there after we've obtained the write lock + var minTime, maxTime int64 + var files dataFiles + for { + if fullCompaction { + files = e.copyFilesCollection() + } else { + files = e.filesToCompact() + } + if len(files) < 2 { + return nil + } + minTime = files[0].MinTime() + maxTime = files[len(files)-1].MaxTime() + + e.writeLock.LockRange(minTime, maxTime) + + // if the files are different after obtaining the write lock, one or more + // was rewritten. Release the lock and try again. This shouldn't happen really. 
+ var filesAfterLock dataFiles + if fullCompaction { + filesAfterLock = e.copyFilesCollection() + } else { + filesAfterLock = e.filesToCompact() + } + if !reflect.DeepEqual(files, filesAfterLock) { + e.writeLock.UnlockRange(minTime, maxTime) + continue + } + + // we've got the write lock and the files are all there + break + } + + // mark the compaction as running + e.filesLock.Lock() + if e.compactionRunning { + e.filesLock.Unlock() + return nil + } + e.compactionRunning = true + e.filesLock.Unlock() + defer func() { + //release the lock + e.writeLock.UnlockRange(minTime, maxTime) + e.filesLock.Lock() + e.lastCompactionTime = time.Now() + e.compactionRunning = false + e.filesLock.Unlock() + }() + + var s string + if fullCompaction { + s = "FULL " + } + fileName := e.nextFileName() + e.logger.Printf("Starting %scompaction in partition %s of %d files to new file %s", s, e.path, len(files), fileName) + st := time.Now() + + positions := make([]uint32, len(files)) + ids := make([]uint64, len(files)) + + // initilaize for writing + f, err := e.openFileAndCheckpoint(fileName) + + for i, df := range files { + ids[i] = btou64(df.mmap[4:12]) + positions[i] = 4 + } + currentPosition := uint32(fileHeaderSize) + newPositions := make([]uint32, 0) + newIDs := make([]uint64, 0) + buf := make([]byte, e.RotateBlockSize) + for { + // find the min ID so we can write it to the file + minID := uint64(math.MaxUint64) + for _, id := range ids { + if minID > id && id != 0 { + minID = id + } + } + if minID == math.MaxUint64 { // we've emptied all the files + break + } + + newIDs = append(newIDs, minID) + newPositions = append(newPositions, currentPosition) + + // write the blocks in order from the files with this id. 
as we + // go merge blocks together from one file to another, if the right size + var previousValues Values + for i, id := range ids { + if id != minID { + continue + } + df := files[i] + pos := positions[i] + fid, _, block := df.block(pos) + if fid != id { + panic("not possible") + } + newPos := pos + uint32(blockHeaderSize+len(block)) + positions[i] = newPos + + // write the blocks out to file that are already at their size limit + for { + // write the values, the block or combine with previous + if len(previousValues) > 0 { + previousValues = append(previousValues, previousValues.DecodeSameTypeBlock(block)...) + } else if len(block) > e.RotateBlockSize { + if _, err := f.Write(df.mmap[pos:newPos]); err != nil { + return err + } + currentPosition += uint32(newPos - pos) + } else { + // TODO: handle decode error + previousValues, _ = DecodeBlock(block) + } + + // write the previous values and clear if we've hit the limit + if len(previousValues) > e.MaxPointsPerBlock { + b, err := previousValues.Encode(buf) + if err != nil { + panic(fmt.Sprintf("failure encoding block: %v", err)) + } + + if err := e.writeBlock(f, id, b); err != nil { + // fail hard. 
If we can't write a file someone needs to get woken up + panic(fmt.Sprintf("failure writing block: %s", err.Error())) + } + currentPosition += uint32(blockHeaderSize + len(b)) + previousValues = nil + } + + // if the next block is the same ID, we don't need to decode this one + // so we can just write it out to the file + nextID, _, nextBlock := df.block(newPos) + + // move to the next block in this file only if the id is the same + if nextID != id { + // flush remaining values + if len(previousValues) > 0 { + b, err := previousValues.Encode(buf) + if err != nil { + panic(fmt.Sprintf("failure encoding block: %v", err)) + } + currentPosition += uint32(blockHeaderSize + len(b)) + previousValues = nil + if err := e.writeBlock(f, id, b); err != nil { + panic(fmt.Sprintf("error writing file %s: %s", f.Name(), err.Error())) + } + } + ids[i] = nextID + break + } + pos = newPos + newPos = pos + uint32(blockHeaderSize+len(nextBlock)) + positions[i] = newPos + block = nextBlock + } + } + + if len(previousValues) > 0 { + b, err := previousValues.Encode(buf) + if err != nil { + panic(fmt.Sprintf("failure encoding block: %v", err)) + } + + if err := e.writeBlock(f, minID, b); err != nil { + // fail hard. 
If we can't write a file someone needs to get woken up + panic(fmt.Sprintf("failure writing block: %s", err.Error())) + } + currentPosition += uint32(blockHeaderSize + len(b)) + } + } + + newDF, err := e.writeIndexAndGetDataFile(f, minTime, maxTime, newIDs, newPositions) + if err != nil { + return err + } + + // update engine with new file pointers + e.filesLock.Lock() + var newFiles dataFiles + for _, df := range e.files { + // exclude any files that were compacted + include := true + for _, f := range files { + if f == df { + include = false + break + } + } + if include { + newFiles = append(newFiles, df) + } + } + newFiles = append(newFiles, newDF) + sort.Sort(newFiles) + e.files = newFiles + e.filesLock.Unlock() + + e.logger.Printf("Compaction of %s took %s", e.path, time.Since(st)) + + // delete the old files in a goroutine so running queries won't block the write + // from completing + e.deletesPending.Add(1) + go func() { + for _, f := range files { + if err := f.Delete(); err != nil { + e.logger.Println("ERROR DELETING:", f.f.Name()) + } + } + e.deletesPending.Done() + }() + + return nil +} + +func (e *Engine) writeBlock(f *os.File, id uint64, block []byte) error { + if _, err := f.Write(append(u64tob(id), u32tob(uint32(len(block)))...)); err != nil { + return err + } + _, err := f.Write(block) + return err +} + +func (e *Engine) writeIndexAndGetDataFile(f *os.File, minTime, maxTime int64, ids []uint64, newPositions []uint32) (*dataFile, error) { + // write the file index, starting with the series ids and their positions + for i, id := range ids { + if _, err := f.Write(u64tob(id)); err != nil { + return nil, err + } + if _, err := f.Write(u32tob(newPositions[i])); err != nil { + return nil, err + } + } + + // write the min time, max time + if _, err := f.Write(append(u64tob(uint64(minTime)), u64tob(uint64(maxTime))...)); err != nil { + return nil, err + } + + // series count + if _, err := f.Write(u32tob(uint32(len(ids)))); err != nil { + return nil, err + 
} + + // sync it and see4k back to the beginning to hand off to the mmap + if err := f.Sync(); err != nil { + return nil, err + } + if _, err := f.Seek(0, 0); err != nil { + return nil, err + } + + if err := e.removeCheckpoint(f.Name()); err != nil { + return nil, err + } + + // now open it as a memory mapped data file + newDF, err := NewDataFile(f) + if err != nil { + return nil, err + } + + return newDF, nil +} + +func (e *Engine) shouldCompact() bool { + e.filesLock.RLock() + running := e.compactionRunning + since := time.Since(e.lastCompactionTime) + deletesPending := len(e.deletes) > 0 + e.filesLock.RUnlock() + if running || since < e.IndexMinCompactionInterval || deletesPending { + return false + } + return len(e.filesToCompact()) >= e.MinCompactionFileCount +} + +func (e *Engine) filesToCompact() dataFiles { + e.filesLock.RLock() + defer e.filesLock.RUnlock() + + var a dataFiles + for _, df := range e.files { + if time.Since(df.modTime) > e.CompactionAge && df.size < MaxDataFileSize { + a = append(a, df) + } else if len(a) > 0 { + // only compact contiguous ranges. 
If we hit the negative case and + // there are files to compact, stop here + break + } + } + return a +} + +func (e *Engine) convertKeysAndWriteMetadata(pointsByKey map[string]Values, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) (err error, minTime, maxTime int64, valuesByID map[uint64]Values) { + e.metaLock.Lock() + defer e.metaLock.Unlock() + + if err := e.writeNewFields(measurementFieldsToSave); err != nil { + return err, 0, 0, nil + } + if err := e.writeNewSeries(seriesToCreate); err != nil { + return err, 0, 0, nil + } + + if len(pointsByKey) == 0 { + return nil, 0, 0, nil + } + + // read in keys and assign any that aren't defined + b, err := e.readCompressedFile(IDsFileExtension) + if err != nil { + return err, 0, 0, nil + } + ids := make(map[string]uint64) + if b != nil { + if err := json.Unmarshal(b, &ids); err != nil { + return err, 0, 0, nil + } + } + + // these are values that are newer than anything stored in the shard + valuesByID = make(map[uint64]Values) + + idToKey := make(map[uint64]string) // we only use this map if new ids are being created + collisions := make(map[string]uint64) // we only use this if a collision is encountered + newKeys := false + // track the min and max time of values being inserted so we can lock that time range + minTime = int64(math.MaxInt64) + maxTime = int64(math.MinInt64) + for k, values := range pointsByKey { + var id uint64 + var ok bool + if id, ok = ids[k]; !ok { + // populate the map if we haven't already + + if len(idToKey) == 0 { + for n, id := range ids { + idToKey[id] = n + } + } + + // now see if the hash id collides with a different key + hashID := e.HashSeriesField(k) + existingKey, idInMap := idToKey[hashID] + // we only care if the keys are different. 
if so, it's a hash collision we have to keep track of + if idInMap && k != existingKey { + // we have a collision, find this new key the next available id + hashID = 0 + for { + hashID++ + if _, ok := idToKey[hashID]; !ok { + // next ID is available, use it + break + } + } + collisions[k] = hashID + } + + newKeys = true + ids[k] = hashID + idToKey[hashID] = k + id = hashID + } + + if minTime > values.MinTime() { + minTime = values.MinTime() + } + if maxTime < values.MaxTime() { + maxTime = values.MaxTime() + } + + valuesByID[id] = values + } + + if newKeys { + b, err := json.Marshal(ids) + if err != nil { + return err, 0, 0, nil + } + if err := e.replaceCompressedFile(IDsFileExtension, b); err != nil { + return err, 0, 0, nil + } + } + + if len(collisions) > 0 { + e.saveNewCollisions(collisions) + } + + return +} + +func (e *Engine) saveNewCollisions(collisions map[string]uint64) error { + e.collisionsLock.Lock() + defer e.collisionsLock.Unlock() + + for k, v := range collisions { + e.collisions[k] = v + } + + data, err := json.Marshal(e.collisions) + + if err != nil { + return err + } + + return e.replaceCompressedFile(CollisionsFileExtension, data) +} + +func (e *Engine) readCollisions() error { + e.collisions = make(map[string]uint64) + data, err := e.readCompressedFile(CollisionsFileExtension) + if err != nil { + return err + } + + if len(data) == 0 { + return nil + } + + return json.Unmarshal(data, &e.collisions) +} + +// filterDataBetweenTimes will create a new map with data between +// the minTime (inclusive) and maxTime (exclusive) while removing that +// data from the passed in map. 
It is assume that the Values arrays +// are sorted in time ascending order +func (e *Engine) filterDataBetweenTimes(valuesByID map[uint64]Values, minTime, maxTime int64) map[uint64]Values { + filteredValues := make(map[uint64]Values) + for id, values := range valuesByID { + maxIndex := len(values) + minIndex := -1 + // find the index of the first value in the range + for i, v := range values { + t := v.UnixNano() + if t >= minTime && t < maxTime { + minIndex = i + break + } + } + if minIndex == -1 { + continue + } + // go backwards to find the index of the last value in the range + for i := len(values) - 1; i >= 0; i-- { + t := values[i].UnixNano() + if t < maxTime { + maxIndex = i + 1 + break + } + } + + // write into the result map and filter the passed in map + filteredValues[id] = values[minIndex:maxIndex] + + // if we grabbed all the values, remove them from the passed in map + if minIndex == len(values) || (minIndex == 0 && maxIndex == len(values)) { + delete(valuesByID, id) + continue + } + + valuesByID[id] = values[0:minIndex] + if maxIndex < len(values) { + valuesByID[id] = append(valuesByID[id], values[maxIndex:]...) 
+ } + } + return filteredValues +} + +// rewriteFile will read in the old data file, if provided and merge the values +// in the passed map into a new data file +func (e *Engine) rewriteFile(oldDF *dataFile, valuesByID map[uint64]Values) error { + if len(valuesByID) == 0 { + return nil + } + + // we need the values in sorted order so that we can merge them into the + // new file as we read the old file + ids := make([]uint64, 0, len(valuesByID)) + for id, _ := range valuesByID { + ids = append(ids, id) + } + + minTime := int64(math.MaxInt64) + maxTime := int64(math.MinInt64) + + // read header of ids to starting positions and times + oldIDToPosition := make(map[uint64]uint32) + if oldDF != nil { + oldIDToPosition = oldDF.IDToPosition() + minTime = oldDF.MinTime() + maxTime = oldDF.MaxTime() + } + + for _, v := range valuesByID { + if minTime > v.MinTime() { + minTime = v.MinTime() + } + if maxTime < v.MaxTime() { + // add 1 ns to the time since maxTime is exclusive + maxTime = v.MaxTime() + 1 + } + } + + // add any ids that are in the file that aren't getting flushed here + for id, _ := range oldIDToPosition { + if _, ok := valuesByID[id]; !ok { + ids = append(ids, id) + } + } + + // always write in order by ID + sort.Sort(uint64slice(ids)) + + f, err := e.openFileAndCheckpoint(e.nextFileName()) + if err != nil { + return err + } + + if oldDF == nil { + e.logger.Printf("writing new index file %s", f.Name()) + } else { + e.logger.Printf("rewriting index file %s with %s", oldDF.f.Name(), f.Name()) + } + + // now combine the old file data with the new values, keeping track of + // their positions + currentPosition := uint32(fileHeaderSize) + newPositions := make([]uint32, len(ids)) + buf := make([]byte, e.MaxPointsPerBlock*20) + for i, id := range ids { + // mark the position for this ID + newPositions[i] = currentPosition + + newVals := valuesByID[id] + + // if this id is only in the file and not in the new values, just copy over from old file + if len(newVals) == 0 
{ + fpos := oldIDToPosition[id] + + // write the blocks until we hit whatever the next id is + for { + fid := btou64(oldDF.mmap[fpos : fpos+8]) + if fid != id { + break + } + length := btou32(oldDF.mmap[fpos+8 : fpos+12]) + if _, err := f.Write(oldDF.mmap[fpos : fpos+12+length]); err != nil { + f.Close() + return err + } + fpos += (12 + length) + currentPosition += (12 + length) + + // make sure we're not at the end of the file + if fpos >= oldDF.size { + break + } + } + + continue + } + + // if the values are not in the file, just write the new ones + fpos, ok := oldIDToPosition[id] + if !ok { + // TODO: ensure we encode only the amount in a block + block, err := newVals.Encode(buf) + if err != nil { + f.Close() + return err + } + + if err := e.writeBlock(f, id, block); err != nil { + f.Close() + return err + } + currentPosition += uint32(blockHeaderSize + len(block)) + + continue + } + + // it's in the file and the new values, combine them and write out + for { + fid, _, block := oldDF.block(fpos) + if fid != id { + break + } + fpos += uint32(blockHeaderSize + len(block)) + + // determine if there's a block after this with the same id and get its time + nextID, nextTime, _ := oldDF.block(fpos) + hasFutureBlock := nextID == id + + nv, newBlock, err := e.DecodeAndCombine(newVals, block, buf[:0], nextTime, hasFutureBlock) + newVals = nv + if err != nil { + return err + } + if _, err := f.Write(append(u64tob(id), u32tob(uint32(len(newBlock)))...)); err != nil { + f.Close() + return err + } + if _, err := f.Write(newBlock); err != nil { + f.Close() + return err + } + + currentPosition += uint32(blockHeaderSize + len(newBlock)) + + if fpos >= oldDF.indexPosition() { + break + } + } + + // TODO: ensure we encode only the amount in a block, refactor this wil line 450 into func + if len(newVals) > 0 { + // TODO: ensure we encode only the amount in a block + block, err := newVals.Encode(buf) + if err != nil { + f.Close() + return err + } + + if _, err := 
f.Write(append(u64tob(id), u32tob(uint32(len(block)))...)); err != nil { + f.Close() + return err + } + if _, err := f.Write(block); err != nil { + f.Close() + return err + } + currentPosition += uint32(blockHeaderSize + len(block)) + } + } + + newDF, err := e.writeIndexAndGetDataFile(f, minTime, maxTime, ids, newPositions) + if err != nil { + f.Close() + return err + } + + // update the engine to point at the new dataFiles + e.filesLock.Lock() + var files dataFiles + for _, df := range e.files { + if df != oldDF { + files = append(files, df) + } + } + files = append(files, newDF) + sort.Sort(files) + e.files = files + e.filesLock.Unlock() + + // remove the old data file. no need to block returning the write, + // but we need to let any running queries finish before deleting it + if oldDF != nil { + e.deletesPending.Add(1) + go func() { + if err := oldDF.Delete(); err != nil { + e.logger.Println("ERROR DELETING FROM REWRITE:", oldDF.f.Name()) + } + e.deletesPending.Done() + }() + } + + return nil +} + +// flushDeletes will lock the entire shard and rewrite all index files so they no +// longer contain the flushed IDs +func (e *Engine) flushDeletes() error { + e.writeLock.LockRange(math.MinInt64, math.MaxInt64) + defer e.writeLock.UnlockRange(math.MinInt64, math.MaxInt64) + e.metaLock.Lock() + defer e.metaLock.Unlock() + + measurements := make(map[string]bool) + deletes := make(map[uint64]string) + e.filesLock.RLock() + for name, _ := range e.deleteMeasurements { + measurements[name] = true + } + for id, key := range e.deletes { + deletes[id] = key + } + e.filesLock.RUnlock() + + // if we're deleting measurements, rewrite the field data + if len(measurements) > 0 { + fields, err := e.readFields() + if err != nil { + return err + } + for name, _ := range measurements { + delete(fields, name) + } + if err := e.writeFields(fields); err != nil { + return err + } + } + + series, err := e.readSeries() + if err != nil { + return err + } + for _, key := range deletes { + 
seriesName, _ := seriesAndFieldFromCompositeKey(key) + delete(series, seriesName) + } + if err := e.writeSeries(series); err != nil { + return err + } + + // now remove the raw time series data from the data files + files := e.copyFilesCollection() + newFiles := make(dataFiles, 0, len(files)) + for _, f := range files { + newFiles = append(newFiles, e.writeNewFileExcludeDeletes(f)) + } + + // update the delete map and files + e.filesLock.Lock() + defer e.filesLock.Unlock() + + e.files = newFiles + + // remove the things we've deleted from the map + for name, _ := range measurements { + delete(e.deleteMeasurements, name) + } + for id, _ := range deletes { + delete(e.deletes, id) + } + + e.deletesPending.Add(1) + go func() { + for _, oldDF := range files { + if err := oldDF.Delete(); err != nil { + e.logger.Println("ERROR DELETING FROM REWRITE:", oldDF.f.Name()) + } + } + e.deletesPending.Done() + }() + return nil +} + +func (e *Engine) writeNewFileExcludeDeletes(oldDF *dataFile) *dataFile { + f, err := e.openFileAndCheckpoint(e.nextFileName()) + if err != nil { + panic(fmt.Sprintf("error opening new data file: %s", err.Error())) + } + + ids := make([]uint64, 0) + positions := make([]uint32, 0) + + indexPosition := oldDF.indexPosition() + currentPosition := uint32(fileHeaderSize) + currentID := uint64(0) + for currentPosition < indexPosition { + id := btou64(oldDF.mmap[currentPosition : currentPosition+8]) + length := btou32(oldDF.mmap[currentPosition+8 : currentPosition+blockHeaderSize]) + newPosition := currentPosition + blockHeaderSize + length + + if _, ok := e.deletes[id]; ok { + currentPosition = newPosition + continue + } + + if _, err := f.Write(oldDF.mmap[currentPosition:newPosition]); err != nil { + panic(fmt.Sprintf("error writing new index file: %s", err.Error())) + } + if id != currentID { + currentID = id + ids = append(ids, id) + positions = append(positions, currentPosition) + } + currentPosition = newPosition + } + + df, err := 
e.writeIndexAndGetDataFile(f, oldDF.MinTime(), oldDF.MaxTime(), ids, positions)
+	if err != nil {
+		panic(fmt.Sprintf("error writing new index file: %s", err.Error()))
+	}
+
+	return df
+}
+
+func (e *Engine) nextFileName() string {
+	e.filesLock.Lock()
+	defer e.filesLock.Unlock()
+	e.currentFileID++
+	return filepath.Join(e.path, fmt.Sprintf("%07d.%s", e.currentFileID, Format))
+}
+
+func (e *Engine) readCompressedFile(name string) ([]byte, error) {
+	f, err := os.OpenFile(filepath.Join(e.path, name), os.O_RDONLY, 0666)
+	if os.IsNotExist(err) {
+		return nil, nil
+	} else if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	b, err := ioutil.ReadAll(f)
+	if err != nil {
+		return nil, err
+	}
+
+	data, err := snappy.Decode(nil, b)
+	if err != nil {
+		return nil, err
+	}
+	return data, nil
+}
+
+func (e *Engine) replaceCompressedFile(name string, data []byte) error {
+	tmpName := filepath.Join(e.path, name+"tmp")
+	f, err := os.OpenFile(tmpName, os.O_CREATE|os.O_RDWR, 0666)
+	if err != nil {
+		return err
+	}
+	b := snappy.Encode(nil, data)
+	if _, err := f.Write(b); err != nil {
+		return err
+	}
+	if err := f.Close(); err != nil {
+		return err
+	}
+	// remove the file from the shard path, not a bare relative name
+	if err := os.Remove(filepath.Join(e.path, name)); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return os.Rename(tmpName, filepath.Join(e.path, name))
+}
+
+// keysWithFields takes the map of measurements to their fields and a set of series keys
+// and returns the columnar keys for the keys and fields
+func (e *Engine) keysWithFields(fields map[string]*tsdb.MeasurementFields, keys []string) []string {
+	e.WAL.cacheLock.RLock()
+	defer e.WAL.cacheLock.RUnlock()
+
+	a := make([]string, 0)
+	for _, k := range keys {
+		measurement := tsdb.MeasurementFromSeriesKey(k)
+
+		// add the fields from the index
+		mf := fields[measurement]
+		if mf != nil {
+			for _, f := range mf.Fields {
+				a = append(a, SeriesFieldKey(k, f.Name))
+			}
+		}
+
+		// now add any fields from the WAL that haven't been flushed yet
+		mf = e.WAL.measurementFieldsCache[measurement]
+		if
mf != nil { + for _, f := range mf.Fields { + a = append(a, SeriesFieldKey(k, f.Name)) + } + } + } + + return a +} + +// DeleteSeries deletes the series from the engine. +func (e *Engine) DeleteSeries(seriesKeys []string) error { + e.metaLock.Lock() + defer e.metaLock.Unlock() + + fields, err := e.readFields() + if err != nil { + return err + } + + keyFields := e.keysWithFields(fields, seriesKeys) + e.filesLock.Lock() + defer e.filesLock.Unlock() + for _, key := range keyFields { + e.deletes[e.keyToID(key)] = key + } + + return e.WAL.DeleteSeries(keyFields) +} + +// DeleteMeasurement deletes a measurement and all related series. +func (e *Engine) DeleteMeasurement(name string, seriesKeys []string) error { + e.metaLock.Lock() + defer e.metaLock.Unlock() + + fields, err := e.readFields() + if err != nil { + return err + } + + // mark the measurement, series keys and the fields for deletion on the next flush + // also serves as a tombstone for any queries that come in before the flush + keyFields := e.keysWithFields(fields, seriesKeys) + e.filesLock.Lock() + defer e.filesLock.Unlock() + + e.deleteMeasurements[name] = true + for _, k := range keyFields { + e.deletes[e.keyToID(k)] = k + } + + return e.WAL.DeleteMeasurement(name, seriesKeys) +} + +// SeriesCount returns the number of series buckets on the shard. +func (e *Engine) SeriesCount() (n int, err error) { + return 0, nil +} + +// Begin starts a new transaction on the engine. 
+func (e *Engine) Begin(writable bool) (tsdb.Tx, error) { + e.queryLock.RLock() + + var files dataFiles + + // we do this to ensure that the data files haven't been deleted from a compaction + // while we were waiting to get the query lock + for { + files = e.copyFilesCollection() + + // get the query lock + for _, f := range files { + f.mu.RLock() + } + + // ensure they're all still open + reset := false + for _, f := range files { + if f.f == nil { + reset = true + break + } + } + + // if not, release and try again + if reset { + for _, f := range files { + f.mu.RUnlock() + } + continue + } + + // we're good to go + break + } + + return &tx{files: files, engine: e}, nil +} + +func (e *Engine) WriteTo(w io.Writer) (n int64, err error) { panic("not implemented") } + +func (e *Engine) keyToID(key string) uint64 { + // get the ID for the key and be sure to check if it had hash collision before + e.collisionsLock.RLock() + id, ok := e.collisions[key] + e.collisionsLock.RUnlock() + + if !ok { + id = e.HashSeriesField(key) + } + return id +} + +func (e *Engine) keyAndFieldToID(series, field string) uint64 { + key := SeriesFieldKey(series, field) + return e.keyToID(key) +} + +func (e *Engine) copyFilesCollection() []*dataFile { + e.filesLock.RLock() + defer e.filesLock.RUnlock() + a := make([]*dataFile, len(e.files)) + copy(a, e.files) + return a +} + +func (e *Engine) writeNewFields(measurementFieldsToSave map[string]*tsdb.MeasurementFields) error { + if len(measurementFieldsToSave) == 0 { + return nil + } + + // read in all the previously saved fields + fields, err := e.readFields() + if err != nil { + return err + } + + // add the new ones or overwrite old ones + for name, mf := range measurementFieldsToSave { + fields[name] = mf + } + + return e.writeFields(fields) +} + +func (e *Engine) writeFields(fields map[string]*tsdb.MeasurementFields) error { + // compress and save everything + data, err := json.Marshal(fields) + if err != nil { + return err + } + + fn := 
filepath.Join(e.path, FieldsFileExtension+"tmp") + ff, err := os.OpenFile(fn, os.O_CREATE|os.O_RDWR, 0666) + if err != nil { + return err + } + _, err = ff.Write(snappy.Encode(nil, data)) + if err != nil { + return err + } + if err := ff.Close(); err != nil { + return err + } + fieldsFileName := filepath.Join(e.path, FieldsFileExtension) + + if _, err := os.Stat(fieldsFileName); !os.IsNotExist(err) { + if err := os.Remove(fieldsFileName); err != nil { + return err + } + } + + return os.Rename(fn, fieldsFileName) +} + +func (e *Engine) readFields() (map[string]*tsdb.MeasurementFields, error) { + fields := make(map[string]*tsdb.MeasurementFields) + + f, err := os.OpenFile(filepath.Join(e.path, FieldsFileExtension), os.O_RDONLY, 0666) + if os.IsNotExist(err) { + return fields, nil + } else if err != nil { + return nil, err + } + b, err := ioutil.ReadAll(f) + if err != nil { + return nil, err + } + + data, err := snappy.Decode(nil, b) + if err != nil { + return nil, err + } + + if err := json.Unmarshal(data, &fields); err != nil { + return nil, err + } + + return fields, nil +} + +func (e *Engine) writeNewSeries(seriesToCreate []*tsdb.SeriesCreate) error { + if len(seriesToCreate) == 0 { + return nil + } + + // read in previously saved series + series, err := e.readSeries() + if err != nil { + return err + } + + // add new ones, compress and save + for _, s := range seriesToCreate { + series[s.Series.Key] = s.Series + } + + return e.writeSeries(series) +} + +func (e *Engine) writeSeries(series map[string]*tsdb.Series) error { + data, err := json.Marshal(series) + if err != nil { + return err + } + + fn := filepath.Join(e.path, SeriesFileExtension+"tmp") + ff, err := os.OpenFile(fn, os.O_CREATE|os.O_RDWR, 0666) + if err != nil { + return err + } + _, err = ff.Write(snappy.Encode(nil, data)) + if err != nil { + return err + } + if err := ff.Close(); err != nil { + return err + } + seriesFileName := filepath.Join(e.path, SeriesFileExtension) + + if _, err := 
os.Stat(seriesFileName); !os.IsNotExist(err) { + if err := os.Remove(seriesFileName); err != nil && err != os.ErrNotExist { + return err + } + } + + return os.Rename(fn, seriesFileName) +} + +func (e *Engine) readSeries() (map[string]*tsdb.Series, error) { + series := make(map[string]*tsdb.Series) + + f, err := os.OpenFile(filepath.Join(e.path, SeriesFileExtension), os.O_RDONLY, 0666) + if os.IsNotExist(err) { + return series, nil + } else if err != nil { + return nil, err + } + defer f.Close() + b, err := ioutil.ReadAll(f) + if err != nil { + return nil, err + } + + data, err := snappy.Decode(nil, b) + if err != nil { + return nil, err + } + + if err := json.Unmarshal(data, &series); err != nil { + return nil, err + } + + return series, nil +} + +// DecodeAndCombine take an encoded block from a file, decodes it and interleaves the file +// values with the values passed in. nextTime and hasNext refer to if the file +// has future encoded blocks so that this method can know how much of its values can be +// combined and output in the resulting encoded block. +func (e *Engine) DecodeAndCombine(newValues Values, block, buf []byte, nextTime int64, hasFutureBlock bool) (Values, []byte, error) { + values := newValues.DecodeSameTypeBlock(block) + + var remainingValues Values + + if hasFutureBlock { + // take all values that have times less than the future block and update the vals array + pos := sort.Search(len(newValues), func(i int) bool { + return newValues[i].Time().UnixNano() >= nextTime + }) + values = append(values, newValues[:pos]...) + remainingValues = newValues[pos:] + values = values.Deduplicate() + } else { + requireSort := values.MaxTime() >= newValues.MinTime() + values = append(values, newValues...) 
+ if requireSort { + values = values.Deduplicate() + } + } + + if len(values) > e.MaxPointsPerBlock { + remainingValues = values[e.MaxPointsPerBlock:] + values = values[:e.MaxPointsPerBlock] + } + + encoded, err := values.Encode(buf) + if err != nil { + return nil, nil, err + } + return remainingValues, encoded, nil +} + +// removeFileIfCheckpointExists will remove the file if its associated checkpoint fil is there. +// It returns true if the file was removed. This is for recovery of data files on startup +func (e *Engine) removeFileIfCheckpointExists(fileName string) bool { + checkpointName := fmt.Sprintf("%s.%s", fileName, CheckpointExtension) + _, err := os.Stat(checkpointName) + + // if there's no checkpoint, move on + if err != nil { + return false + } + + // there's a checkpoint so we know this file isn't safe so we should remove it + err = os.Remove(fileName) + if err != nil { + panic(fmt.Sprintf("error removing file %s", err.Error())) + } + + err = os.Remove(checkpointName) + if err != nil { + panic(fmt.Sprintf("error removing file %s", err.Error())) + } + + return true +} + +// cleanupMetafile will remove the tmp file if the other file exists, or rename the +// tmp file to be a regular file if the normal file is missing. This is for recovery on +// startup. 
+func (e *Engine) cleanupMetafile(name string) { + fileName := filepath.Join(e.path, name) + tmpName := fileName + "tmp" + + _, err := os.Stat(tmpName) + + // if the tmp file isn't there, we can just exit + if err != nil { + return + } + + _, err = os.Stat(fileName) + + // the regular file is there so we should just remove the tmp file + if err == nil { + err = os.Remove(tmpName) + if err != nil { + panic(fmt.Sprintf("error removing meta file %s: %s", tmpName, err.Error())) + } + } + + // regular file isn't there so have the tmp file take its place + err = os.Rename(tmpName, fileName) + if err != nil { + panic(fmt.Sprintf("error renaming meta file %s: %s", tmpName, err.Error())) + } +} + +// openFileAndCehckpoint will create a checkpoint file, open a new file for +// writing a data index, write the header and return the file +func (e *Engine) openFileAndCheckpoint(fileName string) (*os.File, error) { + checkpointFile := fmt.Sprintf("%s.%s", fileName, CheckpointExtension) + cf, err := os.OpenFile(checkpointFile, os.O_CREATE, 0666) + if err != nil { + return nil, err + } + // _, err = cf.Write(u32tob(magicNumber)) + // if err != nil { + // panic(err) + // } + if err := cf.Close(); err != nil { + return nil, err + } + _, err = os.Stat(checkpointFile) + + f, err := os.OpenFile(fileName, os.O_CREATE|os.O_RDWR, 0666) + if err != nil { + return nil, err + } + + // write the header, which is just the magic number + if _, err := f.Write(u32tob(magicNumber)); err != nil { + f.Close() + return nil, err + } + + return f, nil +} + +// removeCheckpoint removes the checkpoint for a new data file that was getting written +func (e *Engine) removeCheckpoint(fileName string) error { + checkpointFile := fmt.Sprintf("%s.%s", fileName, CheckpointExtension) + return os.Remove(checkpointFile) +} + +type dataFile struct { + f *os.File + mu sync.RWMutex + size uint32 + modTime time.Time + mmap []byte +} + +// byte size constants for the data file +const ( + fileHeaderSize = 4 + 
+	seriesCountSize    = 4
+	timeSize           = 8
+	blockHeaderSize    = 12
+	seriesIDSize       = 8
+	seriesPositionSize = 4
+	seriesHeaderSize   = seriesIDSize + seriesPositionSize
+	minTimeOffset      = 20
+	maxTimeOffset      = 12
+)
+
+func NewDataFile(f *os.File) (*dataFile, error) {
+	fInfo, err := f.Stat()
+	if err != nil {
+		return nil, err
+	}
+	mmap, err := syscall.Mmap(int(f.Fd()), 0, int(fInfo.Size()), syscall.PROT_READ, syscall.MAP_SHARED|MAP_POPULATE)
+	if err != nil {
+		return nil, err
+	}
+
+	return &dataFile{
+		f:       f,
+		mmap:    mmap,
+		size:    uint32(fInfo.Size()),
+		modTime: fInfo.ModTime(),
+	}, nil
+}
+
+func (d *dataFile) Close() error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return d.close()
+}
+
+func (d *dataFile) Delete() error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	if err := d.close(); err != nil {
+		return err
+	}
+	err := os.Remove(d.f.Name())
+	if err != nil {
+		return err
+	}
+	d.f = nil
+	return nil
+}
+
+func (d *dataFile) close() error {
+	if d.mmap == nil {
+		return nil
+	}
+	err := syscall.Munmap(d.mmap)
+	if err != nil {
+		return err
+	}
+
+	d.mmap = nil
+	return d.f.Close()
+}
+
+func (d *dataFile) MinTime() int64 {
+	minTimePosition := d.size - minTimeOffset
+	timeBytes := d.mmap[minTimePosition : minTimePosition+timeSize]
+	return int64(btou64(timeBytes))
+}
+
+func (d *dataFile) MaxTime() int64 {
+	maxTimePosition := d.size - maxTimeOffset
+	timeBytes := d.mmap[maxTimePosition : maxTimePosition+timeSize]
+	return int64(btou64(timeBytes))
+}
+
+func (d *dataFile) SeriesCount() uint32 {
+	return btou32(d.mmap[d.size-4:])
+}
+
+func (d *dataFile) IDToPosition() map[uint64]uint32 {
+	count := int(d.SeriesCount())
+	m := make(map[uint64]uint32)
+
+	indexStart := d.size - uint32(count*12+20)
+	for i := 0; i < count; i++ {
+		offset := indexStart + uint32(i*12)
+		id := btou64(d.mmap[offset : offset+8])
+		pos := btou32(d.mmap[offset+8 : offset+12])
+		m[id] = pos
+	}
+
+	return m
+}
+
+func (d *dataFile) indexPosition() uint32 {
+	return d.size -
+		uint32(d.SeriesCount()*12+20)
+}
+
+// StartingPositionForID returns the position in the file of the
+// first block for the given ID. If zero is returned the ID doesn't
+// have any data in this file.
+func (d *dataFile) StartingPositionForID(id uint64) uint32 {
+
+	seriesCount := d.SeriesCount()
+	indexStart := d.indexPosition()
+
+	min := uint32(0)
+	max := uint32(seriesCount)
+
+	for min < max {
+		mid := (max-min)/2 + min
+
+		offset := mid*seriesHeaderSize + indexStart
+		checkID := btou64(d.mmap[offset : offset+8])
+
+		if checkID == id {
+			return btou32(d.mmap[offset+8 : offset+12])
+		} else if checkID < id {
+			min = mid + 1
+		} else {
+			max = mid
+		}
+	}
+
+	return uint32(0)
+}
+
+func (d *dataFile) block(pos uint32) (id uint64, t int64, block []byte) {
+	defer func() {
+		if r := recover(); r != nil {
+			panic(fmt.Sprintf("panic decoding file: %s at position %d for id %d at time %d", d.f.Name(), pos, id, t))
+		}
+	}()
+	if pos < d.indexPosition() {
+		id = btou64(d.mmap[pos : pos+8])
+		length := btou32(d.mmap[pos+8 : pos+12])
+		block = d.mmap[pos+blockHeaderSize : pos+blockHeaderSize+length]
+		t = int64(btou64(d.mmap[pos+blockHeaderSize : pos+blockHeaderSize+8]))
+	}
+	return
+}
+
+type dataFiles []*dataFile
+
+func (a dataFiles) Len() int           { return len(a) }
+func (a dataFiles) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+func (a dataFiles) Less(i, j int) bool { return a[i].MinTime() < a[j].MinTime() }
+
+// u64tob converts a uint64 into an 8-byte slice.
+func u64tob(v uint64) []byte {
+	b := make([]byte, 8)
+	binary.BigEndian.PutUint64(b, v)
+	return b
+}
+
+func btou64(b []byte) uint64 {
+	return binary.BigEndian.Uint64(b)
+}
+
+func u32tob(v uint32) []byte {
+	b := make([]byte, 4)
+	binary.BigEndian.PutUint32(b, v)
+	return b
+}
+
+func btou32(b []byte) uint32 {
+	return binary.BigEndian.Uint32(b)
+}
+
+func hashSeriesField(key string) uint64 {
+	h := fnv.New64a()
+	h.Write([]byte(key))
+	return h.Sum64()
+}
+
+// SeriesFieldKey combines a series key and field name for a unique string to be hashed to a numeric ID
+func SeriesFieldKey(seriesKey, field string) string {
+	return seriesKey + keyFieldSeparator + field
+}
+
+func seriesAndFieldFromCompositeKey(key string) (string, string) {
+	parts := strings.Split(key, keyFieldSeparator)
+	if len(parts) != 0 {
+		return parts[0], strings.Join(parts[1:], keyFieldSeparator)
+	}
+	return parts[0], parts[1]
+}
+
+type uint64slice []uint64
+
+func (a uint64slice) Len() int           { return len(a) }
+func (a uint64slice) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+func (a uint64slice) Less(i, j int) bool { return a[i] < a[j] }
diff --git a/tsdb/engine/tsm1/tsm1_test.go b/tsdb/engine/tsm1/tsm1_test.go
new file mode 100644
index 00000000000..dbd353d7ee0
--- /dev/null
+++ b/tsdb/engine/tsm1/tsm1_test.go
@@ -0,0 +1,1379 @@
+package tsm1_test
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io/ioutil"
+	"math"
+	"os"
+	"reflect"
+	"testing"
+	"time"
+
+	"github.com/influxdb/influxdb/influxql"
+	"github.com/influxdb/influxdb/models"
+	"github.com/influxdb/influxdb/tsdb"
+	"github.com/influxdb/influxdb/tsdb/engine/tsm1"
+)
+
+func TestEngine_WriteAndReadFloats(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Cleanup()
+
+	p1 := parsePoint("cpu,host=A value=1.1 1000000000")
+	p2 := parsePoint("cpu,host=B value=1.2 1000000000")
+	p3 := parsePoint("cpu,host=A value=2.1 2000000000")
+	p4 := parsePoint("cpu,host=B value=2.2 2000000000")
+
+	if err := e.WritePoints([]models.Point{p1, p2,
p3}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + fields := []string{"value"} + + verify := func(checkSingleBVal bool) { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, v := c.SeekTo(0) + if k != p1.UnixNano() { + t.Fatalf("p1 time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + if 1.1 != v { + t.Fatal("p1 data not equal") + } + k, v = c.Next() + if k != p3.UnixNano() { + t.Fatalf("p3 time wrong:\n\texp:%d\n\tgot:%d\n", p3.UnixNano(), k) + } + if 2.1 != v { + t.Fatal("p3 data not equal") + } + k, v = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + + c = tx.Cursor("cpu,host=B", fields, nil, true) + k, v = c.SeekTo(0) + if k != p2.UnixNano() { + t.Fatalf("p2 time wrong:\n\texp:%d\n\tgot:%d\n", p2.UnixNano(), k) + } + if 1.2 != v { + t.Fatal("p2 data not equal") + } + + if checkSingleBVal { + k, v = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + } + } + verify(true) + + if err := e.WritePoints([]models.Point{p4}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + verify(false) + + tx, _ := e.Begin(false) + c := tx.Cursor("cpu,host=B", fields, nil, true) + k, v := c.SeekTo(0) + if k != p2.UnixNano() { + t.Fatalf("p2 time wrong:\n\texp:%d\n\tgot:%d\n", p2.UnixNano(), k) + } + if 1.2 != v { + t.Fatal("p2 data not equal") + } + k, v = c.Next() + if k != p4.UnixNano() { + t.Fatalf("p2 time wrong:\n\texp:%d\n\tgot:%d\n", p2.UnixNano(), k) + } + if 2.2 != v { + t.Fatal("p2 data not equal") + } + + // verify we can seek + k, v = c.SeekTo(2000000000) + if k != p4.UnixNano() { + t.Fatalf("p2 time wrong:\n\texp:%d\n\tgot:%d\n", p2.UnixNano(), k) + } + if 2.2 != v { + t.Fatal("p2 data not equal") + } + + c = tx.Cursor("cpu,host=A", fields, nil, true) + k, v = c.SeekTo(0) + if k != p1.UnixNano() { + t.Fatalf("p1 time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + if 1.1 != v { + t.Fatal("p1 data not equal") + } 
+ tx.Rollback() + + if err := e.Close(); err != nil { + t.Fatalf("error closing: %s", err.Error()) + } + + if err := e.Open(); err != nil { + t.Fatalf("error opening: %s", err.Error()) + } + + verify(false) +} + +func TestEngine_WriteIndexWithCollision(t *testing.T) { +} + +func TestEngine_WriteIndexQueryAcrossDataFiles(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + e.RotateFileSize = 10 + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=B value=1.2 1000000000") + p3 := parsePoint("cpu,host=A value=2.1 4000000000") + p4 := parsePoint("cpu,host=B value=2.2 4000000000") + + if err := e.WritePoints([]models.Point{p1, p2, p3, p4}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + p5 := parsePoint("cpu,host=A value=3.1 5000000000") + p6 := parsePoint("cpu,host=B value=3.2 5000000000") + p7 := parsePoint("cpu,host=A value=4.1 3000000000") + p8 := parsePoint("cpu,host=B value=4.2 3000000000") + + if err := e.WritePoints([]models.Point{p5, p6, p7, p8}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if count := e.DataFileCount(); count != 2 { + t.Fatalf("expected 2 data files to exist but got %d", count) + } + + fields := []string{"value"} + + verify := func(series string, points []models.Point, seek int64) { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor(series, fields, nil, true) + + k, v := c.SeekTo(seek) + p := points[0] + val := p.Fields()["value"] + if p.UnixNano() != k || val != v { + t.Fatalf("expected to seek to first point\n\texp: %d %f\n\tgot: %d %f", p.UnixNano(), val, k, v) + } + points = points[1:] + + for _, p := range points { + k, v := c.Next() + val := p.Fields()["value"] + if p.UnixNano() != k || val != v { + t.Fatalf("expected to seek to first point\n\texp: %d %f\n\tgot: %d %f", p.UnixNano(), val, k, v.(float64)) + } + } + } + + fmt.Println("v1") + verify("cpu,host=A", []models.Point{p1, p7, p3, p5}, 0) + 
fmt.Println("v2") + verify("cpu,host=B", []models.Point{p2, p8, p4, p6}, 0) + fmt.Println("v3") + verify("cpu,host=A", []models.Point{p5}, 5000000000) + fmt.Println("v4") + verify("cpu,host=B", []models.Point{p6}, 5000000000) +} + +func TestEngine_WriteOverwritePreviousPoint(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=A value=1.2 1000000000") + p3 := parsePoint("cpu,host=A value=1.3 1000000000") + + if err := e.WritePoints([]models.Point{p1, p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, v := c.SeekTo(0) + if k != p2.UnixNano() { + t.Fatalf("time wrong:\n\texp:%d\n\tgot:%d\n", p2.UnixNano(), k) + } + if 1.2 != v { + t.Fatalf("data wrong:\n\texp:%f\n\tgot:%f", 1.2, v.(float64)) + } + k, v = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + + if err := e.WritePoints([]models.Point{p3}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + tx2, _ := e.Begin(false) + defer tx2.Rollback() + c = tx2.Cursor("cpu,host=A", fields, nil, true) + k, v = c.SeekTo(0) + if k != p3.UnixNano() { + t.Fatalf("time wrong:\n\texp:%d\n\tgot:%d\n", p3.UnixNano(), k) + } + if 1.3 != v { + t.Fatalf("data wrong:\n\texp:%f\n\tgot:%f", 1.3, v.(float64)) + } + k, v = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } +} + +func TestEngine_CursorCombinesWALAndIndex(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=A value=1.2 2000000000") + + if err := e.WritePoints([]models.Point{p1}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + e.WAL.SkipCache = false + if err := e.WritePoints([]models.Point{p2}, 
nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, v := c.SeekTo(0) + if k != p1.UnixNano() { + t.Fatalf("time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + if 1.1 != v { + t.Fatalf("data wrong:\n\texp:%f\n\tgot:%f", 1.1, v.(float64)) + } + k, v = c.Next() + if k != p2.UnixNano() { + t.Fatalf("time wrong:\n\texp:%d\n\tgot:%d\n", p2.UnixNano(), k) + } + if 1.2 != v { + t.Fatalf("data wrong:\n\texp:%f\n\tgot:%f", 1.2, v.(float64)) + } + k, v = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } +} + +func TestEngine_Compaction(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + e.RotateFileSize = 10 + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=B value=1.1 1000000000") + if err := e.WritePoints([]models.Point{p1, p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + p3 := parsePoint("cpu,host=A value=2.4 4000000000") + p4 := parsePoint("cpu,host=B value=2.4 4000000000") + if err := e.WritePoints([]models.Point{p3, p4}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + p5 := parsePoint("cpu,host=A value=1.5 5000000000") + p6 := parsePoint("cpu,host=B value=2.5 5000000000") + if err := e.WritePoints([]models.Point{p5, p6}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + p7 := parsePoint("cpu,host=A value=1.5 6000000000") + p8 := parsePoint("cpu,host=B value=2.5 6000000000") + if err := e.WritePoints([]models.Point{p7, p8}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if count := e.DataFileCount(); count != 4 { + t.Fatalf("expected 3 data files to exist but got %d", count) + } + + fields := []string{"value"} + + e.CompactionAge = time.Duration(0) + + if err := e.Compact(true); err != nil { + t.Fatalf("error 
compacting: %s", err.Error()) + } + + if count := e.DataFileCount(); count != 1 { + t.Fatalf("expected compaction to reduce data file count to 1 but got %d", count) + } + + verify := func(series string, points []models.Point, seek int64) { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor(series, fields, nil, true) + + k, v := c.SeekTo(seek) + p := points[0] + val := p.Fields()["value"] + if p.UnixNano() != k || val != v { + t.Fatalf("expected to seek to first point\n\texp: %d %f\n\tgot: %d %f", p.UnixNano(), val, k, v) + } + points = points[1:] + + for _, p := range points { + k, v := c.Next() + val := p.Fields()["value"] + if p.UnixNano() != k || val != v { + t.Fatalf("expected to seek to first point\n\texp: %d %f\n\tgot: %d %f", p.UnixNano(), val, k, v.(float64)) + } + } + } + + verify("cpu,host=A", []models.Point{p1, p3, p5, p7}, 0) + verify("cpu,host=B", []models.Point{p2, p4, p6, p8}, 0) + if err := e.Close(); err != nil { + t.Fatalf("error closing: %s", err.Error()) + } + if err := e.Open(); err != nil { + t.Fatalf("error opening: %s", err.Error()) + } + verify("cpu,host=A", []models.Point{p1, p3, p5, p7}, 0) + verify("cpu,host=B", []models.Point{p2, p4, p6, p8}, 0) +} + +// Ensure that if two keys have the same fnv64-a id, we handle it +func TestEngine_KeyCollisionsAreHandled(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + // make sure two of these keys collide + e.HashSeriesField = func(key string) uint64 { + return 1 + } + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=B value=1.2 1000000000") + p3 := parsePoint("cpu,host=C value=1.3 1000000000") + + if err := e.WritePoints([]models.Point{p1, p2, p3}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + verify := func(series string, points []models.Point, seek int64) { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor(series, fields, nil, true) + + k, v := 
c.SeekTo(seek) + p := points[0] + val := p.Fields()["value"] + if p.UnixNano() != k || val != v { + t.Fatalf("expected to seek to first point\n\texp: %d %f\n\tgot: %d %f", p.UnixNano(), val, k, v) + } + points = points[1:] + + for _, p := range points { + k, v := c.Next() + val := p.Fields()["value"] + if p.UnixNano() != k || val != v { + t.Fatalf("expected to seek to first point\n\texp: %d %f\n\tgot: %d %f", p.UnixNano(), val, k, v.(float64)) + } + } + } + + verify("cpu,host=A", []models.Point{p1}, 0) + verify("cpu,host=B", []models.Point{p2}, 0) + verify("cpu,host=C", []models.Point{p3}, 0) + + p4 := parsePoint("cpu,host=A value=2.1 2000000000") + p5 := parsePoint("cpu,host=B value=2.2 2000000000") + p6 := parsePoint("cpu,host=C value=2.3 2000000000") + + if err := e.WritePoints([]models.Point{p4, p5, p6}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + verify("cpu,host=A", []models.Point{p1, p4}, 0) + verify("cpu,host=B", []models.Point{p2, p5}, 0) + verify("cpu,host=C", []models.Point{p3, p6}, 0) + + // verify collisions are handled after closing and reopening + if err := e.Close(); err != nil { + t.Fatalf("error closing: %s", err.Error()) + } + if err := e.Open(); err != nil { + t.Fatalf("error opening: %s", err.Error()) + } + + verify("cpu,host=A", []models.Point{p1, p4}, 0) + verify("cpu,host=B", []models.Point{p2, p5}, 0) + verify("cpu,host=C", []models.Point{p3, p6}, 0) + + p7 := parsePoint("cpu,host=A value=3.1 3000000000") + p8 := parsePoint("cpu,host=B value=3.2 3000000000") + p9 := parsePoint("cpu,host=C value=3.3 3000000000") + + if err := e.WritePoints([]models.Point{p7, p8, p9}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + verify("cpu,host=A", []models.Point{p1, p4, p7}, 0) + verify("cpu,host=B", []models.Point{p2, p5, p8}, 0) + verify("cpu,host=C", []models.Point{p3, p6, p9}, 0) +} + +func TestEngine_SupportMultipleFields(t *testing.T) { + e := OpenDefaultEngine() + 
defer e.Cleanup() + + fields := []string{"value", "foo"} + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=A value=1.2,foo=2.2 2000000000") + + if err := e.WritePoints([]models.Point{p1, p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, v := c.SeekTo(0) + if k != p1.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p1.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.1}) { + t.Fatalf("value wrong: %v", v) + } + k, v = c.Next() + if k != p2.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p2.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.2, "foo": 2.2}) { + t.Fatalf("value wrong: %v", v) + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + + // verify we can update a field and it's still all good + p11 := parsePoint("cpu,host=A foo=2.1 1000000000") + if err := e.WritePoints([]models.Point{p11}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + tx2, _ := e.Begin(false) + defer tx2.Rollback() + c = tx2.Cursor("cpu,host=A", fields, nil, true) + k, v = c.SeekTo(0) + if k != p1.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p1.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.1, "foo": 2.1}) { + t.Fatalf("value wrong: %v", v) + } + k, v = c.Next() + if k != p2.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p2.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.2, "foo": 2.2}) { + t.Fatalf("value wrong: %v", v) + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + + // verify it's all good with the wal in the picture + e.WAL.SkipCache = false + + p3 := parsePoint("cpu,host=A value=1.3 3000000000") + p4 := parsePoint("cpu,host=A value=1.4,foo=2.4 
4000000000") + if err := e.WritePoints([]models.Point{p3, p4}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + tx3, _ := e.Begin(false) + defer tx3.Rollback() + c = tx3.Cursor("cpu,host=A", fields, nil, true) + k, v = c.SeekTo(0) + if k != p1.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p1.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.1, "foo": 2.1}) { + t.Fatalf("value wrong: %v", v) + } + k, v = c.Next() + if k != p2.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p2.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.2, "foo": 2.2}) { + t.Fatalf("value wrong: %v", v) + } + k, v = c.Next() + if k != p3.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p3.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.3}) { + t.Fatalf("value wrong: %v", v) + } + k, v = c.Next() + if k != p4.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p2.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.4, "foo": 2.4}) { + t.Fatalf("value wrong: %v", v) + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + + p33 := parsePoint("cpu,host=A foo=2.3 3000000000") + if err := e.WritePoints([]models.Point{p33}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + tx4, _ := e.Begin(false) + defer tx4.Rollback() + c = tx4.Cursor("cpu,host=A", fields, nil, true) + k, v = c.SeekTo(0) + if k != p1.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p1.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.1, "foo": 2.1}) { + t.Fatalf("value wrong: %v", v) + } + k, v = c.Next() + if k != p2.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p2.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.2, "foo": 2.2}) { + t.Fatalf("value wrong: %v", v) + } + k, v = c.Next() + if 
k != p3.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p3.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.3, "foo": 2.3}) { + t.Fatalf("value wrong: %v", v) + } + k, v = c.Next() + if k != p4.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p2.UnixNano(), k) + } + if !reflect.DeepEqual(v, map[string]interface{}{"value": 1.4, "foo": 2.4}) { + t.Fatalf("value wrong: %v", v) + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + + // and ensure we can grab one of the fields + c = tx4.Cursor("cpu,host=A", []string{"value"}, nil, true) + k, v = c.SeekTo(4000000000) + if k != p4.UnixNano() { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", p4.UnixNano(), k) + } + if v != 1.4 { + t.Fatalf("value wrong: %v", v) + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } +} + +func TestEngine_WriteManyPointsToSingleSeries(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + var points []models.Point + for i := 1; i <= 10000; i++ { + points = append(points, parsePoint(fmt.Sprintf("cpu,host=A value=%d %d000000000", i, i))) + if i%500 == 0 { + if err := e.WritePoints(points, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + points = nil + } + } + + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, v := c.SeekTo(0) + for i := 2; i <= 10000; i++ { + k, v = c.Next() + if k != int64(i)*1000000000 { + t.Fatalf("time wrong:\n\texp: %d\n\tgot: %d", i*1000000000, k) + } + if v != float64(i) { + t.Fatalf("value wrong:\n\texp:%v\n\tgot:%v", float64(i), v) + } + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } +} + +func TestEngine_WritePointsInMultipleRequestsWithSameTime(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + e.WAL.SkipCache = false + + if err := 
e.WritePoints([]models.Point{parsePoint("foo value=1 0")}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + if err := e.WritePoints([]models.Point{parsePoint("foo value=2 0")}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + if err := e.WritePoints([]models.Point{parsePoint("foo value=3 0")}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + verify := func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("foo", fields, nil, true) + k, v := c.SeekTo(0) + if k != 0 { + t.Fatalf("expected 0 time but got %d", k) + } + if v != float64(3) { + t.Fatalf("expected 3 for value but got %f", v.(float64)) + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + } + + verify() + + if err := e.Close(); err != nil { + t.Fatalf("error closing: %s", err.Error()) + } + if err := e.Open(); err != nil { + t.Fatalf("error opening: %s", err.Error()) + } + + verify() +} + +func TestEngine_CursorDescendingOrder(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + p1 := parsePoint("foo value=1 1") + p2 := parsePoint("foo value=2 2") + + e.WAL.SkipCache = false + + if err := e.WritePoints([]models.Point{p1, p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + verify := func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("foo", fields, nil, false) + fmt.Println("seek") + k, v := c.SeekTo(5000000) + if k != 2 { + t.Fatalf("expected 2 time but got %d", k) + } + if v != float64(2) { + t.Fatalf("expected 2 for value but got %f", v.(float64)) + } + fmt.Println("next1") + k, v = c.Next() + if k != 1 { + t.Fatalf("expected 1 time but got %d", k) + } + fmt.Println("next2") + if v != float64(1) { + t.Fatalf("expected 1 for value but got %f", v.(float64)) + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF", k) + } + } + fmt.Println("verify 
1") + verify() + + if err := e.WAL.Flush(); err != nil { + t.Fatalf("error flushing WAL %s", err.Error()) + } + + fmt.Println("verify 2") + verify() + + p3 := parsePoint("foo value=3 3") + + if err := e.WritePoints([]models.Point{p3}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("foo", fields, nil, false) + k, v := c.SeekTo(234232) + if k != 3 { + t.Fatalf("expected 3 time but got %d", k) + } + if v != float64(3) { + t.Fatalf("expected 3 for value but got %f", v.(float64)) + } + k, _ = c.Next() + if k != 2 { + t.Fatalf("expected 2 time but got %d", k) + } + }() +} + +func TestEngine_CompactWithSeriesInOneFile(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + e.RotateFileSize = 10 + e.MaxPointsPerBlock = 1 + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=B value=1.2 2000000000") + p3 := parsePoint("cpu,host=A value=1.3 3000000000") + + if err := e.WritePoints([]models.Point{p1}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + if err := e.WritePoints([]models.Point{p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + if err := e.WritePoints([]models.Point{p3}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if count := e.DataFileCount(); count != 3 { + t.Fatalf("expected 3 data files but got %d", count) + } + + verify := func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, v := c.SeekTo(0) + if k != 1000000000 { + t.Fatalf("expected time 1000000000 but got %d", k) + } + if v != 1.1 { + t.Fatalf("expected value 1.1 but got %f", v.(float64)) + } + k, v = c.Next() + if k != 3000000000 { + t.Fatalf("expected time 3000000000 but got %d", k) + } + c = tx.Cursor("cpu,host=B", fields, nil, true) + k, v = 
c.SeekTo(0) + if k != 2000000000 { + t.Fatalf("expected time 2000000000 but got %d", k) + } + if v != 1.2 { + t.Fatalf("expected value 1.2 but got %f", v.(float64)) + } + } + + fmt.Println("verify 1") + verify() + + if err := e.Compact(true); err != nil { + t.Fatalf("error compacting: %s", err.Error()) + } + fmt.Println("verify 2") + verify() + + p4 := parsePoint("cpu,host=A value=1.4 4000000000") + if err := e.WritePoints([]models.Point{p4}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if err := e.Compact(true); err != nil { + t.Fatalf("error compacting: %s", err.Error()) + } + tx1, _ := e.Begin(false) + defer tx1.Rollback() + c := tx1.Cursor("cpu,host=A", fields, nil, true) + k, v := c.SeekTo(0) + if k != 1000000000 { + t.Fatalf("expected time 1000000000 but got %d", k) + } + if v != 1.1 { + t.Fatalf("expected value 1.1 but got %f", v.(float64)) + } + k, v = c.Next() + if k != 3000000000 { + t.Fatalf("expected time 3000000000 but got %d", k) + } + k, v = c.Next() + if k != 4000000000 { + t.Fatalf("expected time 3000000000 but got %d", k) + } +} + +// Ensure that compactions that happen where blocks from old data files +// skip decoding and just get copied over to the new data file works. 
+func TestEngine_CompactionWithCopiedBlocks(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + e.RotateFileSize = 10 + e.MaxPointsPerBlock = 1 + e.RotateBlockSize = 10 + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=A value=1.2 2000000000") + p3 := parsePoint("cpu,host=A value=1.3 3000000000") + + if err := e.WritePoints([]models.Point{p1, p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + if err := e.WritePoints([]models.Point{p3}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + verify := func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, _ := c.SeekTo(0) + if k != 1000000000 { + t.Fatalf("expected time 1000000000 but got %d", k) + } + k, _ = c.Next() + if k != 2000000000 { + t.Fatalf("expected time 2000000000 but got %d", k) + } + k, _ = c.Next() + if k != 3000000000 { + t.Fatalf("expected time 3000000000 but got %d", k) + } + } + + verify() + if err := e.Compact(true); err != nil { + t.Fatalf("error compacting: %s", err.Error()) + } + fmt.Println("verify 2") + verify() + + p4 := parsePoint("cpu,host=B value=1.4 4000000000") + if err := e.WritePoints([]models.Point{p4}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if err := e.Compact(true); err != nil { + t.Fatalf("error compacting: %s", err.Error()) + } + fmt.Println("verify 3") + verify() + + p5 := parsePoint("cpu,host=A value=1.5 5000000000") + p6 := parsePoint("cpu,host=A value=1.6 6000000000") + p7 := parsePoint("cpu,host=B value=2.1 7000000000") + if err := e.WritePoints([]models.Point{p5, p6, p7}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + p8 := parsePoint("cpu,host=A value=1.5 7000000000") + p9 := parsePoint("cpu,host=A value=1.6 8000000000") + p10 := parsePoint("cpu,host=B value=2.1 8000000000") + if err 
:= e.WritePoints([]models.Point{p8, p9, p10}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if err := e.Compact(true); err != nil { + t.Fatalf("error compacting: %s", err.Error()) + } + verify() + +} + +func TestEngine_RewritingOldBlocks(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + e.MaxPointsPerBlock = 2 + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=A value=1.2 2000000000") + p3 := parsePoint("cpu,host=A value=1.3 3000000000") + p4 := parsePoint("cpu,host=A value=1.5 1500000000") + + if err := e.WritePoints([]models.Point{p1, p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + if err := e.WritePoints([]models.Point{p3}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + if err := e.WritePoints([]models.Point{p4}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, _ := c.SeekTo(0) + if k != 1000000000 { + t.Fatalf("expected time 1000000000 but got %d", k) + } + k, _ = c.Next() + if k != 1500000000 { + t.Fatalf("expected time 1500000000 but got %d", k) + } + k, _ = c.Next() + if k != 2000000000 { + t.Fatalf("expected time 2000000000 but got %d", k) + } + k, _ = c.Next() + if k != 3000000000 { + t.Fatalf("expected time 3000000000 but got %d", k) + } +} + +func TestEngine_WriteIntoCompactedFile(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + e.MaxPointsPerBlock = 3 + e.RotateFileSize = 10 + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=A value=1.2 2000000000") + p3 := parsePoint("cpu,host=A value=1.3 3000000000") + p4 := parsePoint("cpu,host=A value=1.5 4000000000") + p5 := parsePoint("cpu,host=A value=1.6 2500000000") + + if err := 
e.WritePoints([]models.Point{p1, p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + if err := e.WritePoints([]models.Point{p3}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if err := e.Compact(true); err != nil { + t.Fatalf("error compacting: %s", err.Error()) + } + + if err := e.WritePoints([]models.Point{p4}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if err := e.Compact(true); err != nil { + t.Fatalf("error compacting: %s", err.Error()) + } + + if err := e.WritePoints([]models.Point{p5}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + if count := e.DataFileCount(); count != 1 { + t.Fatalf("execpted 1 data file but got %d", count) + } + + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, _ := c.SeekTo(0) + if k != 1000000000 { + t.Fatalf("wrong time: %d", k) + } + k, _ = c.Next() + if k != 2000000000 { + t.Fatalf("wrong time: %d", k) + } + k, _ = c.Next() + if k != 2500000000 { + t.Fatalf("wrong time: %d", k) + } + k, _ = c.Next() + if k != 3000000000 { + t.Fatalf("wrong time: %d", k) + } + k, _ = c.Next() + if k != 4000000000 { + t.Fatalf("wrong time: %d", k) + } +} + +func TestEngine_DuplicatePointsInWalAndIndex(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=A value=1.2 1000000000") + if err := e.WritePoints([]models.Point{p1}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + e.WAL.SkipCache = false + if err := e.WritePoints([]models.Point{p2}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, v := c.SeekTo(0) + if k != 1000000000 { + t.Fatalf("wrong time: 
%d", k) + } + if v != 1.2 { + t.Fatalf("wrong value: %f", v.(float64)) + } + k, _ = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF", k) + } +} + +func TestEngine_Deletes(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + // Create metadata. + mf := &tsdb.MeasurementFields{Fields: make(map[string]*tsdb.Field)} + mf.CreateFieldIfNotExists("value", influxql.Float, false) + atag := map[string]string{"host": "A"} + btag := map[string]string{"host": "B"} + seriesToCreate := []*tsdb.SeriesCreate{ + {Series: tsdb.NewSeries(string(models.MakeKey([]byte("cpu"), atag)), atag)}, + {Series: tsdb.NewSeries(string(models.MakeKey([]byte("cpu"), btag)), btag)}, + } + + p1 := parsePoint("cpu,host=A value=1.1 1000000001") + p2 := parsePoint("cpu,host=A value=1.2 2000000001") + p3 := parsePoint("cpu,host=B value=2.1 1000000000") + p4 := parsePoint("cpu,host=B value=2.1 2000000000") + + e.SkipCompaction = true + e.WAL.SkipCache = false + + if err := e.WritePoints([]models.Point{p1, p3}, map[string]*tsdb.MeasurementFields{"cpu": mf}, seriesToCreate); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, _ := c.SeekTo(0) + if k != p1.UnixNano() { + t.Fatalf("time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + }() + + if err := e.DeleteSeries([]string{"cpu,host=A"}); err != nil { + t.Fatalf("failed to delete series: %s", err.Error()) + } + + func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=B", fields, nil, true) + k, _ := c.SeekTo(0) + if k != p3.UnixNano() { + t.Fatalf("time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + c = tx.Cursor("cpu,host=A", fields, nil, true) + k, _ = c.SeekTo(0) + if k != tsdb.EOF { + t.Fatal("expected EOF", k) + } + }() + + if err := e.WritePoints([]models.Point{p2, p4}, nil, nil); err != nil { + t.Fatalf("failed to write 
points: %s", err.Error()) + } + + if err := e.WAL.Flush(); err != nil { + t.Fatalf("error flushing wal: %s", err.Error()) + } + + func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=A", fields, nil, true) + k, _ := c.SeekTo(0) + if k != p2.UnixNano() { + t.Fatalf("time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + }() + + if err := e.DeleteSeries([]string{"cpu,host=A"}); err != nil { + t.Fatalf("failed to delete series: %s", err.Error()) + } + + // we already know the delete on the wal works. open and close so + // the wal flushes to the index. To verify that the delete gets + // persisted and will go all the way through the index + + if err := e.Close(); err != nil { + t.Fatalf("error closing: %s", err.Error()) + } + if err := e.Open(); err != nil { + t.Fatalf("error opening: %s", err.Error()) + } + + verify := func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=B", fields, nil, true) + k, _ := c.SeekTo(0) + if k != p3.UnixNano() { + t.Fatalf("time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + c = tx.Cursor("cpu,host=A", fields, nil, true) + k, _ = c.SeekTo(0) + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + } + + fmt.Println("verify 1") + verify() + + // open and close to verify thd delete was persisted + if err := e.Close(); err != nil { + t.Fatalf("error closing: %s", err.Error()) + } + if err := e.Open(); err != nil { + t.Fatalf("error opening: %s", err.Error()) + } + + fmt.Println("verify 2") + verify() + + if err := e.DeleteSeries([]string{"cpu,host=B"}); err != nil { + t.Fatalf("failed to delete series: %s", err.Error()) + } + + func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=B", fields, nil, true) + k, _ := c.SeekTo(0) + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + }() + + if err := e.WAL.Flush(); err != nil { + t.Fatalf("error flushing: %s", err.Error()) + } + + func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := 
tx.Cursor("cpu,host=B", fields, nil, true) + k, _ := c.SeekTo(0) + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + }() + + // open and close to verify thd delete was persisted + if err := e.Close(); err != nil { + t.Fatalf("error closing: %s", err.Error()) + } + if err := e.Open(); err != nil { + t.Fatalf("error opening: %s", err.Error()) + } + + func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c := tx.Cursor("cpu,host=B", fields, nil, true) + k, _ := c.SeekTo(0) + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + }() +} + +func TestEngine_IndexGoodAfterFlush(t *testing.T) { + e := OpenDefaultEngine() + defer e.Cleanup() + + fields := []string{"value"} + + p1 := parsePoint("test,tag=a value=2.5 1443916800000000000") + p2 := parsePoint("test value=3.5 1443916810000000000") + p3 := parsePoint("test,tag=b value=6.5 1443916860000000000") + p4 := parsePoint("test value=8.5 1443916861000000000") + + e.SkipCompaction = true + e.WAL.SkipCache = false + + for _, p := range []models.Point{p1, p2, p3, p4} { + if err := e.WritePoints([]models.Point{p}, nil, nil); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + } + + verify := func() { + tx, _ := e.Begin(false) + defer tx.Rollback() + c1 := tx.Cursor("test", fields, nil, true) + c2 := tx.Cursor("test,tag=a", fields, nil, true) + c3 := tx.Cursor("test,tag=b", fields, nil, true) + k, v := c1.SeekTo(1443916800000000001) + if k != p2.UnixNano() { + t.Fatalf("time wrong: %d", k) + } + if v != 3.5 { + t.Fatalf("value wrong: %f", v.(float64)) + } + k, v = c1.Next() + if k != p4.UnixNano() { + t.Fatalf("time wrong: %d", k) + } + if v != 8.5 { + t.Fatalf("value wrong: %f", v.(float64)) + } + if k, _ := c1.Next(); k != tsdb.EOF { + t.Fatalf("expected EOF: %d", k) + } + k, _ = c2.SeekTo(1443916800000000001) + if k != tsdb.EOF { + t.Fatalf("time wrong: %d", k) + } + k, v = c3.SeekTo(1443916800000000001) + if k != p3.UnixNano() { + t.Fatalf("time wrong: %d", k) + } + if v != 6.5 { + t.Fatalf("value 
wrong: %f", v.(float64)) + } + if k, _ := c3.Next(); k != tsdb.EOF { + t.Fatalf("expected EOF: %d", k) + } + } + + fmt.Println("verify1") + verify() + fmt.Println("flush") + if err := e.WAL.Flush(); err != nil { + t.Fatalf("error flushing: %s", err.Error()) + } + fmt.Println("verify2") + verify() +} + +// Engine represents a test wrapper for tsm1.Engine. +type Engine struct { + *tsm1.Engine +} + +// NewEngine returns a new instance of Engine. +func NewEngine(opt tsdb.EngineOptions) *Engine { + dir, err := ioutil.TempDir("", "tsm1-test") + if err != nil { + panic("couldn't get temp dir") + } + + // Create test wrapper and attach mocks. + e := &Engine{ + Engine: tsm1.NewEngine(dir, dir, opt).(*tsm1.Engine), + } + + return e +} + +// OpenEngine returns an opened instance of Engine. Panic on error. +func OpenEngine(opt tsdb.EngineOptions) *Engine { + e := NewEngine(opt) + if err := e.Open(); err != nil { + panic(err) + } + e.WAL.SkipCache = true + e.SkipCompaction = true + return e +} + +// OpenDefaultEngine returns an open Engine with default options. +func OpenDefaultEngine() *Engine { return OpenEngine(tsdb.NewEngineOptions()) } + +// Cleanup closes the engine and removes all data. 
+func (e *Engine) Cleanup() error { + e.Engine.Close() + os.RemoveAll(e.Path()) + return nil +} + +func parsePoints(buf string) []models.Point { + points, err := models.ParsePointsString(buf) + if err != nil { + panic(fmt.Sprintf("couldn't parse points: %s", err.Error())) + } + return points +} + +func parsePoint(buf string) models.Point { + return parsePoints(buf)[0] +} + +func inttob(v int) []byte { + b := make([]byte, 8) + binary.BigEndian.PutUint64(b, uint64(v)) + return b +} + +func btou64(b []byte) uint64 { + return binary.BigEndian.Uint64(b) +} + +func u64tob(v uint64) []byte { + b := make([]byte, 8) + binary.BigEndian.PutUint64(b, v) + return b +} + +func btof64(b []byte) float64 { + return math.Float64frombits(binary.BigEndian.Uint64(b)) +} diff --git a/tsdb/engine/tsm1/tx.go b/tsdb/engine/tsm1/tx.go new file mode 100644 index 00000000000..54653c28728 --- /dev/null +++ b/tsdb/engine/tsm1/tx.go @@ -0,0 +1,69 @@ +package tsm1 + +import ( + "io" + + "github.com/influxdb/influxdb/tsdb" +) + +type tx struct { + files dataFiles + engine *Engine +} + +// TODO: handle multiple fields and descending +func (t *tx) Cursor(series string, fields []string, dec *tsdb.FieldCodec, ascending bool) tsdb.Cursor { + t.engine.filesLock.RLock() + defer t.engine.filesLock.RUnlock() + + // don't add the overhead of the multifield cursor if we only have one field + if len(fields) == 1 { + id := t.engine.keyAndFieldToID(series, fields[0]) + _, isDeleted := t.engine.deletes[id] + + var indexCursor tsdb.Cursor + if isDeleted { + indexCursor = &emptyCursor{ascending: ascending} + } else { + indexCursor = newCursor(id, t.files, ascending) + } + wc := t.engine.WAL.Cursor(series, fields, dec, ascending) + return NewCombinedEngineCursor(wc, indexCursor, ascending) + } + + // multiple fields. 
use just the MultiFieldCursor, which also handles time collisions + // so we don't need to use the combined cursor + cursors := make([]tsdb.Cursor, 0) + cursorFields := make([]string, 0) + for _, field := range fields { + id := t.engine.keyAndFieldToID(series, field) + _, isDeleted := t.engine.deletes[id] + + var indexCursor tsdb.Cursor + if isDeleted { + indexCursor = &emptyCursor{ascending: ascending} + } else { + indexCursor = newCursor(id, t.files, ascending) + } + wc := t.engine.WAL.Cursor(series, []string{field}, dec, ascending) + // double up the fields since there's one for the wal and one for the index + cursorFields = append(cursorFields, field, field) + cursors = append(cursors, indexCursor, wc) + } + + return NewMultiFieldCursor(cursorFields, cursors, ascending) +} + +func (t *tx) Rollback() error { + t.engine.queryLock.RUnlock() + for _, f := range t.files { + f.mu.RUnlock() + } + + return nil +} + +// TODO: refactor the Tx interface to not have Size, Commit, or WriteTo since they're not used +func (t *tx) Size() int64 { panic("not implemented") } +func (t *tx) Commit() error { panic("not implemented") } +func (t *tx) WriteTo(w io.Writer) (n int64, err error) { panic("not implemented") } diff --git a/tsdb/engine/tsm1/wal.go b/tsdb/engine/tsm1/wal.go new file mode 100644 index 00000000000..4f6607d02ae --- /dev/null +++ b/tsdb/engine/tsm1/wal.go @@ -0,0 +1,792 @@ +package tsm1 + +import ( + "encoding/json" + "fmt" + "io" + "log" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "time" + + "github.com/influxdb/influxdb/models" + "github.com/influxdb/influxdb/tsdb" + + "github.com/golang/snappy" +) + +const ( + // DefaultSegmentSize of 2MB is the size at which segment files will be rolled over + DefaultSegmentSize = 2 * 1024 * 1024 + + // FileExtension is the file extension we expect for wal segments + WALFileExtension = "wal" + + WALFilePrefix = "_" + + writeBufLen = 32 << 10 // 32kb +) + +// flushType indiciates why a flush and 
compaction are being run so the partition can +// do the appropriate type of compaction +type flushType int + +const ( + // noFlush indicates that no flush or compaction are necesssary at this time + noFlush flushType = iota + // memoryFlush indicates that we should look for the series using the most + // memory to flush out and compact all others + memoryFlush + // idleFlush indicates that we should flush all series in the parition, + // delete all segment files and hold off on opening a new one + idleFlush + // deleteFlush indicates that we're flushing because series need to be removed from the WAL + deleteFlush + // startupFlush indicates that we're flushing because the database is starting up + startupFlush +) + +// walEntry is a byte written to a wal segment file that indicates what the following compressed block contains +type walEntryType byte + +const ( + pointsEntry walEntryType = 0x01 + fieldsEntry walEntryType = 0x02 + seriesEntry walEntryType = 0x03 + deleteEntry walEntryType = 0x04 +) + +type Log struct { + path string + + flushCheckTimer *time.Timer // check this often to see if a background flush should happen + flushCheckInterval time.Duration + + // write variables + writeLock sync.Mutex + currentSegmentID int + currentSegmentFile *os.File + currentSegmentSize int + + // cache and flush variables + cacheLock sync.RWMutex + lastWriteTime time.Time + flushRunning bool + cache map[string]Values + cacheDirtySort map[string]bool // this map should be small, only for dirty vals + flushCache map[string]Values // temporary map while flushing + memorySize int + measurementFieldsCache map[string]*tsdb.MeasurementFields + seriesToCreateCache []*tsdb.SeriesCreate + + // LogOutput is the writer used by the logger. + LogOutput io.Writer + logger *log.Logger + + // FlushColdInterval is the period of time after which a partition will do a + // full flush and compaction if it has been cold for writes. 
+ FlushColdInterval time.Duration + + // SegmentSize is the file size at which a segment file will be rotated + SegmentSize int + + // FlushMemorySizeThreshold specifies when the log should be forced to be flushed + FlushMemorySizeThreshold int + + // MaxMemorySizeThreshold specifies the limit at which writes to the WAL should be rejected + MaxMemorySizeThreshold int + + // Index is the database series will be flushed to + Index IndexWriter + + // LoggingEnabled specifies if detailed logs should be output + LoggingEnabled bool + + // SkipCache specifies if the wal should immediately write to the index instead of + // caching data in memory. False by default so we buffer in memory before flushing to index. + SkipCache bool + + // SkipDurability specifies if the wal should not write the wal entries to disk. + // False by default which means all writes are durable even when cached before flushing to index. + SkipDurability bool +} + +// IndexWriter is an interface for the indexed database the WAL flushes data to +type IndexWriter interface { + Write(valuesByKey map[string]Values, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error + MarkDeletes(keys []string) + MarkMeasurementDelete(name string) +} + +func NewLog(path string) *Log { + return &Log{ + path: path, + + // these options should be overriden by any options in the config + LogOutput: os.Stderr, + FlushColdInterval: tsdb.DefaultFlushColdInterval, + SegmentSize: DefaultSegmentSize, + FlushMemorySizeThreshold: tsdb.DefaultFlushMemorySizeThreshold, + MaxMemorySizeThreshold: tsdb.DefaultMaxMemorySizeThreshold, + logger: log.New(os.Stderr, "[tsm1wal] ", log.LstdFlags), + } +} + +// Open opens and initializes the Log. 
Will recover from previous unclosed shutdowns +func (l *Log) Open() error { + + if l.LoggingEnabled { + l.logger.Printf("tsm1 WAL starting with %d flush memory size threshold and %d max memory size threshold\n", l.FlushMemorySizeThreshold, l.MaxMemorySizeThreshold) + l.logger.Printf("tsm1 WAL writing to %s\n", l.path) + } + if err := os.MkdirAll(l.path, 0777); err != nil { + return err + } + + l.cache = make(map[string]Values) + l.cacheDirtySort = make(map[string]bool) + l.measurementFieldsCache = make(map[string]*tsdb.MeasurementFields) + + // flush out any WAL entries that are there from before + if err := l.readAndFlushWAL(); err != nil { + return err + } + + return nil +} + +// Cursor will return a cursor object to Seek and iterate with Next for the WAL cache for the given. +// This should only ever be called by the engine cursor method, which will always give it +// exactly one field. +func (l *Log) Cursor(series string, fields []string, dec *tsdb.FieldCodec, ascending bool) tsdb.Cursor { + l.cacheLock.RLock() + defer l.cacheLock.RUnlock() + + if len(fields) != 1 { + panic("wal cursor should only ever be called with 1 field") + } + ck := SeriesFieldKey(series, fields[0]) + values := l.cache[ck] + + // if we're in the middle of a flush, combine the previous cache + // with this one for the cursor + if l.flushCache != nil { + if fc, ok := l.flushCache[ck]; ok { + c := make([]Value, len(fc), len(fc)+len(values)) + copy(c, fc) + c = append(c, values...) 
+ + return newWALCursor(Values(c).Deduplicate(), ascending) + } + } + + if l.cacheDirtySort[ck] { + values = Values(values).Deduplicate() + } + + // build a copy so writes afterwards don't change the result set + a := make([]Value, len(values)) + copy(a, values) + return newWALCursor(a, ascending) +} + +func (l *Log) WritePoints(points []models.Point, fields map[string]*tsdb.MeasurementFields, series []*tsdb.SeriesCreate) error { + // add everything to the cache, or return an error if we've hit our max memory + if addedToCache := l.addToCache(points, fields, series, true); !addedToCache { + return fmt.Errorf("WAL backed up flushing to index, hit max memory") + } + + // make the write durable if specified + if !l.SkipDurability { + // write the points + pointStrings := make([]string, len(points)) + for i, p := range points { + pointStrings[i] = p.String() + } + data := strings.Join(pointStrings, "\n") + compressed := snappy.Encode(nil, []byte(data)) + + if err := l.writeToLog(pointsEntry, compressed); err != nil { + return err + } + + // write the new fields + if len(fields) > 0 { + data, err := json.Marshal(fields) + if err != nil { + return err + } + compressed = snappy.Encode(compressed, data) + if err := l.writeToLog(fieldsEntry, compressed); err != nil { + return err + } + } + + // write the new series + if len(series) > 0 { + data, err := json.Marshal(series) + if err != nil { + return err + } + compressed = snappy.Encode(compressed, data) + if err := l.writeToLog(seriesEntry, compressed); err != nil { + return err + } + } + } + + // usually skipping the cache is only for testing purposes and this was the easiest + // way to represent the logic (to cache and then immediately flush) + if l.SkipCache { + l.flush(idleFlush) + } + + return nil +} + +// addToCache will add the points, measurements, and fields to the cache and return true if successful. They will be queryable +// immediately after return and will be flushed at the next flush cycle. 
Before adding to the cache we check if we're over the +// max memory threshold. If we are we request a flush in a new goroutine and return false, indicating we didn't add the values +// to the cache and that writes should return a failure. +func (l *Log) addToCache(points []models.Point, fields map[string]*tsdb.MeasurementFields, series []*tsdb.SeriesCreate, checkMemory bool) bool { + l.cacheLock.Lock() + defer l.cacheLock.Unlock() + + // if we should check memory and we're over the threshold, mark a flush as running and kick one off in a goroutine + if checkMemory && l.memorySize > l.FlushMemorySizeThreshold { + if !l.flushRunning { + l.flushRunning = true + go l.flush(memoryFlush) + } + if l.memorySize > l.MaxMemorySizeThreshold { + return false + } + } + + for _, p := range points { + for name, value := range p.Fields() { + k := SeriesFieldKey(string(p.Key()), name) + v := NewValue(p.Time(), value) + cacheValues := l.cache[k] + + // only mark it as dirty if it isn't already + if _, ok := l.cacheDirtySort[k]; !ok && len(cacheValues) > 0 { + dirty := cacheValues[len(cacheValues)-1].Time().UnixNano() >= v.Time().UnixNano() + if dirty { + l.cacheDirtySort[k] = true + } + } + l.memorySize += v.Size() + l.cache[k] = append(cacheValues, v) + } + } + + for k, v := range fields { + l.measurementFieldsCache[k] = v + } + l.seriesToCreateCache = append(l.seriesToCreateCache, series...) + l.lastWriteTime = time.Now() + + return true +} + +func (l *Log) LastWriteTime() time.Time { + l.cacheLock.RLock() + defer l.cacheLock.RUnlock() + return l.lastWriteTime +} + +// readAndFlushWAL is called on open and will read the segment files in, flushing whenever +// the memory gets over the limit. 
Once all files have been read it will flush and remove the files +func (l *Log) readAndFlushWAL() error { + files, err := l.segmentFileNames() + if err != nil { + return err + } + + // read all the segment files and cache them, flushing along the way if we + // hit memory limits + for _, fn := range files { + if err := l.readFileToCache(fn); err != nil { + return err + } + + if l.memorySize > l.MaxMemorySizeThreshold { + if err := l.flush(memoryFlush); err != nil { + return err + } + } + } + + // now flush and remove all the old files + if err := l.flush(startupFlush); err != nil { + return err + } + + return nil +} + +func (l *Log) readFileToCache(fileName string) error { + f, err := os.OpenFile(fileName, os.O_RDONLY, 0666) + if err != nil { + return err + } + defer f.Close() + + buf := make([]byte, writeBufLen) + data := make([]byte, writeBufLen) + for { + // read the type and the length of the entry + _, err := io.ReadFull(f, buf[0:5]) + if err == io.EOF { + return nil + } else if err != nil { + l.logger.Printf("error reading segment file %s: %s", fileName, err.Error()) + return err + } + entryType := buf[0] + length := btou32(buf[1:5]) + + // read the compressed block and decompress it + if int(length) > len(buf) { + buf = make([]byte, length) + } + _, err = io.ReadFull(f, buf[0:length]) + if err == io.EOF || err == io.ErrUnexpectedEOF { + l.logger.Printf("hit end of file while reading compressed wal entry from %s", fileName) + return nil + } else if err != nil { + return err + } + data, err = snappy.Decode(data, buf[0:length]) + if err != nil { + l.logger.Printf("error decoding compressed entry from %s: %s", fileName, err.Error()) + return nil + } + + // and marshal it and send it to the cache + switch walEntryType(entryType) { + case pointsEntry: + points, err := models.ParsePoints(data) + if err != nil { + return err + } + l.addToCache(points, nil, nil, false) + case fieldsEntry: + fields := make(map[string]*tsdb.MeasurementFields) + if err := 
json.Unmarshal(data, &fields); err != nil { + return err + } + l.addToCache(nil, fields, nil, false) + case seriesEntry: + series := make([]*tsdb.SeriesCreate, 0) + if err := json.Unmarshal(data, &series); err != nil { + return err + } + l.addToCache(nil, nil, series, false) + case deleteEntry: + d := &deleteData{} + if err := json.Unmarshal(data, &d); err != nil { + return err + } + l.Index.MarkDeletes(d.Keys) + l.Index.MarkMeasurementDelete(d.MeasurementName) + l.deleteKeysFromCache(d.Keys) + if d.MeasurementName != "" { + l.deleteMeasurementFromCache(d.MeasurementName) + } + } + } +} + +func (l *Log) writeToLog(writeType walEntryType, data []byte) error { + l.writeLock.Lock() + defer l.writeLock.Unlock() + + if l.currentSegmentFile == nil || l.currentSegmentSize > DefaultSegmentSize { + if err := l.newSegmentFile(); err != nil { + // fail hard since we can't write data + panic(fmt.Sprintf("error opening new segment file for wal: %s", err.Error())) + } + } + + // The panics here are an intentional choice. Based on reports from users + // it's better to fail hard if the database can't take writes. Then they'll + // get alerted and fix whatever is broken. Remove these and face Paul's wrath. 
+ if _, err := l.currentSegmentFile.Write([]byte{byte(writeType)}); err != nil { + panic(fmt.Sprintf("error writing type to wal: %s", err.Error())) + } + if _, err := l.currentSegmentFile.Write(u32tob(uint32(len(data)))); err != nil { + panic(fmt.Sprintf("error writing len to wal: %s", err.Error())) + } + if _, err := l.currentSegmentFile.Write(data); err != nil { + panic(fmt.Sprintf("error writing data to wal: %s", err.Error())) + } + + l.currentSegmentSize += 5 + len(data) + + return l.currentSegmentFile.Sync() +} + +// Flush will force a flush of the WAL to the index +func (l *Log) Flush() error { + return l.flush(idleFlush) +} + +func (l *Log) DeleteMeasurement(measurement string, keys []string) error { + d := &deleteData{MeasurementName: measurement, Keys: keys} + err := l.writeDeleteEntry(d) + if err != nil { + return err + } + + l.deleteKeysFromCache(keys) + l.deleteMeasurementFromCache(measurement) + + return nil +} + +func (l *Log) deleteMeasurementFromCache(name string) { + l.cacheLock.Lock() + defer l.cacheLock.Unlock() + delete(l.measurementFieldsCache, name) +} + +func (l *Log) writeDeleteEntry(d *deleteData) error { + js, err := json.Marshal(d) + if err != nil { + return err + } + data := snappy.Encode(nil, js) + return l.writeToLog(deleteEntry, data) +} + +func (l *Log) DeleteSeries(keys []string) error { + l.deleteKeysFromCache(keys) + + return l.writeDeleteEntry(&deleteData{Keys: keys}) +} + +func (l *Log) deleteKeysFromCache(keys []string) { + seriesKeys := make(map[string]bool) + for _, k := range keys { + series, _ := seriesAndFieldFromCompositeKey(k) + seriesKeys[series] = true + } + + l.cacheLock.Lock() + defer l.cacheLock.Unlock() + + for _, k := range keys { + delete(l.cache, k) + } + + // now remove any of these that are marked for creation + var seriesCreate []*tsdb.SeriesCreate + for _, sc := range l.seriesToCreateCache { + if _, ok := seriesKeys[sc.Series.Key]; !ok { + seriesCreate = append(seriesCreate, sc) + } + } + 
l.seriesToCreateCache = seriesCreate +} + +// Close will finish any flush that is currently in process and close file handles +func (l *Log) Close() error { + l.writeLock.Lock() + l.cacheLock.Lock() + defer l.writeLock.Unlock() + defer l.cacheLock.Unlock() + + l.cache = nil + l.measurementFieldsCache = nil + l.seriesToCreateCache = nil + if l.currentSegmentFile == nil { + return nil + } + if err := l.currentSegmentFile.Close(); err != nil { + return err + } + l.currentSegmentFile = nil + + return nil +} + +// close all the open Log partitions and file handles +func (l *Log) close() error { + l.cache = nil + l.cacheDirtySort = nil + if l.currentSegmentFile == nil { + return nil + } + if err := l.currentSegmentFile.Close(); err != nil { + return err + } + l.currentSegmentFile = nil + + return nil +} + +// flush writes all wal data in memory to the index +func (l *Log) flush(flush flushType) error { + // only flush if there isn't one already running. Memory flushes are only triggered + // by writes, which will mark the flush as running, so we can ignore it. 
+ l.cacheLock.Lock() + + if l.flushRunning && flush != memoryFlush { + l.cacheLock.Unlock() + return nil + } + + // mark the flush as running and ensure that it gets marked as not running when we return + l.flushRunning = true + defer func() { + l.cacheLock.Lock() + l.flushRunning = false + l.cacheLock.Unlock() + }() + + // only hold the lock while we rotate the segment file + l.writeLock.Lock() + lastFileID := l.currentSegmentID + // if it's an idle flush, don't open a new segment file + if flush == idleFlush { + if l.currentSegmentFile != nil { + if err := l.currentSegmentFile.Close(); err != nil { + return err + } + l.currentSegmentFile = nil + l.currentSegmentSize = 0 + } + } else { + if err := l.newSegmentFile(); err != nil { + // there's no recovering from this, fail hard + panic(fmt.Sprintf("error creating new wal file: %s", err.Error())) + } + } + l.writeLock.Unlock() + + // copy the cache items to new maps so we can empty them out + l.flushCache = make(map[string]Values) + valueCount := 0 + for key, v := range l.cache { + l.flushCache[key] = v + valueCount += len(v) + } + l.cache = make(map[string]Values) + for k, _ := range l.cacheDirtySort { + l.flushCache[k] = l.flushCache[k].Deduplicate() + } + l.cacheDirtySort = make(map[string]bool) + + flushSize := l.memorySize + + // reset the memory being used by the cache + l.memorySize = 0 + + // reset the measurements for flushing + mfc := l.measurementFieldsCache + l.measurementFieldsCache = make(map[string]*tsdb.MeasurementFields) + + // reset the series for flushing + scc := l.seriesToCreateCache + l.seriesToCreateCache = nil + + l.cacheLock.Unlock() + + // exit if there's nothing to flush to the index + if len(l.flushCache) == 0 && len(mfc) == 0 && len(scc) == 0 && flush != startupFlush { + return nil + } + + if l.LoggingEnabled { + ftype := "idle" + if flush == memoryFlush { + ftype = "memory" + } else if flush == startupFlush { + ftype = "startup" + } + l.logger.Printf("%s flush of %s with %d keys and %d 
total values of %d bytes\n", ftype, l.path, len(l.flushCache), valueCount, flushSize) + } + + startTime := time.Now() + if err := l.Index.Write(l.flushCache, mfc, scc); err != nil { + return err + } + if l.LoggingEnabled { + l.logger.Printf("%s flush to index took %s\n", l.path, time.Since(startTime)) + } + + l.cacheLock.Lock() + l.flushCache = nil + l.cacheLock.Unlock() + + // remove all the old segment files + fileNames, err := l.segmentFileNames() + if err != nil { + return err + } + for _, fn := range fileNames { + id, err := idFromFileName(fn) + if err != nil { + return err + } + if id <= lastFileID { + err := os.Remove(fn) + if err != nil { + return err + } + } + } + + return nil +} + +// segmentFileNames will return all files that are WAL segment files in sorted order by ascending ID +func (l *Log) segmentFileNames() ([]string, error) { + names, err := filepath.Glob(filepath.Join(l.path, fmt.Sprintf("%s*.%s", WALFilePrefix, WALFileExtension))) + if err != nil { + return nil, err + } + sort.Strings(names) + return names, nil +} + +// newSegmentFile will close the current segment file and open a new one, updating bookkeeping info on the log +func (l *Log) newSegmentFile() error { + l.currentSegmentID += 1 + if l.currentSegmentFile != nil { + if err := l.currentSegmentFile.Close(); err != nil { + return err + } + } + + fileName := filepath.Join(l.path, fmt.Sprintf("%s%05d.%s", WALFilePrefix, l.currentSegmentID, WALFileExtension)) + ff, err := os.OpenFile(fileName, os.O_CREATE|os.O_RDWR, 0666) + if err != nil { + return err + } + l.currentSegmentSize = 0 + l.currentSegmentFile = ff + + return nil +} + +// shouldFlush will return the flushType specifying whether we should flush. 
memoryFlush +// is never returned from this function since those can only be triggered by writes +func (l *Log) shouldFlush() flushType { + l.cacheLock.RLock() + defer l.cacheLock.RUnlock() + + if l.flushRunning { + return noFlush + } + + if len(l.cache) == 0 { + return noFlush + } + + if time.Since(l.lastWriteTime) > l.FlushColdInterval { + return idleFlush + } + + return noFlush +} + +// cursor is a unidirectional iterator for a given entry in the cache +type walCursor struct { + cache Values + position int + ascending bool +} + +func newWALCursor(cache Values, ascending bool) *walCursor { + // position is set such that a call to Next will successfully advance + // to the next postion and return the value. + c := &walCursor{cache: cache, ascending: ascending, position: -1} + if !ascending { + c.position = len(c.cache) + } + return c +} + +func (c *walCursor) Ascending() bool { return c.ascending } + +// Seek will point the cursor to the given time (or key) +func (c *walCursor) SeekTo(seek int64) (int64, interface{}) { + // Seek cache index + c.position = sort.Search(len(c.cache), func(i int) bool { + return c.cache[i].Time().UnixNano() >= seek + }) + + // If seek is not in the cache, return the last value in the cache + if !c.ascending && c.position >= len(c.cache) { + c.position = len(c.cache) - 1 + } + + // Make sure our position points to something in the cache + if c.position < 0 || c.position >= len(c.cache) { + return tsdb.EOF, nil + } + + v := c.cache[c.position] + + return v.Time().UnixNano(), v.Value() +} + +// Next moves the cursor to the next key/value. 
will return nil if at the end +func (c *walCursor) Next() (int64, interface{}) { + var v Value + if c.ascending { + v = c.nextForward() + } else { + v = c.nextReverse() + } + + return v.Time().UnixNano(), v.Value() +} + +// nextForward advances the cursor forward returning the next value +func (c *walCursor) nextForward() Value { + c.position++ + + if c.position >= len(c.cache) { + return &EmptyValue{} + } + + return c.cache[c.position] +} + +// nextReverse advances the cursor backwards returning the next value +func (c *walCursor) nextReverse() Value { + c.position-- + + if c.position < 0 { + return &EmptyValue{} + } + + return c.cache[c.position] +} + +// deleteData holds the information for a delete entry +type deleteData struct { + // MeasurementName will be empty for deletes that are only against series + MeasurementName string + Keys []string +} + +// idFromFileName parses the segment file ID from its name +func idFromFileName(name string) (int, error) { + parts := strings.Split(filepath.Base(name), ".") + if len(parts) != 2 { + return 0, fmt.Errorf("file %s has wrong name format to have an id", name) + } + + id, err := strconv.ParseUint(parts[0][1:], 10, 32) + + return int(id), err +} diff --git a/tsdb/engine/tsm1/wal_test.go b/tsdb/engine/tsm1/wal_test.go new file mode 100644 index 00000000000..9df191c7a7e --- /dev/null +++ b/tsdb/engine/tsm1/wal_test.go @@ -0,0 +1,178 @@ +package tsm1_test + +import ( + "io/ioutil" + "os" + "reflect" + "testing" + + "github.com/influxdb/influxdb/models" + "github.com/influxdb/influxdb/tsdb" + "github.com/influxdb/influxdb/tsdb/engine/tsm1" +) + +func TestWAL_TestWriteQueryOpen(t *testing.T) { + w := NewWAL() + defer w.Cleanup() + + var vals map[string]tsm1.Values + var fields map[string]*tsdb.MeasurementFields + var series []*tsdb.SeriesCreate + + w.Index = &MockIndexWriter{ + fn: func(valuesByKey map[string]tsm1.Values, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error 
{ + vals = valuesByKey + fields = measurementFieldsToSave + series = seriesToCreate + return nil + }, + } + + if err := w.Open(); err != nil { + t.Fatalf("error opening: %s", err.Error()) + } + + p1 := parsePoint("cpu,host=A value=1.1 1000000000") + p2 := parsePoint("cpu,host=B value=1.2 1000000000") + p3 := parsePoint("cpu,host=A value=2.1 2000000000") + p4 := parsePoint("cpu,host=B value=2.2 2000000000") + fieldsToWrite := map[string]*tsdb.MeasurementFields{"foo": {Fields: map[string]*tsdb.Field{"bar": {Name: "value"}}}} + seriesToWrite := []*tsdb.SeriesCreate{{Measurement: "asdf"}} + + if err := w.WritePoints([]models.Point{p1, p2}, fieldsToWrite, seriesToWrite); err != nil { + t.Fatalf("failed to write points: %s", err.Error()) + } + + fieldNames := []string{"value"} + var codec *tsdb.FieldCodec + + c := w.Cursor("cpu,host=A", fieldNames, codec, true) + k, v := c.Next() + if k != p1.UnixNano() { + t.Fatalf("p1 time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + if 1.1 != v { + t.Fatal("p1 data not equal") + } + c = w.Cursor("cpu,host=B", fieldNames, codec, true) + k, v = c.Next() + if k != p2.UnixNano() { + t.Fatalf("p2 time wrong:\n\texp:%d\n\tgot:%d\n", p2.UnixNano(), k) + } + if 1.2 != v { + t.Fatal("p2 data not equal") + } + + k, v = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF", k, v) + } + + // ensure we can do another write to the wal and get stuff + if err := w.WritePoints([]models.Point{p3}, nil, nil); err != nil { + t.Fatalf("failed to write: %s", err.Error()) + } + + c = w.Cursor("cpu,host=A", fieldNames, codec, true) + k, v = c.Next() + if k != p1.UnixNano() { + t.Fatalf("p1 time wrong:\n\texp:%d\n\tgot:%d\n", p1.UnixNano(), k) + } + if 1.1 != v { + t.Fatal("p1 data not equal") + } + k, v = c.Next() + if k != p3.UnixNano() { + t.Fatalf("p3 time wrong:\n\texp:%d\n\tgot:%d\n", p3.UnixNano(), k) + } + if 2.1 != v { + t.Fatal("p3 data not equal") + } + + // ensure we can seek + k, v = c.SeekTo(2000000000) + if k != p3.UnixNano() { + 
t.Fatalf("p3 time wrong:\n\texp:%d\n\tgot:%d\n", p3.UnixNano(), k) + } + if 2.1 != v { + t.Fatal("p3 data not equal") + } + k, v = c.Next() + if k != tsdb.EOF { + t.Fatal("expected EOF") + } + + // ensure we close and after open it flushes to the index + if err := w.Close(); err != nil { + t.Fatalf("failed to close: %s", err.Error()) + } + + if err := w.Open(); err != nil { + t.Fatalf("failed to open: %s", err.Error()) + } + + if len(vals[tsm1.SeriesFieldKey("cpu,host=A", "value")]) != 2 { + t.Fatal("expected host A values to flush to index on open") + } + + if len(vals[tsm1.SeriesFieldKey("cpu,host=B", "value")]) != 1 { + t.Fatal("expected host B values to flush to index on open") + } + + if err := w.WritePoints([]models.Point{p4}, nil, nil); err != nil { + t.Fatalf("failed to write: %s", err.Error()) + } + c = w.Cursor("cpu,host=B", fieldNames, codec, true) + k, v = c.Next() + if k != p4.UnixNano() { + t.Fatalf("p4 time wrong:\n\texp:%d\n\tgot:%d\n", p4.UnixNano(), k) + } + if 2.2 != v { + t.Fatal("p4 data not equal") + } + + if !reflect.DeepEqual(fields, fieldsToWrite) { + t.Fatal("fields not flushed") + } + + if !reflect.DeepEqual(series, seriesToWrite) { + t.Fatal("series not flushed") + } +} + +type Log struct { + *tsm1.Log + path string +} + +func NewWAL() *Log { + dir, err := ioutil.TempDir("", "tsm1-test") + if err != nil { + panic("couldn't get temp dir") + } + + l := &Log{ + Log: tsm1.NewLog(dir), + path: dir, + } + l.LoggingEnabled = true + return l +} + +func (l *Log) Cleanup() error { + l.Close() + os.RemoveAll(l.path) + return nil +} + +type MockIndexWriter struct { + fn func(valuesByKey map[string]tsm1.Values, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error +} + +func (m *MockIndexWriter) Write(valuesByKey map[string]tsm1.Values, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error { + return m.fn(valuesByKey, measurementFieldsToSave, 
seriesToCreate) +} + +func (m *MockIndexWriter) MarkDeletes(keys []string) {} + +func (m *MockIndexWriter) MarkMeasurementDelete(name string) {} diff --git a/tsdb/engine/tsm1/write_lock.go b/tsdb/engine/tsm1/write_lock.go new file mode 100644 index 00000000000..f4514e58aaa --- /dev/null +++ b/tsdb/engine/tsm1/write_lock.go @@ -0,0 +1,96 @@ +package tsm1 + +import ( + "reflect" + "sync" +) + +// writeLock is a lock that enables locking of ranges between a +// min and max value. We use this so that flushes from the WAL +// can occur concurrently along with compactions. +type WriteLock struct { + rangesLock sync.Mutex + ranges []*rangeLock +} + +// LockRange will ensure an exclusive lock between the min and +// max values inclusive. Any subsequent calls that have an +// an overlapping range will have to wait until the previous +// lock is released. A corresponding call to UnlockRange should +// be deferred. +func (w *WriteLock) LockRange(min, max int64) { + r := &rangeLock{min: min, max: max} + for { + ranges := w.currentlyLockedRanges() + + // ensure there are no currently locked ranges that overlap + for _, rr := range ranges { + if rr.overlaps(r) { + // wait until it gets unlocked + rr.mu.Lock() + // release the lock so the object can get GC'd + rr.mu.Unlock() + } + } + + // ensure that no one else got a lock on the range while we + // were waiting + w.rangesLock.Lock() + if len(w.ranges) == 0 || reflect.DeepEqual(ranges, w.ranges) { + // and lock the range + r.mu.Lock() + + // now that we know the range is free, add it to the locks + w.ranges = append(w.ranges, r) + w.rangesLock.Unlock() + return + } + + // try again + w.rangesLock.Unlock() + } +} + +// UnlockRange will release a previously locked range. 
+func (w *WriteLock) UnlockRange(min, max int64) { + w.rangesLock.Lock() + defer w.rangesLock.Unlock() + + // take the range out of the slice and unlock it + a := make([]*rangeLock, 0) + for _, r := range w.ranges { + if r.min == min && r.max == max { + r.mu.Unlock() + continue + } + a = append(a, r) + } + w.ranges = a +} + +func (w *WriteLock) currentlyLockedRanges() []*rangeLock { + w.rangesLock.Lock() + defer w.rangesLock.Unlock() + a := make([]*rangeLock, len(w.ranges)) + copy(a, w.ranges) + return a +} + +type rangeLock struct { + mu sync.Mutex + min int64 + max int64 +} + +func (r *rangeLock) overlaps(l *rangeLock) bool { + if l.min >= r.min && l.min <= r.max { + return true + } else if l.max >= r.min && l.max <= r.max { + return true + } else if l.min <= r.min && l.max >= r.max { + return true + } else if l.min >= r.min && l.max <= r.max { + return true + } + return false +} diff --git a/tsdb/engine/tsm1/write_lock_test.go b/tsdb/engine/tsm1/write_lock_test.go new file mode 100644 index 00000000000..7fa17c530c5 --- /dev/null +++ b/tsdb/engine/tsm1/write_lock_test.go @@ -0,0 +1,131 @@ +package tsm1_test + +import ( + // "sync" + "testing" + "time" + + "github.com/influxdb/influxdb/tsdb/engine/tsm1" +) + +func TestWriteLock_FullCover(t *testing.T) { + w := &tsm1.WriteLock{} + w.LockRange(2, 10) + + lock := make(chan bool) + timeout := time.NewTimer(10 * time.Millisecond) + go func() { + w.LockRange(1, 11) + lock <- true + }() + select { + case <-lock: + t.Fatal("able to get lock when we shouldn't") + case <-timeout.C: + // we're all good + } +} + +func TestWriteLock_RightIntersect(t *testing.T) { + w := &tsm1.WriteLock{} + w.LockRange(2, 10) + + lock := make(chan bool) + timeout := time.NewTimer(10 * time.Millisecond) + go func() { + w.LockRange(5, 15) + lock <- true + }() + select { + case <-lock: + t.Fatal("able to get lock when we shouldn't") + case <-timeout.C: + // we're all good + } +} + +func TestWriteLock_LeftIntersect(t *testing.T) { + w := 
&tsm1.WriteLock{} + w.LockRange(1, 4) + + lock := make(chan bool) + timeout := time.NewTimer(10 * time.Millisecond) + go func() { + w.LockRange(1, 11) + lock <- true + }() + select { + case <-lock: + t.Fatal("able to get lock when we shouldn't") + case <-timeout.C: + // we're all good + } +} + +func TestWriteLock_Inside(t *testing.T) { + w := &tsm1.WriteLock{} + w.LockRange(4, 8) + + lock := make(chan bool) + timeout := time.NewTimer(10 * time.Millisecond) + go func() { + w.LockRange(1, 11) + lock <- true + }() + select { + case <-lock: + t.Fatal("able to get lock when we shouldn't") + case <-timeout.C: + // we're all good + } +} + +func TestWriteLock_Same(t *testing.T) { + w := &tsm1.WriteLock{} + w.LockRange(2, 10) + + lock := make(chan bool) + timeout := time.NewTimer(10 * time.Millisecond) + go func() { + w.LockRange(2, 10) + lock <- true + }() + select { + case <-lock: + t.Fatal("able to get lock when we shouldn't") + case <-timeout.C: + // we're all good + } +} + +// func TestWriteLock_FreeRangeWithContentionElsewhere(t *testing.T) { +// w := &tsm1.WriteLock{} +// w.LockRange(2, 10) + +// lock := make(chan bool) +// freeRange := make(chan bool) +// timeout := time.NewTimer(10 * time.Millisecond) +// var wg sync.WaitGroup + +// wg.Add(1) +// go func() { +// wg.Done() +// w.LockRange(4, 12) +// lock <- true +// }() + +// // make sure the other go func has gotten to the point of requesting the lock +// wg.Wait() +// go func() { +// w.LockRange(15, 23) +// freeRange <- true +// }() +// select { +// case <-lock: +// t.Fatal("able to get lock when we shouldn't") +// case <-timeout.C: +// t.Fatal("unable to get lock of free range when contention exists elsewhere") +// case <-freeRange: +// // we're all good +// } +// } diff --git a/tsdb/shard.go b/tsdb/shard.go index e355cc661de..3a7215e0857 100644 --- a/tsdb/shard.go +++ b/tsdb/shard.go @@ -16,7 +16,6 @@ import ( "github.com/influxdb/influxdb/models" "github.com/influxdb/influxdb/tsdb/internal" - 
"github.com/boltdb/bolt" "github.com/gogo/protobuf/proto" ) @@ -49,7 +48,6 @@ var ( // Data can be split across many shards. The query engine in TSDB is responsible // for combining the output of many shards into a single query result. type Shard struct { - db *bolt.DB // underlying data store index *DatabaseIndex path string walPath string @@ -91,6 +89,12 @@ func NewShard(id uint64, index *DatabaseIndex, path string, walPath string, opti // Path returns the path set on the shard when it was created. func (s *Shard) Path() string { return s.path } +// PerformMaintenance gets called periodically to have the engine perform +// any maintenance tasks like WAL flushing and compaction +func (s *Shard) PerformMaintenance() { + s.engine.PerformMaintenance() +} + // open initializes and opens the shard's store. func (s *Shard) Open() error { if err := func() error { @@ -121,7 +125,7 @@ func (s *Shard) Open() error { } // Load metadata index. - if err := s.engine.LoadMetadataIndex(s.index, s.measurementFields); err != nil { + if err := s.engine.LoadMetadataIndex(s, s.index, s.measurementFields); err != nil { return fmt.Errorf("load metadata index: %s", err) } @@ -229,27 +233,30 @@ func (s *Shard) WritePoints(points []models.Point) error { } // make sure all data is encoded before attempting to save to bolt - for _, p := range points { - // Ignore if raw data has already been marshaled. - if p.Data() != nil { - continue - } + // only required for the b1 and bz1 formats + if s.engine.Format() != TSM1Format { + for _, p := range points { + // Ignore if raw data has already been marshaled. + if p.Data() != nil { + continue + } - // This was populated earlier, don't need to validate that it's there. - s.mu.RLock() - mf := s.measurementFields[p.Name()] - s.mu.RUnlock() + // This was populated earlier, don't need to validate that it's there. 
+ s.mu.RLock() + mf := s.measurementFields[p.Name()] + s.mu.RUnlock() - // If a measurement is dropped while writes for it are in progress, this could be nil - if mf == nil { - return ErrFieldNotFound - } + // If a measurement is dropped while writes for it are in progress, this could be nil + if mf == nil { + return ErrFieldNotFound + } - data, err := mf.Codec.EncodeFields(p.Fields()) - if err != nil { - return err + data, err := mf.Codec.EncodeFields(p.Fields()) + if err != nil { + return err + } + p.SetData(data) } - p.SetData(data) } // Write to the engine. @@ -360,7 +367,9 @@ func (s *Shard) createFieldsAndMeasurements(fieldsToCreate []*FieldCreate) (map[ measurementsToSave[f.Measurement] = m // add the field to the in memory index - if err := m.CreateFieldIfNotExists(f.Field.Name, f.Field.Type); err != nil { + // only limit the field count for non-tsm engines + limitFieldCount := s.engine.Format() == B1Format || s.engine.Format() == BZ1Format + if err := m.CreateFieldIfNotExists(f.Field.Name, f.Field.Type, limitFieldCount); err != nil { return nil, err } @@ -468,7 +477,7 @@ func (m *MeasurementFields) UnmarshalBinary(buf []byte) error { // CreateFieldIfNotExists creates a new field with an autoincrementing ID. // Returns an error if 255 fields have already been created on the measurement or // the fields already exists with a different type. -func (m *MeasurementFields) CreateFieldIfNotExists(name string, typ influxql.DataType) error { +func (m *MeasurementFields) CreateFieldIfNotExists(name string, typ influxql.DataType, limitCount bool) error { // Ignore if the field already exists. if f := m.Fields[name]; f != nil { if f.Type != typ { @@ -477,8 +486,8 @@ func (m *MeasurementFields) CreateFieldIfNotExists(name string, typ influxql.Dat return nil } - // Only 255 fields are allowed. If we go over that then return an error. - if len(m.Fields)+1 > math.MaxUint8 { + // If we're supposed to limit the number of fields, only 255 are allowed. 
If we go over that then return an error. + if len(m.Fields)+1 > math.MaxUint8 && limitCount { return ErrFieldOverflow } @@ -741,15 +750,22 @@ func (f *FieldCodec) DecodeByID(targetID uint8, b []byte) (interface{}, error) { // DecodeByName scans a byte slice for a field with the given name, converts it to its // expected type, and return that value. func (f *FieldCodec) DecodeByName(name string, b []byte) (interface{}, error) { - fi := f.fieldByName(name) + fi := f.FieldByName(name) if fi == nil { return 0, ErrFieldNotFound } return f.DecodeByID(fi.ID, b) } +func (f *FieldCodec) Fields() (a []*Field) { + for _, f := range f.fieldsByID { + a = append(a, f) + } + return +} + // FieldByName returns the field by its name. It will return a nil if not found -func (f *FieldCodec) fieldByName(name string) *Field { +func (f *FieldCodec) FieldByName(name string) *Field { return f.fieldsByName[name] } diff --git a/tsdb/snapshot_writer.go b/tsdb/snapshot_writer.go index 785ca13908c..4a0a2d3edef 100644 --- a/tsdb/snapshot_writer.go +++ b/tsdb/snapshot_writer.go @@ -8,7 +8,6 @@ import ( "path/filepath" "time" - "github.com/boltdb/bolt" "github.com/influxdb/influxdb/snapshot" ) @@ -83,7 +82,7 @@ func appendShardSnapshotFile(sw *snapshot.Writer, sh *Shard, name string) error } // Begin transaction. - tx, err := sh.db.Begin(false) + tx, err := sh.engine.Begin(false) if err != nil { return fmt.Errorf("begin: %s", err) } @@ -103,7 +102,7 @@ func appendShardSnapshotFile(sw *snapshot.Writer, sh *Shard, name string) error // boltTxCloser wraps a Bolt transaction to implement io.Closer. type boltTxCloser struct { - *bolt.Tx + Tx } // Close rolls back the transaction. 
diff --git a/tsdb/store.go b/tsdb/store.go index 13235a16844..be7076d00e2 100644 --- a/tsdb/store.go +++ b/tsdb/store.go @@ -9,6 +9,7 @@ import ( "strconv" "strings" "sync" + "time" "github.com/influxdb/influxdb/influxql" "github.com/influxdb/influxdb/models" @@ -27,6 +28,11 @@ func NewStore(path string) *Store { var ( ErrShardNotFound = fmt.Errorf("shard not found") + ErrStoreClosed = fmt.Errorf("store is closed") +) + +const ( + MaintenanceCheckInterval = time.Minute ) type Store struct { @@ -38,7 +44,10 @@ type Store struct { EngineOptions EngineOptions Logger *log.Logger - closing chan struct{} + + closing chan struct{} + wg sync.WaitGroup + opened bool } // Path returns the store's root path. @@ -71,7 +80,7 @@ func (s *Store) CreateShard(database, retentionPolicy string, shardID uint64) er select { case <-s.closing: - return fmt.Errorf("closing") + return ErrStoreClosed default: } @@ -124,7 +133,7 @@ func (s *Store) DeleteShard(shardID uint64) error { return err } - if err := os.Remove(sh.path); err != nil { + if err := os.RemoveAll(sh.path); err != nil { return err } @@ -301,6 +310,41 @@ func (s *Store) loadShards() error { } +// periodicMaintenance is the method called in a goroutine on the opening of the store +// to perform periodic maintenance of the shards. +func (s *Store) periodicMaintenance() { + t := time.NewTicker(MaintenanceCheckInterval) + for { + select { + case <-t.C: + s.performMaintenance() + case <-s.closing: + t.Stop() + return + } + } +} + +// performMaintenance will loop through the shards and tell them +// to perform any maintenance tasks. Those tasks should kick off +// their own goroutines if it's anything that could take time. 
+func (s *Store) performMaintenance() { + s.mu.Lock() + defer s.mu.Unlock() + for _, sh := range s.shards { + s.performMaintenanceOnShard(sh) + } +} + +func (s *Store) performMaintenanceOnShard(shard *Shard) { + defer func() { + if r := recover(); r != nil { + s.Logger.Printf("recovered error in maintenance on shard %d", shard.id) + } + }() + shard.PerformMaintenance() +} + func (s *Store) Open() error { s.mu.Lock() defer s.mu.Unlock() @@ -326,12 +370,22 @@ return err } + go s.periodicMaintenance() + s.opened = true + return nil } func (s *Store) WriteToShard(shardID uint64, points []models.Point) error { s.mu.RLock() defer s.mu.RUnlock() + + select { + case <-s.closing: + return ErrStoreClosed + default: + } + sh, ok := s.shards[shardID] if !ok { return ErrShardNotFound } @@ -367,15 +421,17 @@ func (s *Store) Close() error { s.mu.Lock() defer s.mu.Unlock() + if s.opened { + close(s.closing) + } + s.wg.Wait() + for _, sh := range s.shards { if err := sh.Close(); err != nil { return err } } - if s.closing != nil { - close(s.closing) - } - s.closing = nil + s.opened = false s.shards = nil s.databaseIndexes = nil