From e42d06277c2301a7958be61f55779c9ab1138740 Mon Sep 17 00:00:00 2001 From: Matti Ranta Date: Mon, 4 Feb 2019 10:16:54 -0500 Subject: [PATCH 1/3] add other session providers. Fix #3717 --- Gopkg.lock | 156 +- modules/setting/setting.go | 10 +- vendor/github.com/BurntSushi/toml/COPYING | 21 + .../toml/cmd/toml-test-decoder/COPYING | 21 + .../toml/cmd/toml-test-encoder/COPYING | 21 + .../BurntSushi/toml/cmd/tomlv/COPYING | 21 + vendor/github.com/BurntSushi/toml/decode.go | 509 + .../github.com/BurntSushi/toml/decode_meta.go | 121 + vendor/github.com/BurntSushi/toml/doc.go | 27 + vendor/github.com/BurntSushi/toml/encode.go | 568 + .../BurntSushi/toml/encoding_types.go | 19 + .../BurntSushi/toml/encoding_types_1.1.go | 18 + vendor/github.com/BurntSushi/toml/lex.go | 953 ++ vendor/github.com/BurntSushi/toml/parse.go | 592 ++ .../github.com/BurntSushi/toml/type_check.go | 91 + .../github.com/BurntSushi/toml/type_fields.go | 242 + .../github.com/couchbase/gomemcached/LICENSE | 19 + .../couchbase/gomemcached/client/mc.go | 1074 ++ .../couchbase/gomemcached/client/tap_feed.go | 333 + .../couchbase/gomemcached/client/transport.go | 67 + .../couchbase/gomemcached/client/upr_feed.go | 1005 ++ .../couchbase/gomemcached/mc_constants.go | 335 + .../couchbase/gomemcached/mc_req.go | 197 + .../couchbase/gomemcached/mc_res.go | 267 + .../github.com/couchbase/gomemcached/tap.go | 168 + .../github.com/couchbase/goutils/LICENSE.md | 47 + .../couchbase/goutils/logging/logger.go | 481 + .../couchbase/goutils/logging/logger_golog.go | 318 + .../couchbase/goutils/scramsha/scramsha.go | 207 + .../goutils/scramsha/scramsha_http.go | 252 + .../couchbaselabs/go-couchbase/LICENSE | 19 + .../couchbaselabs/go-couchbase/audit.go | 32 + .../couchbaselabs/go-couchbase/client.go | 1385 +++ .../couchbaselabs/go-couchbase/conn_pool.go | 387 + .../couchbaselabs/go-couchbase/ddocs.go | 288 + .../couchbaselabs/go-couchbase/observe.go | 300 + .../couchbaselabs/go-couchbase/pools.go | 1282 +++ .../couchbaselabs/go-couchbase/streaming.go | 209 + .../couchbaselabs/go-couchbase/tap.go | 143 + .../couchbaselabs/go-couchbase/upr.go | 398 + .../couchbaselabs/go-couchbase/users.go | 119 + .../couchbaselabs/go-couchbase/util.go | 49 + .../couchbaselabs/go-couchbase/vbmap.go | 77 + .../couchbaselabs/go-couchbase/views.go | 231 + .../go-macaron/session/couchbase/couchbase.go | 228 + vendor/github.com/go-macaron/session/file.go | 5 + vendor/github.com/go-macaron/session/flash.go | 61 + .../go-macaron/session/ledis/ledis.go | 227 + .../go-macaron/session/memcache/memcache.go | 204 + .../go-macaron/session/mysql/mysql.go | 200 + .../go-macaron/session/nodb/nodb.go | 208 + .../go-macaron/session/postgres/postgres.go | 201 + .../go-macaron/session/redis/redis.go | 7 +- .../github.com/go-macaron/session/session.go | 104 +- vendor/github.com/go-macaron/session/utils.go | 5 +- vendor/github.com/lunny/log/LICENSE | 27 + vendor/github.com/lunny/log/dbwriter.go | 36 + vendor/github.com/lunny/log/filewriter.go | 112 + vendor/github.com/lunny/log/logext.go | 595 ++ vendor/github.com/lunny/nodb/LICENSE | 21 + vendor/github.com/lunny/nodb/batch.go | 106 + vendor/github.com/lunny/nodb/binlog.go | 391 + vendor/github.com/lunny/nodb/binlog_util.go | 215 + vendor/github.com/lunny/nodb/config/config.go | 135 + vendor/github.com/lunny/nodb/const.go | 98 + vendor/github.com/lunny/nodb/doc.go | 61 + vendor/github.com/lunny/nodb/dump.go | 200 + vendor/github.com/lunny/nodb/info.go | 24 + vendor/github.com/lunny/nodb/multi.go | 73 + 
vendor/github.com/lunny/nodb/nodb.go | 128 + vendor/github.com/lunny/nodb/nodb_db.go | 171 + vendor/github.com/lunny/nodb/replication.go | 312 + vendor/github.com/lunny/nodb/scan.go | 144 + vendor/github.com/lunny/nodb/store/db.go | 61 + .../lunny/nodb/store/driver/batch.go | 39 + .../lunny/nodb/store/driver/driver.go | 67 + .../lunny/nodb/store/driver/store.go | 46 + .../lunny/nodb/store/goleveldb/batch.go | 27 + .../lunny/nodb/store/goleveldb/const.go | 4 + .../lunny/nodb/store/goleveldb/db.go | 187 + .../lunny/nodb/store/goleveldb/iterator.go | 49 + .../lunny/nodb/store/goleveldb/snapshot.go | 26 + .../github.com/lunny/nodb/store/iterator.go | 327 + .../github.com/lunny/nodb/store/snapshot.go | 16 + vendor/github.com/lunny/nodb/store/store.go | 51 + vendor/github.com/lunny/nodb/store/tx.go | 42 + .../github.com/lunny/nodb/store/writebatch.go | 9 + vendor/github.com/lunny/nodb/t_bit.go | 922 ++ vendor/github.com/lunny/nodb/t_hash.go | 509 + vendor/github.com/lunny/nodb/t_kv.go | 387 + vendor/github.com/lunny/nodb/t_list.go | 492 + vendor/github.com/lunny/nodb/t_set.go | 601 ++ vendor/github.com/lunny/nodb/t_ttl.go | 195 + vendor/github.com/lunny/nodb/t_zset.go | 943 ++ vendor/github.com/lunny/nodb/tx.go | 113 + vendor/github.com/lunny/nodb/util.go | 113 + vendor/github.com/pkg/errors/LICENSE | 23 + vendor/github.com/pkg/errors/errors.go | 282 + vendor/github.com/pkg/errors/stack.go | 147 + .../github.com/siddontang/go-snappy/AUTHORS | 12 + .../siddontang/go-snappy/CONTRIBUTORS | 34 + .../github.com/siddontang/go-snappy/LICENSE | 27 + .../siddontang/go-snappy/snappy/decode.go | 124 + .../siddontang/go-snappy/snappy/encode.go | 174 + .../siddontang/go-snappy/snappy/snappy.go | 38 + vendor/github.com/siddontang/go/LICENSE | 20 + vendor/github.com/siddontang/go/bson/LICENSE | 25 + .../github.com/siddontang/go/filelock/LICENSE | 27 + .../go/filelock/file_lock_generic.go | 17 + .../go/filelock/file_lock_solaris.go | 43 + .../siddontang/go/filelock/file_lock_unix.go | 51 + .../go/filelock/file_lock_windows.go | 36 + vendor/github.com/siddontang/go/hack/hack.go | 27 + .../siddontang/go/ioutil2/ioutil.go | 39 + .../siddontang/go/ioutil2/sectionwriter.go | 69 + vendor/github.com/siddontang/go/log/doc.go | 21 + .../siddontang/go/log/filehandler.go | 221 + .../github.com/siddontang/go/log/handler.go | 48 + vendor/github.com/siddontang/go/log/log.go | 343 + .../siddontang/go/log/sockethandler.go | 65 + vendor/github.com/siddontang/go/num/bytes.go | 67 + vendor/github.com/siddontang/go/num/cmp.go | 161 + vendor/github.com/siddontang/go/num/str.go | 157 + .../github.com/siddontang/go/snappy/LICENSE | 27 + .../github.com/siddontang/go/snappy/decode.go | 124 + .../github.com/siddontang/go/snappy/encode.go | 174 + .../github.com/siddontang/go/snappy/snappy.go | 38 + .../github.com/siddontang/go/sync2/atomic.go | 146 + .../siddontang/go/sync2/semaphore.go | 65 + vendor/github.com/siddontang/ledisdb/LICENSE | 21 + .../siddontang/ledisdb/client/go/LICENSE | 21 + .../siddontang/ledisdb/config/config.go | 279 + .../siddontang/ledisdb/ledis/batch.go | 138 + .../siddontang/ledisdb/ledis/const.go | 100 + .../siddontang/ledisdb/ledis/doc.go | 58 + .../siddontang/ledisdb/ledis/dump.go | 220 + .../siddontang/ledisdb/ledis/event.go | 135 + .../siddontang/ledisdb/ledis/info.go | 26 + .../siddontang/ledisdb/ledis/ledis.go | 215 + .../siddontang/ledisdb/ledis/ledis_db.go | 147 + .../siddontang/ledisdb/ledis/multi.go | 75 + .../siddontang/ledisdb/ledis/replication.go | 246 + .../siddontang/ledisdb/ledis/scan.go | 136 + 
.../siddontang/ledisdb/ledis/t_bit.go | 926 ++ .../siddontang/ledisdb/ledis/t_hash.go | 511 + .../siddontang/ledisdb/ledis/t_kv.go | 396 + .../siddontang/ledisdb/ledis/t_list.go | 665 ++ .../siddontang/ledisdb/ledis/t_set.go | 605 ++ .../siddontang/ledisdb/ledis/t_ttl.go | 210 + .../siddontang/ledisdb/ledis/t_zset.go | 1025 ++ .../github.com/siddontang/ledisdb/ledis/tx.go | 112 + .../siddontang/ledisdb/ledis/util.go | 94 + .../siddontang/ledisdb/rpl/file_io.go | 362 + .../siddontang/ledisdb/rpl/file_store.go | 414 + .../siddontang/ledisdb/rpl/file_table.go | 570 + .../siddontang/ledisdb/rpl/goleveldb_store.go | 224 + .../github.com/siddontang/ledisdb/rpl/log.go | 167 + .../github.com/siddontang/ledisdb/rpl/rpl.go | 331 + .../siddontang/ledisdb/rpl/store.go | 36 + .../siddontang/ledisdb/store/boltdb/const.go | 3 + .../siddontang/ledisdb/store/boltdb/db.go | 172 + .../ledisdb/store/boltdb/iterator.go | 50 + .../ledisdb/store/boltdb/snapshot.go | 37 + .../siddontang/ledisdb/store/boltdb/tx.go | 61 + .../github.com/siddontang/ledisdb/store/db.go | 179 + .../siddontang/ledisdb/store/driver/batch.go | 70 + .../siddontang/ledisdb/store/driver/driver.go | 79 + .../siddontang/ledisdb/store/driver/slice.go | 21 + .../siddontang/ledisdb/store/driver/store.go | 45 + .../ledisdb/store/goleveldb/batch.go | 39 + .../ledisdb/store/goleveldb/const.go | 4 + .../siddontang/ledisdb/store/goleveldb/db.go | 208 + .../ledisdb/store/goleveldb/iterator.go | 49 + .../ledisdb/store/goleveldb/snapshot.go | 26 + .../siddontang/ledisdb/store/iterator.go | 333 + .../siddontang/ledisdb/store/leveldb/batch.go | 104 + .../siddontang/ledisdb/store/leveldb/cache.go | 20 + .../siddontang/ledisdb/store/leveldb/const.go | 3 + .../siddontang/ledisdb/store/leveldb/db.go | 315 + .../ledisdb/store/leveldb/filterpolicy.go | 21 + .../ledisdb/store/leveldb/iterator.go | 70 + .../ledisdb/store/leveldb/leveldb_ext.cc | 95 + .../ledisdb/store/leveldb/leveldb_ext.h | 41 + .../ledisdb/store/leveldb/options.go | 122 + .../siddontang/ledisdb/store/leveldb/slice.go | 40 + .../ledisdb/store/leveldb/snapshot.go | 39 + .../siddontang/ledisdb/store/leveldb/util.go | 45 + .../siddontang/ledisdb/store/mdb/const.go | 3 + .../siddontang/ledisdb/store/mdb/mdb.go | 316 + .../siddontang/ledisdb/store/mdb/snapshot.go | 43 + .../siddontang/ledisdb/store/mdb/tx.go | 90 + .../siddontang/ledisdb/store/rocksdb/batch.go | 83 + .../siddontang/ledisdb/store/rocksdb/cache.go | 20 + .../siddontang/ledisdb/store/rocksdb/const.go | 3 + .../siddontang/ledisdb/store/rocksdb/db.go | 346 + .../siddontang/ledisdb/store/rocksdb/env.go | 27 + .../ledisdb/store/rocksdb/filterpolicy.go | 21 + .../ledisdb/store/rocksdb/iterator.go | 70 + .../ledisdb/store/rocksdb/options.go | 233 + .../ledisdb/store/rocksdb/rocksdb_ext.cc | 44 + .../ledisdb/store/rocksdb/rocksdb_ext.h | 24 + .../siddontang/ledisdb/store/rocksdb/slice.go | 41 + .../ledisdb/store/rocksdb/snapshot.go | 39 + .../siddontang/ledisdb/store/rocksdb/util.go | 54 + .../siddontang/ledisdb/store/slice.go | 9 + .../siddontang/ledisdb/store/snapshot.go | 48 + .../siddontang/ledisdb/store/stat.go | 37 + .../siddontang/ledisdb/store/store.go | 63 + .../github.com/siddontang/ledisdb/store/tx.go | 86 + .../siddontang/ledisdb/store/writebatch.go | 150 + vendor/github.com/syndtr/goleveldb/LICENSE | 24 + .../syndtr/goleveldb/leveldb/batch.go | 349 + .../syndtr/goleveldb/leveldb/cache/cache.go | 704 ++ .../syndtr/goleveldb/leveldb/cache/lru.go | 195 + .../syndtr/goleveldb/leveldb/comparer.go | 67 + 
.../leveldb/comparer/bytes_comparer.go | 51 + .../goleveldb/leveldb/comparer/comparer.go | 57 + .../github.com/syndtr/goleveldb/leveldb/db.go | 1175 +++ .../syndtr/goleveldb/leveldb/db_compaction.go | 854 ++ .../syndtr/goleveldb/leveldb/db_iter.go | 360 + .../syndtr/goleveldb/leveldb/db_snapshot.go | 183 + .../syndtr/goleveldb/leveldb/db_state.go | 239 + .../goleveldb/leveldb/db_transaction.go | 325 + .../syndtr/goleveldb/leveldb/db_util.go | 102 + .../syndtr/goleveldb/leveldb/db_write.go | 464 + .../syndtr/goleveldb/leveldb/doc.go | 92 + .../syndtr/goleveldb/leveldb/errors.go | 20 + .../syndtr/goleveldb/leveldb/errors/errors.go | 78 + .../syndtr/goleveldb/leveldb/filter.go | 31 + .../syndtr/goleveldb/leveldb/filter/bloom.go | 116 + .../syndtr/goleveldb/leveldb/filter/filter.go | 60 + .../goleveldb/leveldb/iterator/array_iter.go | 184 + .../leveldb/iterator/indexed_iter.go | 242 + .../syndtr/goleveldb/leveldb/iterator/iter.go | 132 + .../goleveldb/leveldb/iterator/merged_iter.go | 304 + .../goleveldb/leveldb/journal/journal.go | 524 + .../syndtr/goleveldb/leveldb/key.go | 143 + .../syndtr/goleveldb/leveldb/memdb/memdb.go | 475 + .../syndtr/goleveldb/leveldb/opt/options.go | 697 ++ .../syndtr/goleveldb/leveldb/options.go | 107 + .../syndtr/goleveldb/leveldb/session.go | 210 + .../goleveldb/leveldb/session_compaction.go | 302 + .../goleveldb/leveldb/session_record.go | 323 + .../syndtr/goleveldb/leveldb/session_util.go | 271 + .../syndtr/goleveldb/leveldb/storage.go | 63 + .../goleveldb/leveldb/storage/file_storage.go | 671 ++ .../leveldb/storage/file_storage_nacl.go | 34 + .../leveldb/storage/file_storage_plan9.go | 63 + .../leveldb/storage/file_storage_solaris.go | 81 + .../leveldb/storage/file_storage_unix.go | 98 + .../leveldb/storage/file_storage_windows.go | 78 + .../goleveldb/leveldb/storage/mem_storage.go | 222 + .../goleveldb/leveldb/storage/storage.go | 187 + .../syndtr/goleveldb/leveldb/table.go | 531 + .../syndtr/goleveldb/leveldb/table/reader.go | 1135 ++ .../syndtr/goleveldb/leveldb/table/table.go | 177 + .../syndtr/goleveldb/leveldb/table/writer.go | 375 + .../syndtr/goleveldb/leveldb/util.go | 98 + .../syndtr/goleveldb/leveldb/util/buffer.go | 293 + .../goleveldb/leveldb/util/buffer_pool.go | 239 + .../syndtr/goleveldb/leveldb/util/crc32.go | 30 + .../syndtr/goleveldb/leveldb/util/hash.go | 48 + .../syndtr/goleveldb/leveldb/util/range.go | 32 + .../syndtr/goleveldb/leveldb/util/util.go | 73 + .../syndtr/goleveldb/leveldb/version.go | 528 + vendor/github.com/szferi/gomdb/LICENSE | 10 + vendor/github.com/szferi/gomdb/cursor.go | 103 + vendor/github.com/szferi/gomdb/env.go | 227 + vendor/github.com/szferi/gomdb/lmdb.h | 1553 +++ vendor/github.com/szferi/gomdb/mdb.c | 9364 +++++++++++++++++ vendor/github.com/szferi/gomdb/mdb.go | 13 + vendor/github.com/szferi/gomdb/midl.c | 360 + vendor/github.com/szferi/gomdb/midl.h | 186 + vendor/github.com/szferi/gomdb/txn.go | 197 + vendor/github.com/szferi/gomdb/val.go | 52 + 275 files changed, 64598 insertions(+), 65 deletions(-) create mode 100644 vendor/github.com/BurntSushi/toml/COPYING create mode 100644 vendor/github.com/BurntSushi/toml/cmd/toml-test-decoder/COPYING create mode 100644 vendor/github.com/BurntSushi/toml/cmd/toml-test-encoder/COPYING create mode 100644 vendor/github.com/BurntSushi/toml/cmd/tomlv/COPYING create mode 100644 vendor/github.com/BurntSushi/toml/decode.go create mode 100644 vendor/github.com/BurntSushi/toml/decode_meta.go create mode 100644 vendor/github.com/BurntSushi/toml/doc.go create mode 100644 
vendor/github.com/BurntSushi/toml/encode.go create mode 100644 vendor/github.com/BurntSushi/toml/encoding_types.go create mode 100644 vendor/github.com/BurntSushi/toml/encoding_types_1.1.go create mode 100644 vendor/github.com/BurntSushi/toml/lex.go create mode 100644 vendor/github.com/BurntSushi/toml/parse.go create mode 100644 vendor/github.com/BurntSushi/toml/type_check.go create mode 100644 vendor/github.com/BurntSushi/toml/type_fields.go create mode 100644 vendor/github.com/couchbase/gomemcached/LICENSE create mode 100644 vendor/github.com/couchbase/gomemcached/client/mc.go create mode 100644 vendor/github.com/couchbase/gomemcached/client/tap_feed.go create mode 100644 vendor/github.com/couchbase/gomemcached/client/transport.go create mode 100644 vendor/github.com/couchbase/gomemcached/client/upr_feed.go create mode 100644 vendor/github.com/couchbase/gomemcached/mc_constants.go create mode 100644 vendor/github.com/couchbase/gomemcached/mc_req.go create mode 100644 vendor/github.com/couchbase/gomemcached/mc_res.go create mode 100644 vendor/github.com/couchbase/gomemcached/tap.go create mode 100644 vendor/github.com/couchbase/goutils/LICENSE.md create mode 100644 vendor/github.com/couchbase/goutils/logging/logger.go create mode 100644 vendor/github.com/couchbase/goutils/logging/logger_golog.go create mode 100644 vendor/github.com/couchbase/goutils/scramsha/scramsha.go create mode 100644 vendor/github.com/couchbase/goutils/scramsha/scramsha_http.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/LICENSE create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/audit.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/client.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/conn_pool.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/ddocs.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/observe.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/pools.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/streaming.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/tap.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/upr.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/users.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/util.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/vbmap.go create mode 100644 vendor/github.com/couchbaselabs/go-couchbase/views.go create mode 100644 vendor/github.com/go-macaron/session/couchbase/couchbase.go create mode 100644 vendor/github.com/go-macaron/session/flash.go create mode 100644 vendor/github.com/go-macaron/session/ledis/ledis.go create mode 100644 vendor/github.com/go-macaron/session/memcache/memcache.go create mode 100644 vendor/github.com/go-macaron/session/mysql/mysql.go create mode 100644 vendor/github.com/go-macaron/session/nodb/nodb.go create mode 100644 vendor/github.com/go-macaron/session/postgres/postgres.go create mode 100644 vendor/github.com/lunny/log/LICENSE create mode 100644 vendor/github.com/lunny/log/dbwriter.go create mode 100644 vendor/github.com/lunny/log/filewriter.go create mode 100644 vendor/github.com/lunny/log/logext.go create mode 100644 vendor/github.com/lunny/nodb/LICENSE create mode 100644 vendor/github.com/lunny/nodb/batch.go create mode 100644 vendor/github.com/lunny/nodb/binlog.go create mode 100644 vendor/github.com/lunny/nodb/binlog_util.go create mode 100644 vendor/github.com/lunny/nodb/config/config.go create mode 
100644 vendor/github.com/lunny/nodb/const.go create mode 100644 vendor/github.com/lunny/nodb/doc.go create mode 100644 vendor/github.com/lunny/nodb/dump.go create mode 100644 vendor/github.com/lunny/nodb/info.go create mode 100644 vendor/github.com/lunny/nodb/multi.go create mode 100644 vendor/github.com/lunny/nodb/nodb.go create mode 100644 vendor/github.com/lunny/nodb/nodb_db.go create mode 100644 vendor/github.com/lunny/nodb/replication.go create mode 100644 vendor/github.com/lunny/nodb/scan.go create mode 100644 vendor/github.com/lunny/nodb/store/db.go create mode 100644 vendor/github.com/lunny/nodb/store/driver/batch.go create mode 100644 vendor/github.com/lunny/nodb/store/driver/driver.go create mode 100644 vendor/github.com/lunny/nodb/store/driver/store.go create mode 100644 vendor/github.com/lunny/nodb/store/goleveldb/batch.go create mode 100644 vendor/github.com/lunny/nodb/store/goleveldb/const.go create mode 100644 vendor/github.com/lunny/nodb/store/goleveldb/db.go create mode 100644 vendor/github.com/lunny/nodb/store/goleveldb/iterator.go create mode 100644 vendor/github.com/lunny/nodb/store/goleveldb/snapshot.go create mode 100644 vendor/github.com/lunny/nodb/store/iterator.go create mode 100644 vendor/github.com/lunny/nodb/store/snapshot.go create mode 100644 vendor/github.com/lunny/nodb/store/store.go create mode 100644 vendor/github.com/lunny/nodb/store/tx.go create mode 100644 vendor/github.com/lunny/nodb/store/writebatch.go create mode 100644 vendor/github.com/lunny/nodb/t_bit.go create mode 100644 vendor/github.com/lunny/nodb/t_hash.go create mode 100644 vendor/github.com/lunny/nodb/t_kv.go create mode 100644 vendor/github.com/lunny/nodb/t_list.go create mode 100644 vendor/github.com/lunny/nodb/t_set.go create mode 100644 vendor/github.com/lunny/nodb/t_ttl.go create mode 100644 vendor/github.com/lunny/nodb/t_zset.go create mode 100644 vendor/github.com/lunny/nodb/tx.go create mode 100644 vendor/github.com/lunny/nodb/util.go create mode 100644 vendor/github.com/pkg/errors/LICENSE create mode 100644 vendor/github.com/pkg/errors/errors.go create mode 100644 vendor/github.com/pkg/errors/stack.go create mode 100644 vendor/github.com/siddontang/go-snappy/AUTHORS create mode 100644 vendor/github.com/siddontang/go-snappy/CONTRIBUTORS create mode 100644 vendor/github.com/siddontang/go-snappy/LICENSE create mode 100644 vendor/github.com/siddontang/go-snappy/snappy/decode.go create mode 100644 vendor/github.com/siddontang/go-snappy/snappy/encode.go create mode 100644 vendor/github.com/siddontang/go-snappy/snappy/snappy.go create mode 100644 vendor/github.com/siddontang/go/LICENSE create mode 100644 vendor/github.com/siddontang/go/bson/LICENSE create mode 100644 vendor/github.com/siddontang/go/filelock/LICENSE create mode 100644 vendor/github.com/siddontang/go/filelock/file_lock_generic.go create mode 100644 vendor/github.com/siddontang/go/filelock/file_lock_solaris.go create mode 100644 vendor/github.com/siddontang/go/filelock/file_lock_unix.go create mode 100644 vendor/github.com/siddontang/go/filelock/file_lock_windows.go create mode 100644 vendor/github.com/siddontang/go/hack/hack.go create mode 100644 vendor/github.com/siddontang/go/ioutil2/ioutil.go create mode 100644 vendor/github.com/siddontang/go/ioutil2/sectionwriter.go create mode 100644 vendor/github.com/siddontang/go/log/doc.go create mode 100644 vendor/github.com/siddontang/go/log/filehandler.go create mode 100644 vendor/github.com/siddontang/go/log/handler.go create mode 100644 
vendor/github.com/siddontang/go/log/log.go create mode 100644 vendor/github.com/siddontang/go/log/sockethandler.go create mode 100644 vendor/github.com/siddontang/go/num/bytes.go create mode 100644 vendor/github.com/siddontang/go/num/cmp.go create mode 100644 vendor/github.com/siddontang/go/num/str.go create mode 100644 vendor/github.com/siddontang/go/snappy/LICENSE create mode 100644 vendor/github.com/siddontang/go/snappy/decode.go create mode 100644 vendor/github.com/siddontang/go/snappy/encode.go create mode 100644 vendor/github.com/siddontang/go/snappy/snappy.go create mode 100644 vendor/github.com/siddontang/go/sync2/atomic.go create mode 100644 vendor/github.com/siddontang/go/sync2/semaphore.go create mode 100644 vendor/github.com/siddontang/ledisdb/LICENSE create mode 100644 vendor/github.com/siddontang/ledisdb/client/go/LICENSE create mode 100644 vendor/github.com/siddontang/ledisdb/config/config.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/batch.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/const.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/doc.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/dump.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/event.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/info.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/ledis.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/ledis_db.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/multi.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/replication.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/scan.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_bit.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_hash.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_kv.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_list.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_set.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_ttl.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_zset.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/tx.go create mode 100644 vendor/github.com/siddontang/ledisdb/ledis/util.go create mode 100644 vendor/github.com/siddontang/ledisdb/rpl/file_io.go create mode 100644 vendor/github.com/siddontang/ledisdb/rpl/file_store.go create mode 100644 vendor/github.com/siddontang/ledisdb/rpl/file_table.go create mode 100644 vendor/github.com/siddontang/ledisdb/rpl/goleveldb_store.go create mode 100644 vendor/github.com/siddontang/ledisdb/rpl/log.go create mode 100644 vendor/github.com/siddontang/ledisdb/rpl/rpl.go create mode 100644 vendor/github.com/siddontang/ledisdb/rpl/store.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/const.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/db.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/iterator.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/snapshot.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/tx.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/db.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/driver/batch.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/driver/driver.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/driver/slice.go create mode 
100644 vendor/github.com/siddontang/ledisdb/store/driver/store.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/batch.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/const.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/db.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/iterator.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/snapshot.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/iterator.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/batch.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/cache.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/const.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/db.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/filterpolicy.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/iterator.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.cc create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.h create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/options.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/slice.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/snapshot.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/util.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/mdb/const.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/mdb/mdb.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/mdb/snapshot.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/mdb/tx.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/batch.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/cache.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/const.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/db.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/env.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/filterpolicy.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/iterator.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/options.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.cc create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.h create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/slice.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/snapshot.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/util.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/slice.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/snapshot.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/stat.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/store.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/tx.go create mode 100644 vendor/github.com/siddontang/ledisdb/store/writebatch.go create mode 100644 vendor/github.com/syndtr/goleveldb/LICENSE create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/batch.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/cache/cache.go create mode 100644 
vendor/github.com/syndtr/goleveldb/leveldb/cache/lru.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/comparer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/comparer/bytes_comparer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/comparer/comparer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_snapshot.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_state.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_transaction.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_util.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_write.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/doc.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/errors.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/errors/errors.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/filter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/filter/bloom.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/filter/filter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/iterator/iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/journal/journal.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/key.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/opt/options.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/options.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/session.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/session_compaction.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/session_record.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/session_util.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_nacl.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_plan9.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_windows.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/mem_storage.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/storage.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/table.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/table/reader.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/table/table.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/table/writer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util.go create mode 
100644 vendor/github.com/syndtr/goleveldb/leveldb/util/buffer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/crc32.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/hash.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/range.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/util.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/version.go create mode 100644 vendor/github.com/szferi/gomdb/LICENSE create mode 100644 vendor/github.com/szferi/gomdb/cursor.go create mode 100644 vendor/github.com/szferi/gomdb/env.go create mode 100644 vendor/github.com/szferi/gomdb/lmdb.h create mode 100644 vendor/github.com/szferi/gomdb/mdb.c create mode 100644 vendor/github.com/szferi/gomdb/mdb.go create mode 100644 vendor/github.com/szferi/gomdb/midl.c create mode 100644 vendor/github.com/szferi/gomdb/midl.h create mode 100644 vendor/github.com/szferi/gomdb/txn.go create mode 100644 vendor/github.com/szferi/gomdb/val.go diff --git a/Gopkg.lock b/Gopkg.lock index 65cdf7efa36c..546d0bf60496 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -17,6 +17,14 @@ pruneopts = "NUT" revision = "d5a42771e7e851e8a89c5c6ffa0f5b075342f9df" +[[projects]] + digest = "1:5d72bbcc9c8667b11c3dc3cbe681c5a6f71e5096744c0bf7726ab5c6425d5dc4" + name = "github.com/BurntSushi/toml" + packages = ["."] + pruneopts = "NUT" + revision = "3012a1dbe2e4bd1391d42b32f0577cb7bbc7f005" + version = "v0.3.1" + [[projects]] digest = "1:3fcef06a1a6561955c94af6c7757a6fa37605eb653f0d06ab960e5bb80092195" name = "github.com/PuerkitoBio/goquery" @@ -185,6 +193,28 @@ pruneopts = "NUT" revision = "098da33fde5f9220736531b3cb26a2dec86a8367" +[[projects]] + branch = "master" + digest = "1:eb205556fe75307c6d2b58d4159e7c2da23e2666481d352c66d4055bebf45a3c" + name = "github.com/couchbase/gomemcached" + packages = [ + ".", + "client", + ] + pruneopts = "NUT" + revision = "5125a94a666c83cb9b7a60907833cd320b84c20f" + +[[projects]] + branch = "master" + digest = "1:ea03e12e246f7708a7b7ab3ad04e96d21ce73f48bb56258bc2bffeed474212e6" + name = "github.com/couchbase/goutils" + packages = [ + "logging", + "scramsha", + ] + pruneopts = "NUT" + revision = "e865a1461c8ac0032bd37e2d4dab3289faea3873" + [[projects]] branch = "master" digest = "1:82e1ad11d777f7bff9a1fc678a8a534a318f85e5026a8a4d6f4a94a6b0678bb6" @@ -197,6 +227,14 @@ pruneopts = "NUT" revision = "eb6ae3743b3f300f2136f83ca78c08cc071edbd4" +[[projects]] + branch = "master" + digest = "1:df592f4b82b993fcac270862376c34210776b8b0334a0f59f4d9d80467713ffa" + name = "github.com/couchbaselabs/go-couchbase" + packages = ["."] + pruneopts = "NUT" + revision = "d904413d884d1fb849e2ad8834619f661761ef57" + [[projects]] digest = "1:a2c1d0e43bd3baaa071d1b9ed72c27d78169b2b269f71c105ac4ba34b1be4a39" name = "github.com/davecgh/go-spew" @@ -358,14 +396,20 @@ [[projects]] branch = "master" - digest = "1:8fea5718d84af17762195beb6fe92a0d6c1048452a1dbc464d227f12e0cff0cc" + digest = "1:8d322b03fdb82ca78e19c2f90f8eb4ad88b53e3b71cbd01d0b3a1d73beb1abb2" name = "github.com/go-macaron/session" packages = [ ".", + "couchbase", + "ledis", + "memcache", + "mysql", + "nodb", + "postgres", "redis", ] pruneopts = "NUT" - revision = "330e4e4d8beb7b00111ac34539561f46f94c4458" + revision = "0a0a789bf1934e55fde19629869caa015a40c525" [[projects]] digest = "1:758d2371fcdee6d02565901b348729053c636055e67ef6e17aa466c7ff6cc57c" @@ -579,6 +623,28 @@ pruneopts = "NUT" revision = 
"e3534c89ef969912856dfa39e56b09e58c5f5daf" +[[projects]] + digest = "1:1e6a29ed1f189354030e3371f63ec58aacbc2bf232fd104c6e0d41174ac5af48" + name = "github.com/lunny/log" + packages = ["."] + pruneopts = "NUT" + revision = "7887c61bf0de75586961948b286be6f7d05d9f58" + version = "v0.1" + +[[projects]] + branch = "master" + digest = "1:683d835728cb95d176d423b522420eb5e4ec859b276bca18466476b82b3ebc4c" + name = "github.com/lunny/nodb" + packages = [ + ".", + "config", + "store", + "store/driver", + "store/goleveldb", + ] + pruneopts = "NUT" + revision = "fc1ef06ad4af0da31cdb87e3fa5ec084c67e6597" + [[projects]] digest = "1:aa7dcd6a0db70d514821f8739d0a22e7df33b499d8d399cf15b2858d44f8319e" name = "github.com/markbates/goth" @@ -682,6 +748,14 @@ revision = "bb6d471dc95d4fe11e432687f8b70ff496cf3136" version = "v1.0.0" +[[projects]] + digest = "1:14715f705ff5dfe0ffd6571d7d201dd8e921030f8070321a79380d8ca4ec1a24" + name = "github.com/pkg/errors" + packages = ["."] + pruneopts = "NUT" + revision = "ba968bfe8b2f7e042a574c888954fccecfa385b4" + version = "v0.8.1" + [[projects]] digest = "1:0028cb19b2e4c3112225cd871870f2d9cf49b9b4276531f03438a88e94be86fe" name = "github.com/pmezard/go-difflib" @@ -775,6 +849,49 @@ pruneopts = "NUT" revision = "1dba4b3954bc059efc3991ec364f9f9a35f597d2" +[[projects]] + branch = "master" + digest = "1:81cd039986aace9719c68a9794fa8c9dd1007cffa1ff8995631e8ed35aacf6fe" + name = "github.com/siddontang/go" + packages = [ + "filelock", + "hack", + "ioutil2", + "log", + "num", + "snappy", + "sync2", + ] + pruneopts = "NUT" + revision = "bdc77568d726a8702315ec4eafda030b6abc4f43" + +[[projects]] + branch = "master" + digest = "1:dbda803f21e60c38de7d9f884390f2ebbe234ce0c3d139b65bbb36b03a99d266" + name = "github.com/siddontang/go-snappy" + packages = ["snappy"] + pruneopts = "NUT" + revision = "d8f7bb82a96d89c1254e5a6c967134e1433c9ee2" + +[[projects]] + digest = "1:b8d24bf8620181e3232d6fa1638201ba6d808b4f2dc9b620863554cf9b383748" + name = "github.com/siddontang/ledisdb" + packages = [ + "config", + "ledis", + "rpl", + "store", + "store/boltdb", + "store/driver", + "store/goleveldb", + "store/leveldb", + "store/mdb", + "store/rocksdb", + ] + pruneopts = "NUT" + revision = "af217791d176389b9d9f2d41c414ca2f1de005cf" + version = "v0.4" + [[projects]] digest = "1:89fd77d603a74a6540d60067debad9397865bf040955d907362c95d364baeba6" name = "github.com/src-d/gcfg" @@ -804,6 +921,35 @@ revision = "12b6f73e6084dad08a7c6e575284b177ecafbc71" version = "v1.2.1" +[[projects]] + branch = "master" + digest = "1:685fdfea42d825ebd39ee0994354b46c374cf2c2b2d97a41a8dee1807c6a9b62" + name = "github.com/syndtr/goleveldb" + packages = [ + "leveldb", + "leveldb/cache", + "leveldb/comparer", + "leveldb/errors", + "leveldb/filter", + "leveldb/iterator", + "leveldb/journal", + "leveldb/memdb", + "leveldb/opt", + "leveldb/storage", + "leveldb/table", + "leveldb/util", + ] + pruneopts = "NUT" + revision = "2f17a3356c6616cbfc4ae4c38147dc062a68fb0e" + +[[projects]] + branch = "master" + digest = "1:e94c19da512584d4069db25d449150d8849d49a696e6281685b68932d3ba64a5" + name = "github.com/szferi/gomdb" + packages = ["."] + pruneopts = "NUT" + revision = "9f9ffa97e793fb4d06a56e65a5e6b72cb3b32f1c" + [[projects]] branch = "master" digest = "1:3cb6dfe7cdece5716b1c3c3c0b5faf7fce2e83e2758e2baad2e9986d101980b8" @@ -1150,6 +1296,12 @@ "github.com/go-macaron/i18n", "github.com/go-macaron/inject", "github.com/go-macaron/session", + "github.com/go-macaron/session/couchbase", + "github.com/go-macaron/session/ledis", + 
"github.com/go-macaron/session/memcache", + "github.com/go-macaron/session/mysql", + "github.com/go-macaron/session/nodb", + "github.com/go-macaron/session/postgres", "github.com/go-macaron/session/redis", "github.com/go-macaron/toolbox", "github.com/go-sql-driver/mysql", diff --git a/modules/setting/setting.go b/modules/setting/setting.go index a2fdbf51923e..d739266500a8 100644 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -31,7 +31,13 @@ import ( _ "github.com/go-macaron/cache/memcache" // memcache plugin for cache _ "github.com/go-macaron/cache/redis" "github.com/go-macaron/session" - _ "github.com/go-macaron/session/redis" // redis plugin for store session + _ "github.com/go-macaron/session/couchbase" // couchbase plugin for session store + _ "github.com/go-macaron/session/ledis" // ledis plugin for session store + _ "github.com/go-macaron/session/memcache" // memcache plugin for session store + _ "github.com/go-macaron/session/mysql" // mysql plugin for session store + _ "github.com/go-macaron/session/nodb" // nodb plugin for session store + _ "github.com/go-macaron/session/postgres" // postgres plugin for session store + _ "github.com/go-macaron/session/redis" // redis plugin for store session "github.com/go-xorm/core" shellquote "github.com/kballard/go-shellquote" version "github.com/mcuadros/go-version" @@ -1506,7 +1512,7 @@ func newCacheService() { func newSessionService() { SessionConfig.Provider = Cfg.Section("session").Key("PROVIDER").In("memory", - []string{"memory", "file", "redis", "mysql"}) + []string{"memory", "file", "redis", "mysql", "postgres", "ledis", "couchbase", "memcache", "nodb"}) SessionConfig.ProviderConfig = strings.Trim(Cfg.Section("session").Key("PROVIDER_CONFIG").MustString(path.Join(AppDataPath, "sessions")), "\" ") if SessionConfig.Provider == "file" && !filepath.IsAbs(SessionConfig.ProviderConfig) { SessionConfig.ProviderConfig = path.Join(AppWorkPath, SessionConfig.ProviderConfig) diff --git a/vendor/github.com/BurntSushi/toml/COPYING b/vendor/github.com/BurntSushi/toml/COPYING new file mode 100644 index 000000000000..01b5743200b8 --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/COPYING @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2013 TOML authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/vendor/github.com/BurntSushi/toml/cmd/toml-test-decoder/COPYING b/vendor/github.com/BurntSushi/toml/cmd/toml-test-decoder/COPYING new file mode 100644 index 000000000000..01b5743200b8 --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/cmd/toml-test-decoder/COPYING @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2013 TOML authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/github.com/BurntSushi/toml/cmd/toml-test-encoder/COPYING b/vendor/github.com/BurntSushi/toml/cmd/toml-test-encoder/COPYING new file mode 100644 index 000000000000..01b5743200b8 --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/cmd/toml-test-encoder/COPYING @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2013 TOML authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
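The blank (`_`) imports added to modules/setting/setting.go exist only for their side effects: importing a provider package runs its init function, which registers the backend under a name that PROVIDER can then refer to, much like database/sql driver registration. The sketch below is a self-contained illustration of that pattern, not the actual go-macaron/session code; the Provider interface, Register function, and memoryProvider type are all invented stand-ins.

    package main

    import "fmt"

    // Provider stands in for a session backend interface; the real
    // go-macaron/session interface has more methods than this.
    type Provider interface {
    	Init(gcLifetime int64, config string) error
    }

    // memoryProvider is a dummy backend used only to show the pattern.
    type memoryProvider struct{}

    func (p *memoryProvider) Init(gcLifetime int64, config string) error { return nil }

    var providers = map[string]Provider{}

    // Register is what a provider package's init() would call; a blank import
    // of that package is then enough to make the name resolvable at runtime.
    func Register(name string, p Provider) {
    	if _, dup := providers[name]; dup {
    		panic(fmt.Sprintf("session: provider %q registered twice", name))
    	}
    	providers[name] = p
    }

    func main() {
    	Register("memory", &memoryProvider{})
    	fmt.Println("registered providers:", len(providers))
    }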
diff --git a/vendor/github.com/BurntSushi/toml/cmd/tomlv/COPYING b/vendor/github.com/BurntSushi/toml/cmd/tomlv/COPYING new file mode 100644 index 000000000000..01b5743200b8 --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/cmd/tomlv/COPYING @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2013 TOML authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/github.com/BurntSushi/toml/decode.go b/vendor/github.com/BurntSushi/toml/decode.go new file mode 100644 index 000000000000..b0fd51d5b6ea --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/decode.go @@ -0,0 +1,509 @@ +package toml + +import ( + "fmt" + "io" + "io/ioutil" + "math" + "reflect" + "strings" + "time" +) + +func e(format string, args ...interface{}) error { + return fmt.Errorf("toml: "+format, args...) +} + +// Unmarshaler is the interface implemented by objects that can unmarshal a +// TOML description of themselves. +type Unmarshaler interface { + UnmarshalTOML(interface{}) error +} + +// Unmarshal decodes the contents of `p` in TOML format into a pointer `v`. +func Unmarshal(p []byte, v interface{}) error { + _, err := Decode(string(p), v) + return err +} + +// Primitive is a TOML value that hasn't been decoded into a Go value. +// When using the various `Decode*` functions, the type `Primitive` may +// be given to any value, and its decoding will be delayed. +// +// A `Primitive` value can be decoded using the `PrimitiveDecode` function. +// +// The underlying representation of a `Primitive` value is subject to change. +// Do not rely on it. +// +// N.B. Primitive values are still parsed, so using them will only avoid +// the overhead of reflection. They can be useful when you don't know the +// exact type of TOML data until run time. +type Primitive struct { + undecoded interface{} + context Key +} + +// DEPRECATED! +// +// Use MetaData.PrimitiveDecode instead. +func PrimitiveDecode(primValue Primitive, v interface{}) error { + md := MetaData{decoded: make(map[string]bool)} + return md.unify(primValue.undecoded, rvalue(v)) +} + +// PrimitiveDecode is just like the other `Decode*` functions, except it +// decodes a TOML value that has already been parsed. Valid primitive values +// can *only* be obtained from values filled by the decoder functions, +// including this method. (i.e., `v` may contain more `Primitive` +// values.) 
+// +// Meta data for primitive values is included in the meta data returned by +// the `Decode*` functions with one exception: keys returned by the Undecoded +// method will only reflect keys that were decoded. Namely, any keys hidden +// behind a Primitive will be considered undecoded. Executing this method will +// update the undecoded keys in the meta data. (See the example.) +func (md *MetaData) PrimitiveDecode(primValue Primitive, v interface{}) error { + md.context = primValue.context + defer func() { md.context = nil }() + return md.unify(primValue.undecoded, rvalue(v)) +} + +// Decode will decode the contents of `data` in TOML format into a pointer +// `v`. +// +// TOML hashes correspond to Go structs or maps. (Dealer's choice. They can be +// used interchangeably.) +// +// TOML arrays of tables correspond to either a slice of structs or a slice +// of maps. +// +// TOML datetimes correspond to Go `time.Time` values. +// +// All other TOML types (float, string, int, bool and array) correspond +// to the obvious Go types. +// +// An exception to the above rules is if a type implements the +// encoding.TextUnmarshaler interface. In this case, any primitive TOML value +// (floats, strings, integers, booleans and datetimes) will be converted to +// a byte string and given to the value's UnmarshalText method. See the +// Unmarshaler example for a demonstration with time duration strings. +// +// Key mapping +// +// TOML keys can map to either keys in a Go map or field names in a Go +// struct. The special `toml` struct tag may be used to map TOML keys to +// struct fields that don't match the key name exactly. (See the example.) +// A case insensitive match to struct names will be tried if an exact match +// can't be found. +// +// The mapping between TOML values and Go values is loose. That is, there +// may exist TOML values that cannot be placed into your representation, and +// there may be parts of your representation that do not correspond to +// TOML values. This loose mapping can be made stricter by using the IsDefined +// and/or Undecoded methods on the MetaData returned. +// +// This decoder will not handle cyclic types. If a cyclic type is passed, +// `Decode` will not terminate. +func Decode(data string, v interface{}) (MetaData, error) { + rv := reflect.ValueOf(v) + if rv.Kind() != reflect.Ptr { + return MetaData{}, e("Decode of non-pointer %s", reflect.TypeOf(v)) + } + if rv.IsNil() { + return MetaData{}, e("Decode of nil %s", reflect.TypeOf(v)) + } + p, err := parse(data) + if err != nil { + return MetaData{}, err + } + md := MetaData{ + p.mapping, p.types, p.ordered, + make(map[string]bool, len(p.ordered)), nil, + } + return md, md.unify(p.mapping, indirect(rv)) +} + +// DecodeFile is just like Decode, except it will automatically read the +// contents of the file at `fpath` and decode it for you. +func DecodeFile(fpath string, v interface{}) (MetaData, error) { + bs, err := ioutil.ReadFile(fpath) + if err != nil { + return MetaData{}, err + } + return Decode(string(bs), v) +} + +// DecodeReader is just like Decode, except it will consume all bytes +// from the reader and decode it for you. +func DecodeReader(r io.Reader, v interface{}) (MetaData, error) { + bs, err := ioutil.ReadAll(r) + if err != nil { + return MetaData{}, err + } + return Decode(string(bs), v) +} + +// unify performs a sort of type unification based on the structure of `rv`, +// which is the client representation. +// +// Any type mismatch produces an error. 
Finding a type that we don't know +// how to handle produces an unsupported type error. +func (md *MetaData) unify(data interface{}, rv reflect.Value) error { + + // Special case. Look for a `Primitive` value. + if rv.Type() == reflect.TypeOf((*Primitive)(nil)).Elem() { + // Save the undecoded data and the key context into the primitive + // value. + context := make(Key, len(md.context)) + copy(context, md.context) + rv.Set(reflect.ValueOf(Primitive{ + undecoded: data, + context: context, + })) + return nil + } + + // Special case. Unmarshaler Interface support. + if rv.CanAddr() { + if v, ok := rv.Addr().Interface().(Unmarshaler); ok { + return v.UnmarshalTOML(data) + } + } + + // Special case. Handle time.Time values specifically. + // TODO: Remove this code when we decide to drop support for Go 1.1. + // This isn't necessary in Go 1.2 because time.Time satisfies the encoding + // interfaces. + if rv.Type().AssignableTo(rvalue(time.Time{}).Type()) { + return md.unifyDatetime(data, rv) + } + + // Special case. Look for a value satisfying the TextUnmarshaler interface. + if v, ok := rv.Interface().(TextUnmarshaler); ok { + return md.unifyText(data, v) + } + // BUG(burntsushi) + // The behavior here is incorrect whenever a Go type satisfies the + // encoding.TextUnmarshaler interface but also corresponds to a TOML + // hash or array. In particular, the unmarshaler should only be applied + // to primitive TOML values. But at this point, it will be applied to + // all kinds of values and produce an incorrect error whenever those values + // are hashes or arrays (including arrays of tables). + + k := rv.Kind() + + // laziness + if k >= reflect.Int && k <= reflect.Uint64 { + return md.unifyInt(data, rv) + } + switch k { + case reflect.Ptr: + elem := reflect.New(rv.Type().Elem()) + err := md.unify(data, reflect.Indirect(elem)) + if err != nil { + return err + } + rv.Set(elem) + return nil + case reflect.Struct: + return md.unifyStruct(data, rv) + case reflect.Map: + return md.unifyMap(data, rv) + case reflect.Array: + return md.unifyArray(data, rv) + case reflect.Slice: + return md.unifySlice(data, rv) + case reflect.String: + return md.unifyString(data, rv) + case reflect.Bool: + return md.unifyBool(data, rv) + case reflect.Interface: + // we only support empty interfaces. + if rv.NumMethod() > 0 { + return e("unsupported type %s", rv.Type()) + } + return md.unifyAnything(data, rv) + case reflect.Float32: + fallthrough + case reflect.Float64: + return md.unifyFloat64(data, rv) + } + return e("unsupported type %s", rv.Kind()) +} + +func (md *MetaData) unifyStruct(mapping interface{}, rv reflect.Value) error { + tmap, ok := mapping.(map[string]interface{}) + if !ok { + if mapping == nil { + return nil + } + return e("type mismatch for %s: expected table but found %T", + rv.Type().String(), mapping) + } + + for key, datum := range tmap { + var f *field + fields := cachedTypeFields(rv.Type()) + for i := range fields { + ff := &fields[i] + if ff.name == key { + f = ff + break + } + if f == nil && strings.EqualFold(ff.name, key) { + f = ff + } + } + if f != nil { + subv := rv + for _, i := range f.index { + subv = indirect(subv.Field(i)) + } + if isUnifiable(subv) { + md.decoded[md.context.add(key).String()] = true + md.context = append(md.context, key) + if err := md.unify(datum, subv); err != nil { + return err + } + md.context = md.context[0 : len(md.context)-1] + } else if f.name != "" { + // Bad user! No soup for you! 
+ return e("cannot write unexported field %s.%s", + rv.Type().String(), f.name) + } + } + } + return nil +} + +func (md *MetaData) unifyMap(mapping interface{}, rv reflect.Value) error { + tmap, ok := mapping.(map[string]interface{}) + if !ok { + if tmap == nil { + return nil + } + return badtype("map", mapping) + } + if rv.IsNil() { + rv.Set(reflect.MakeMap(rv.Type())) + } + for k, v := range tmap { + md.decoded[md.context.add(k).String()] = true + md.context = append(md.context, k) + + rvkey := indirect(reflect.New(rv.Type().Key())) + rvval := reflect.Indirect(reflect.New(rv.Type().Elem())) + if err := md.unify(v, rvval); err != nil { + return err + } + md.context = md.context[0 : len(md.context)-1] + + rvkey.SetString(k) + rv.SetMapIndex(rvkey, rvval) + } + return nil +} + +func (md *MetaData) unifyArray(data interface{}, rv reflect.Value) error { + datav := reflect.ValueOf(data) + if datav.Kind() != reflect.Slice { + if !datav.IsValid() { + return nil + } + return badtype("slice", data) + } + sliceLen := datav.Len() + if sliceLen != rv.Len() { + return e("expected array length %d; got TOML array of length %d", + rv.Len(), sliceLen) + } + return md.unifySliceArray(datav, rv) +} + +func (md *MetaData) unifySlice(data interface{}, rv reflect.Value) error { + datav := reflect.ValueOf(data) + if datav.Kind() != reflect.Slice { + if !datav.IsValid() { + return nil + } + return badtype("slice", data) + } + n := datav.Len() + if rv.IsNil() || rv.Cap() < n { + rv.Set(reflect.MakeSlice(rv.Type(), n, n)) + } + rv.SetLen(n) + return md.unifySliceArray(datav, rv) +} + +func (md *MetaData) unifySliceArray(data, rv reflect.Value) error { + sliceLen := data.Len() + for i := 0; i < sliceLen; i++ { + v := data.Index(i).Interface() + sliceval := indirect(rv.Index(i)) + if err := md.unify(v, sliceval); err != nil { + return err + } + } + return nil +} + +func (md *MetaData) unifyDatetime(data interface{}, rv reflect.Value) error { + if _, ok := data.(time.Time); ok { + rv.Set(reflect.ValueOf(data)) + return nil + } + return badtype("time.Time", data) +} + +func (md *MetaData) unifyString(data interface{}, rv reflect.Value) error { + if s, ok := data.(string); ok { + rv.SetString(s) + return nil + } + return badtype("string", data) +} + +func (md *MetaData) unifyFloat64(data interface{}, rv reflect.Value) error { + if num, ok := data.(float64); ok { + switch rv.Kind() { + case reflect.Float32: + fallthrough + case reflect.Float64: + rv.SetFloat(num) + default: + panic("bug") + } + return nil + } + return badtype("float", data) +} + +func (md *MetaData) unifyInt(data interface{}, rv reflect.Value) error { + if num, ok := data.(int64); ok { + if rv.Kind() >= reflect.Int && rv.Kind() <= reflect.Int64 { + switch rv.Kind() { + case reflect.Int, reflect.Int64: + // No bounds checking necessary. + case reflect.Int8: + if num < math.MinInt8 || num > math.MaxInt8 { + return e("value %d is out of range for int8", num) + } + case reflect.Int16: + if num < math.MinInt16 || num > math.MaxInt16 { + return e("value %d is out of range for int16", num) + } + case reflect.Int32: + if num < math.MinInt32 || num > math.MaxInt32 { + return e("value %d is out of range for int32", num) + } + } + rv.SetInt(num) + } else if rv.Kind() >= reflect.Uint && rv.Kind() <= reflect.Uint64 { + unum := uint64(num) + switch rv.Kind() { + case reflect.Uint, reflect.Uint64: + // No bounds checking necessary. 
+ case reflect.Uint8: + if num < 0 || unum > math.MaxUint8 { + return e("value %d is out of range for uint8", num) + } + case reflect.Uint16: + if num < 0 || unum > math.MaxUint16 { + return e("value %d is out of range for uint16", num) + } + case reflect.Uint32: + if num < 0 || unum > math.MaxUint32 { + return e("value %d is out of range for uint32", num) + } + } + rv.SetUint(unum) + } else { + panic("unreachable") + } + return nil + } + return badtype("integer", data) +} + +func (md *MetaData) unifyBool(data interface{}, rv reflect.Value) error { + if b, ok := data.(bool); ok { + rv.SetBool(b) + return nil + } + return badtype("boolean", data) +} + +func (md *MetaData) unifyAnything(data interface{}, rv reflect.Value) error { + rv.Set(reflect.ValueOf(data)) + return nil +} + +func (md *MetaData) unifyText(data interface{}, v TextUnmarshaler) error { + var s string + switch sdata := data.(type) { + case TextMarshaler: + text, err := sdata.MarshalText() + if err != nil { + return err + } + s = string(text) + case fmt.Stringer: + s = sdata.String() + case string: + s = sdata + case bool: + s = fmt.Sprintf("%v", sdata) + case int64: + s = fmt.Sprintf("%d", sdata) + case float64: + s = fmt.Sprintf("%f", sdata) + default: + return badtype("primitive (string-like)", data) + } + if err := v.UnmarshalText([]byte(s)); err != nil { + return err + } + return nil +} + +// rvalue returns a reflect.Value of `v`. All pointers are resolved. +func rvalue(v interface{}) reflect.Value { + return indirect(reflect.ValueOf(v)) +} + +// indirect returns the value pointed to by a pointer. +// Pointers are followed until the value is not a pointer. +// New values are allocated for each nil pointer. +// +// An exception to this rule is if the value satisfies an interface of +// interest to us (like encoding.TextUnmarshaler). +func indirect(v reflect.Value) reflect.Value { + if v.Kind() != reflect.Ptr { + if v.CanSet() { + pv := v.Addr() + if _, ok := pv.Interface().(TextUnmarshaler); ok { + return pv + } + } + return v + } + if v.IsNil() { + v.Set(reflect.New(v.Type().Elem())) + } + return indirect(reflect.Indirect(v)) +} + +func isUnifiable(rv reflect.Value) bool { + if rv.CanSet() { + return true + } + if _, ok := rv.Interface().(TextUnmarshaler); ok { + return true + } + return false +} + +func badtype(expected string, data interface{}) error { + return e("cannot load TOML value of type %T into a Go %s", data, expected) +} diff --git a/vendor/github.com/BurntSushi/toml/decode_meta.go b/vendor/github.com/BurntSushi/toml/decode_meta.go new file mode 100644 index 000000000000..b9914a6798cf --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/decode_meta.go @@ -0,0 +1,121 @@ +package toml + +import "strings" + +// MetaData allows access to meta information about TOML data that may not +// be inferrable via reflection. In particular, whether a key has been defined +// and the TOML type of a key. +type MetaData struct { + mapping map[string]interface{} + types map[string]tomlType + keys []Key + decoded map[string]bool + context Key // Used only during decoding. +} + +// IsDefined returns true if the key given exists in the TOML data. The key +// should be specified hierarchially. e.g., +// +// // access the TOML key 'a.b.c' +// IsDefined("a", "b", "c") +// +// IsDefined will return false if an empty key given. Keys are case sensitive. 
+func (md *MetaData) IsDefined(key ...string) bool { + if len(key) == 0 { + return false + } + + var hash map[string]interface{} + var ok bool + var hashOrVal interface{} = md.mapping + for _, k := range key { + if hash, ok = hashOrVal.(map[string]interface{}); !ok { + return false + } + if hashOrVal, ok = hash[k]; !ok { + return false + } + } + return true +} + +// Type returns a string representation of the type of the key specified. +// +// Type will return the empty string if given an empty key or a key that +// does not exist. Keys are case sensitive. +func (md *MetaData) Type(key ...string) string { + fullkey := strings.Join(key, ".") + if typ, ok := md.types[fullkey]; ok { + return typ.typeString() + } + return "" +} + +// Key is the type of any TOML key, including key groups. Use (MetaData).Keys +// to get values of this type. +type Key []string + +func (k Key) String() string { + return strings.Join(k, ".") +} + +func (k Key) maybeQuotedAll() string { + var ss []string + for i := range k { + ss = append(ss, k.maybeQuoted(i)) + } + return strings.Join(ss, ".") +} + +func (k Key) maybeQuoted(i int) string { + quote := false + for _, c := range k[i] { + if !isBareKeyChar(c) { + quote = true + break + } + } + if quote { + return "\"" + strings.Replace(k[i], "\"", "\\\"", -1) + "\"" + } + return k[i] +} + +func (k Key) add(piece string) Key { + newKey := make(Key, len(k)+1) + copy(newKey, k) + newKey[len(k)] = piece + return newKey +} + +// Keys returns a slice of every key in the TOML data, including key groups. +// Each key is itself a slice, where the first element is the top of the +// hierarchy and the last is the most specific. +// +// The list will have the same order as the keys appeared in the TOML data. +// +// All keys returned are non-empty. +func (md *MetaData) Keys() []Key { + return md.keys +} + +// Undecoded returns all keys that have not been decoded in the order in which +// they appear in the original TOML document. +// +// This includes keys that haven't been decoded because of a Primitive value. +// Once the Primitive value is decoded, the keys will be considered decoded. +// +// Also note that decoding into an empty interface will result in no decoding, +// and so no keys will be considered decoded. +// +// In this sense, the Undecoded keys correspond to keys in the TOML document +// that do not have a concrete type in your representation. +func (md *MetaData) Undecoded() []Key { + undecoded := make([]Key, 0, len(md.keys)) + for _, key := range md.keys { + if !md.decoded[key.String()] { + undecoded = append(undecoded, key) + } + } + return undecoded +} diff --git a/vendor/github.com/BurntSushi/toml/doc.go b/vendor/github.com/BurntSushi/toml/doc.go new file mode 100644 index 000000000000..b371f396edca --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/doc.go @@ -0,0 +1,27 @@ +/* +Package toml provides facilities for decoding and encoding TOML configuration +files via reflection. There is also support for delaying decoding with +the Primitive type, and querying the set of keys in a TOML document with the +MetaData type. + +The specification implemented: https://github.com/toml-lang/toml + +The sub-command github.com/BurntSushi/toml/cmd/tomlv can be used to verify +whether a file is a valid TOML document. It can also be used to print the +type of each key in a TOML document. + +Testing + +There are two important types of tests used for this package. The first is +contained inside '*_test.go' files and uses the standard Go unit testing +framework. 
These tests are primarily devoted to holistically testing the +decoder and encoder. + +The second type of testing is used to verify the implementation's adherence +to the TOML specification. These tests have been factored into their own +project: https://github.com/BurntSushi/toml-test + +The reason the tests are in a separate project is so that they can be used by +any implementation of TOML. Namely, it is language agnostic. +*/ +package toml diff --git a/vendor/github.com/BurntSushi/toml/encode.go b/vendor/github.com/BurntSushi/toml/encode.go new file mode 100644 index 000000000000..d905c21a2466 --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/encode.go @@ -0,0 +1,568 @@ +package toml + +import ( + "bufio" + "errors" + "fmt" + "io" + "reflect" + "sort" + "strconv" + "strings" + "time" +) + +type tomlEncodeError struct{ error } + +var ( + errArrayMixedElementTypes = errors.New( + "toml: cannot encode array with mixed element types") + errArrayNilElement = errors.New( + "toml: cannot encode array with nil element") + errNonString = errors.New( + "toml: cannot encode a map with non-string key type") + errAnonNonStruct = errors.New( + "toml: cannot encode an anonymous field that is not a struct") + errArrayNoTable = errors.New( + "toml: TOML array element cannot contain a table") + errNoKey = errors.New( + "toml: top-level values must be Go maps or structs") + errAnything = errors.New("") // used in testing +) + +var quotedReplacer = strings.NewReplacer( + "\t", "\\t", + "\n", "\\n", + "\r", "\\r", + "\"", "\\\"", + "\\", "\\\\", +) + +// Encoder controls the encoding of Go values to a TOML document to some +// io.Writer. +// +// The indentation level can be controlled with the Indent field. +type Encoder struct { + // A single indentation level. By default it is two spaces. + Indent string + + // hasWritten is whether we have written any output to w yet. + hasWritten bool + w *bufio.Writer +} + +// NewEncoder returns a TOML encoder that encodes Go values to the io.Writer +// given. By default, a single indentation level is 2 spaces. +func NewEncoder(w io.Writer) *Encoder { + return &Encoder{ + w: bufio.NewWriter(w), + Indent: " ", + } +} + +// Encode writes a TOML representation of the Go value to the underlying +// io.Writer. If the value given cannot be encoded to a valid TOML document, +// then an error is returned. +// +// The mapping between Go values and TOML values should be precisely the same +// as for the Decode* functions. Similarly, the TextMarshaler interface is +// supported by encoding the resulting bytes as strings. (If you want to write +// arbitrary binary data then you will need to use something like base64 since +// TOML does not have any binary types.) +// +// When encoding TOML hashes (i.e., Go maps or structs), keys without any +// sub-hashes are encoded first. +// +// If a Go map is encoded, then its keys are sorted alphabetically for +// deterministic output. More control over this behavior may be provided if +// there is demand for it. +// +// Encoding Go values without a corresponding TOML representation---like map +// types with non-string keys---will cause an error to be returned. Similarly +// for mixed arrays/slices, arrays/slices with nil elements, embedded +// non-struct types and nested slices containing maps or structs. +// (e.g., [][]map[string]string is not allowed but []map[string]string is OK +// and so is []map[string][]string.) 
+func (enc *Encoder) Encode(v interface{}) error { + rv := eindirect(reflect.ValueOf(v)) + if err := enc.safeEncode(Key([]string{}), rv); err != nil { + return err + } + return enc.w.Flush() +} + +func (enc *Encoder) safeEncode(key Key, rv reflect.Value) (err error) { + defer func() { + if r := recover(); r != nil { + if terr, ok := r.(tomlEncodeError); ok { + err = terr.error + return + } + panic(r) + } + }() + enc.encode(key, rv) + return nil +} + +func (enc *Encoder) encode(key Key, rv reflect.Value) { + // Special case. Time needs to be in ISO8601 format. + // Special case. If we can marshal the type to text, then we used that. + // Basically, this prevents the encoder for handling these types as + // generic structs (or whatever the underlying type of a TextMarshaler is). + switch rv.Interface().(type) { + case time.Time, TextMarshaler: + enc.keyEqElement(key, rv) + return + } + + k := rv.Kind() + switch k { + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, + reflect.Int64, + reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, + reflect.Uint64, + reflect.Float32, reflect.Float64, reflect.String, reflect.Bool: + enc.keyEqElement(key, rv) + case reflect.Array, reflect.Slice: + if typeEqual(tomlArrayHash, tomlTypeOfGo(rv)) { + enc.eArrayOfTables(key, rv) + } else { + enc.keyEqElement(key, rv) + } + case reflect.Interface: + if rv.IsNil() { + return + } + enc.encode(key, rv.Elem()) + case reflect.Map: + if rv.IsNil() { + return + } + enc.eTable(key, rv) + case reflect.Ptr: + if rv.IsNil() { + return + } + enc.encode(key, rv.Elem()) + case reflect.Struct: + enc.eTable(key, rv) + default: + panic(e("unsupported type for key '%s': %s", key, k)) + } +} + +// eElement encodes any value that can be an array element (primitives and +// arrays). +func (enc *Encoder) eElement(rv reflect.Value) { + switch v := rv.Interface().(type) { + case time.Time: + // Special case time.Time as a primitive. Has to come before + // TextMarshaler below because time.Time implements + // encoding.TextMarshaler, but we need to always use UTC. + enc.wf(v.UTC().Format("2006-01-02T15:04:05Z")) + return + case TextMarshaler: + // Special case. Use text marshaler if it's available for this value. + if s, err := v.MarshalText(); err != nil { + encPanic(err) + } else { + enc.writeQuoted(string(s)) + } + return + } + switch rv.Kind() { + case reflect.Bool: + enc.wf(strconv.FormatBool(rv.Bool())) + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, + reflect.Int64: + enc.wf(strconv.FormatInt(rv.Int(), 10)) + case reflect.Uint, reflect.Uint8, reflect.Uint16, + reflect.Uint32, reflect.Uint64: + enc.wf(strconv.FormatUint(rv.Uint(), 10)) + case reflect.Float32: + enc.wf(floatAddDecimal(strconv.FormatFloat(rv.Float(), 'f', -1, 32))) + case reflect.Float64: + enc.wf(floatAddDecimal(strconv.FormatFloat(rv.Float(), 'f', -1, 64))) + case reflect.Array, reflect.Slice: + enc.eArrayOrSliceElement(rv) + case reflect.Interface: + enc.eElement(rv.Elem()) + case reflect.String: + enc.writeQuoted(rv.String()) + default: + panic(e("unexpected primitive type: %s", rv.Kind())) + } +} + +// By the TOML spec, all floats must have a decimal with at least one +// number on either side. 
+func floatAddDecimal(fstr string) string { + if !strings.Contains(fstr, ".") { + return fstr + ".0" + } + return fstr +} + +func (enc *Encoder) writeQuoted(s string) { + enc.wf("\"%s\"", quotedReplacer.Replace(s)) +} + +func (enc *Encoder) eArrayOrSliceElement(rv reflect.Value) { + length := rv.Len() + enc.wf("[") + for i := 0; i < length; i++ { + elem := rv.Index(i) + enc.eElement(elem) + if i != length-1 { + enc.wf(", ") + } + } + enc.wf("]") +} + +func (enc *Encoder) eArrayOfTables(key Key, rv reflect.Value) { + if len(key) == 0 { + encPanic(errNoKey) + } + for i := 0; i < rv.Len(); i++ { + trv := rv.Index(i) + if isNil(trv) { + continue + } + panicIfInvalidKey(key) + enc.newline() + enc.wf("%s[[%s]]", enc.indentStr(key), key.maybeQuotedAll()) + enc.newline() + enc.eMapOrStruct(key, trv) + } +} + +func (enc *Encoder) eTable(key Key, rv reflect.Value) { + panicIfInvalidKey(key) + if len(key) == 1 { + // Output an extra newline between top-level tables. + // (The newline isn't written if nothing else has been written though.) + enc.newline() + } + if len(key) > 0 { + enc.wf("%s[%s]", enc.indentStr(key), key.maybeQuotedAll()) + enc.newline() + } + enc.eMapOrStruct(key, rv) +} + +func (enc *Encoder) eMapOrStruct(key Key, rv reflect.Value) { + switch rv := eindirect(rv); rv.Kind() { + case reflect.Map: + enc.eMap(key, rv) + case reflect.Struct: + enc.eStruct(key, rv) + default: + panic("eTable: unhandled reflect.Value Kind: " + rv.Kind().String()) + } +} + +func (enc *Encoder) eMap(key Key, rv reflect.Value) { + rt := rv.Type() + if rt.Key().Kind() != reflect.String { + encPanic(errNonString) + } + + // Sort keys so that we have deterministic output. And write keys directly + // underneath this key first, before writing sub-structs or sub-maps. + var mapKeysDirect, mapKeysSub []string + for _, mapKey := range rv.MapKeys() { + k := mapKey.String() + if typeIsHash(tomlTypeOfGo(rv.MapIndex(mapKey))) { + mapKeysSub = append(mapKeysSub, k) + } else { + mapKeysDirect = append(mapKeysDirect, k) + } + } + + var writeMapKeys = func(mapKeys []string) { + sort.Strings(mapKeys) + for _, mapKey := range mapKeys { + mrv := rv.MapIndex(reflect.ValueOf(mapKey)) + if isNil(mrv) { + // Don't write anything for nil fields. + continue + } + enc.encode(key.add(mapKey), mrv) + } + } + writeMapKeys(mapKeysDirect) + writeMapKeys(mapKeysSub) +} + +func (enc *Encoder) eStruct(key Key, rv reflect.Value) { + // Write keys for fields directly under this key first, because if we write + // a field that creates a new table, then all keys under it will be in that + // table (not the one we're writing here). + rt := rv.Type() + var fieldsDirect, fieldsSub [][]int + var addFields func(rt reflect.Type, rv reflect.Value, start []int) + addFields = func(rt reflect.Type, rv reflect.Value, start []int) { + for i := 0; i < rt.NumField(); i++ { + f := rt.Field(i) + // skip unexported fields + if f.PkgPath != "" && !f.Anonymous { + continue + } + frv := rv.Field(i) + if f.Anonymous { + t := f.Type + switch t.Kind() { + case reflect.Struct: + // Treat anonymous struct fields with + // tag names as though they are not + // anonymous, like encoding/json does. + if getOptions(f.Tag).name == "" { + addFields(t, frv, f.Index) + continue + } + case reflect.Ptr: + if t.Elem().Kind() == reflect.Struct && + getOptions(f.Tag).name == "" { + if !frv.IsNil() { + addFields(t.Elem(), frv.Elem(), f.Index) + } + continue + } + // Fall through to the normal field encoding logic below + // for non-struct anonymous fields. 
+ } + } + + if typeIsHash(tomlTypeOfGo(frv)) { + fieldsSub = append(fieldsSub, append(start, f.Index...)) + } else { + fieldsDirect = append(fieldsDirect, append(start, f.Index...)) + } + } + } + addFields(rt, rv, nil) + + var writeFields = func(fields [][]int) { + for _, fieldIndex := range fields { + sft := rt.FieldByIndex(fieldIndex) + sf := rv.FieldByIndex(fieldIndex) + if isNil(sf) { + // Don't write anything for nil fields. + continue + } + + opts := getOptions(sft.Tag) + if opts.skip { + continue + } + keyName := sft.Name + if opts.name != "" { + keyName = opts.name + } + if opts.omitempty && isEmpty(sf) { + continue + } + if opts.omitzero && isZero(sf) { + continue + } + + enc.encode(key.add(keyName), sf) + } + } + writeFields(fieldsDirect) + writeFields(fieldsSub) +} + +// tomlTypeName returns the TOML type name of the Go value's type. It is +// used to determine whether the types of array elements are mixed (which is +// forbidden). If the Go value is nil, then it is illegal for it to be an array +// element, and valueIsNil is returned as true. + +// Returns the TOML type of a Go value. The type may be `nil`, which means +// no concrete TOML type could be found. +func tomlTypeOfGo(rv reflect.Value) tomlType { + if isNil(rv) || !rv.IsValid() { + return nil + } + switch rv.Kind() { + case reflect.Bool: + return tomlBool + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, + reflect.Int64, + reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, + reflect.Uint64: + return tomlInteger + case reflect.Float32, reflect.Float64: + return tomlFloat + case reflect.Array, reflect.Slice: + if typeEqual(tomlHash, tomlArrayType(rv)) { + return tomlArrayHash + } + return tomlArray + case reflect.Ptr, reflect.Interface: + return tomlTypeOfGo(rv.Elem()) + case reflect.String: + return tomlString + case reflect.Map: + return tomlHash + case reflect.Struct: + switch rv.Interface().(type) { + case time.Time: + return tomlDatetime + case TextMarshaler: + return tomlString + default: + return tomlHash + } + default: + panic("unexpected reflect.Kind: " + rv.Kind().String()) + } +} + +// tomlArrayType returns the element type of a TOML array. The type returned +// may be nil if it cannot be determined (e.g., a nil slice or a zero length +// slize). This function may also panic if it finds a type that cannot be +// expressed in TOML (such as nil elements, heterogeneous arrays or directly +// nested arrays of tables). +func tomlArrayType(rv reflect.Value) tomlType { + if isNil(rv) || !rv.IsValid() || rv.Len() == 0 { + return nil + } + firstType := tomlTypeOfGo(rv.Index(0)) + if firstType == nil { + encPanic(errArrayNilElement) + } + + rvlen := rv.Len() + for i := 1; i < rvlen; i++ { + elem := rv.Index(i) + switch elemType := tomlTypeOfGo(elem); { + case elemType == nil: + encPanic(errArrayNilElement) + case !typeEqual(firstType, elemType): + encPanic(errArrayMixedElementTypes) + } + } + // If we have a nested array, then we must make sure that the nested + // array contains ONLY primitives. + // This checks arbitrarily nested arrays. 
+ if typeEqual(firstType, tomlArray) || typeEqual(firstType, tomlArrayHash) { + nest := tomlArrayType(eindirect(rv.Index(0))) + if typeEqual(nest, tomlHash) || typeEqual(nest, tomlArrayHash) { + encPanic(errArrayNoTable) + } + } + return firstType +} + +type tagOptions struct { + skip bool // "-" + name string + omitempty bool + omitzero bool +} + +func getOptions(tag reflect.StructTag) tagOptions { + t := tag.Get("toml") + if t == "-" { + return tagOptions{skip: true} + } + var opts tagOptions + parts := strings.Split(t, ",") + opts.name = parts[0] + for _, s := range parts[1:] { + switch s { + case "omitempty": + opts.omitempty = true + case "omitzero": + opts.omitzero = true + } + } + return opts +} + +func isZero(rv reflect.Value) bool { + switch rv.Kind() { + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + return rv.Int() == 0 + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: + return rv.Uint() == 0 + case reflect.Float32, reflect.Float64: + return rv.Float() == 0.0 + } + return false +} + +func isEmpty(rv reflect.Value) bool { + switch rv.Kind() { + case reflect.Array, reflect.Slice, reflect.Map, reflect.String: + return rv.Len() == 0 + case reflect.Bool: + return !rv.Bool() + } + return false +} + +func (enc *Encoder) newline() { + if enc.hasWritten { + enc.wf("\n") + } +} + +func (enc *Encoder) keyEqElement(key Key, val reflect.Value) { + if len(key) == 0 { + encPanic(errNoKey) + } + panicIfInvalidKey(key) + enc.wf("%s%s = ", enc.indentStr(key), key.maybeQuoted(len(key)-1)) + enc.eElement(val) + enc.newline() +} + +func (enc *Encoder) wf(format string, v ...interface{}) { + if _, err := fmt.Fprintf(enc.w, format, v...); err != nil { + encPanic(err) + } + enc.hasWritten = true +} + +func (enc *Encoder) indentStr(key Key) string { + return strings.Repeat(enc.Indent, len(key)-1) +} + +func encPanic(err error) { + panic(tomlEncodeError{err}) +} + +func eindirect(v reflect.Value) reflect.Value { + switch v.Kind() { + case reflect.Ptr, reflect.Interface: + return eindirect(v.Elem()) + default: + return v + } +} + +func isNil(rv reflect.Value) bool { + switch rv.Kind() { + case reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice: + return rv.IsNil() + default: + return false + } +} + +func panicIfInvalidKey(key Key) { + for _, k := range key { + if len(k) == 0 { + encPanic(e("Key '%s' is not a valid table name. Key names "+ + "cannot be empty.", key.maybeQuotedAll())) + } + } +} + +func isValidKeyName(s string) bool { + return len(s) != 0 +} diff --git a/vendor/github.com/BurntSushi/toml/encoding_types.go b/vendor/github.com/BurntSushi/toml/encoding_types.go new file mode 100644 index 000000000000..d36e1dd6002b --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/encoding_types.go @@ -0,0 +1,19 @@ +// +build go1.2 + +package toml + +// In order to support Go 1.1, we define our own TextMarshaler and +// TextUnmarshaler types. For Go 1.2+, we just alias them with the +// standard library interfaces. + +import ( + "encoding" +) + +// TextMarshaler is a synonym for encoding.TextMarshaler. It is defined here +// so that Go 1.1 can be supported. +type TextMarshaler encoding.TextMarshaler + +// TextUnmarshaler is a synonym for encoding.TextUnmarshaler. It is defined +// here so that Go 1.1 can be supported. 
+type TextUnmarshaler encoding.TextUnmarshaler diff --git a/vendor/github.com/BurntSushi/toml/encoding_types_1.1.go b/vendor/github.com/BurntSushi/toml/encoding_types_1.1.go new file mode 100644 index 000000000000..e8d503d04690 --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/encoding_types_1.1.go @@ -0,0 +1,18 @@ +// +build !go1.2 + +package toml + +// These interfaces were introduced in Go 1.2, so we add them manually when +// compiling for Go 1.1. + +// TextMarshaler is a synonym for encoding.TextMarshaler. It is defined here +// so that Go 1.1 can be supported. +type TextMarshaler interface { + MarshalText() (text []byte, err error) +} + +// TextUnmarshaler is a synonym for encoding.TextUnmarshaler. It is defined +// here so that Go 1.1 can be supported. +type TextUnmarshaler interface { + UnmarshalText(text []byte) error +} diff --git a/vendor/github.com/BurntSushi/toml/lex.go b/vendor/github.com/BurntSushi/toml/lex.go new file mode 100644 index 000000000000..e0a742a8870f --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/lex.go @@ -0,0 +1,953 @@ +package toml + +import ( + "fmt" + "strings" + "unicode" + "unicode/utf8" +) + +type itemType int + +const ( + itemError itemType = iota + itemNIL // used in the parser to indicate no type + itemEOF + itemText + itemString + itemRawString + itemMultilineString + itemRawMultilineString + itemBool + itemInteger + itemFloat + itemDatetime + itemArray // the start of an array + itemArrayEnd + itemTableStart + itemTableEnd + itemArrayTableStart + itemArrayTableEnd + itemKeyStart + itemCommentStart + itemInlineTableStart + itemInlineTableEnd +) + +const ( + eof = 0 + comma = ',' + tableStart = '[' + tableEnd = ']' + arrayTableStart = '[' + arrayTableEnd = ']' + tableSep = '.' + keySep = '=' + arrayStart = '[' + arrayEnd = ']' + commentStart = '#' + stringStart = '"' + stringEnd = '"' + rawStringStart = '\'' + rawStringEnd = '\'' + inlineTableStart = '{' + inlineTableEnd = '}' +) + +type stateFn func(lx *lexer) stateFn + +type lexer struct { + input string + start int + pos int + line int + state stateFn + items chan item + + // Allow for backing up up to three runes. + // This is necessary because TOML contains 3-rune tokens (""" and '''). + prevWidths [3]int + nprev int // how many of prevWidths are in use + // If we emit an eof, we can still back up, but it is not OK to call + // next again. + atEOF bool + + // A stack of state functions used to maintain context. + // The idea is to reuse parts of the state machine in various places. + // For example, values can appear at the top level or within arbitrarily + // nested arrays. The last state on the stack is used after a value has + // been lexed. Similarly for comments. 
+ stack []stateFn +} + +type item struct { + typ itemType + val string + line int +} + +func (lx *lexer) nextItem() item { + for { + select { + case item := <-lx.items: + return item + default: + lx.state = lx.state(lx) + } + } +} + +func lex(input string) *lexer { + lx := &lexer{ + input: input, + state: lexTop, + line: 1, + items: make(chan item, 10), + stack: make([]stateFn, 0, 10), + } + return lx +} + +func (lx *lexer) push(state stateFn) { + lx.stack = append(lx.stack, state) +} + +func (lx *lexer) pop() stateFn { + if len(lx.stack) == 0 { + return lx.errorf("BUG in lexer: no states to pop") + } + last := lx.stack[len(lx.stack)-1] + lx.stack = lx.stack[0 : len(lx.stack)-1] + return last +} + +func (lx *lexer) current() string { + return lx.input[lx.start:lx.pos] +} + +func (lx *lexer) emit(typ itemType) { + lx.items <- item{typ, lx.current(), lx.line} + lx.start = lx.pos +} + +func (lx *lexer) emitTrim(typ itemType) { + lx.items <- item{typ, strings.TrimSpace(lx.current()), lx.line} + lx.start = lx.pos +} + +func (lx *lexer) next() (r rune) { + if lx.atEOF { + panic("next called after EOF") + } + if lx.pos >= len(lx.input) { + lx.atEOF = true + return eof + } + + if lx.input[lx.pos] == '\n' { + lx.line++ + } + lx.prevWidths[2] = lx.prevWidths[1] + lx.prevWidths[1] = lx.prevWidths[0] + if lx.nprev < 3 { + lx.nprev++ + } + r, w := utf8.DecodeRuneInString(lx.input[lx.pos:]) + lx.prevWidths[0] = w + lx.pos += w + return r +} + +// ignore skips over the pending input before this point. +func (lx *lexer) ignore() { + lx.start = lx.pos +} + +// backup steps back one rune. Can be called only twice between calls to next. +func (lx *lexer) backup() { + if lx.atEOF { + lx.atEOF = false + return + } + if lx.nprev < 1 { + panic("backed up too far") + } + w := lx.prevWidths[0] + lx.prevWidths[0] = lx.prevWidths[1] + lx.prevWidths[1] = lx.prevWidths[2] + lx.nprev-- + lx.pos -= w + if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' { + lx.line-- + } +} + +// accept consumes the next rune if it's equal to `valid`. +func (lx *lexer) accept(valid rune) bool { + if lx.next() == valid { + return true + } + lx.backup() + return false +} + +// peek returns but does not consume the next rune in the input. +func (lx *lexer) peek() rune { + r := lx.next() + lx.backup() + return r +} + +// skip ignores all input that matches the given predicate. +func (lx *lexer) skip(pred func(rune) bool) { + for { + r := lx.next() + if pred(r) { + continue + } + lx.backup() + lx.ignore() + return + } +} + +// errorf stops all lexing by emitting an error and returning `nil`. +// Note that any value that is a character is escaped if it's a special +// character (newlines, tabs, etc.). +func (lx *lexer) errorf(format string, values ...interface{}) stateFn { + lx.items <- item{ + itemError, + fmt.Sprintf(format, values...), + lx.line, + } + return nil +} + +// lexTop consumes elements at the top level of TOML data. +func lexTop(lx *lexer) stateFn { + r := lx.next() + if isWhitespace(r) || isNL(r) { + return lexSkip(lx, lexTop) + } + switch r { + case commentStart: + lx.push(lexTop) + return lexCommentStart + case tableStart: + return lexTableStart + case eof: + if lx.pos > lx.start { + return lx.errorf("unexpected EOF") + } + lx.emit(itemEOF) + return nil + } + + // At this point, the only valid item can be a key, so we back up + // and let the key lexer do the rest. + lx.backup() + lx.push(lexTopEnd) + return lexKeyStart +} + +// lexTopEnd is entered whenever a top-level item has been consumed. (A value +// or a table.) 
It must see only whitespace, and will turn back to lexTop +// upon a newline. If it sees EOF, it will quit the lexer successfully. +func lexTopEnd(lx *lexer) stateFn { + r := lx.next() + switch { + case r == commentStart: + // a comment will read to a newline for us. + lx.push(lexTop) + return lexCommentStart + case isWhitespace(r): + return lexTopEnd + case isNL(r): + lx.ignore() + return lexTop + case r == eof: + lx.emit(itemEOF) + return nil + } + return lx.errorf("expected a top-level item to end with a newline, "+ + "comment, or EOF, but got %q instead", r) +} + +// lexTable lexes the beginning of a table. Namely, it makes sure that +// it starts with a character other than '.' and ']'. +// It assumes that '[' has already been consumed. +// It also handles the case that this is an item in an array of tables. +// e.g., '[[name]]'. +func lexTableStart(lx *lexer) stateFn { + if lx.peek() == arrayTableStart { + lx.next() + lx.emit(itemArrayTableStart) + lx.push(lexArrayTableEnd) + } else { + lx.emit(itemTableStart) + lx.push(lexTableEnd) + } + return lexTableNameStart +} + +func lexTableEnd(lx *lexer) stateFn { + lx.emit(itemTableEnd) + return lexTopEnd +} + +func lexArrayTableEnd(lx *lexer) stateFn { + if r := lx.next(); r != arrayTableEnd { + return lx.errorf("expected end of table array name delimiter %q, "+ + "but got %q instead", arrayTableEnd, r) + } + lx.emit(itemArrayTableEnd) + return lexTopEnd +} + +func lexTableNameStart(lx *lexer) stateFn { + lx.skip(isWhitespace) + switch r := lx.peek(); { + case r == tableEnd || r == eof: + return lx.errorf("unexpected end of table name " + + "(table names cannot be empty)") + case r == tableSep: + return lx.errorf("unexpected table separator " + + "(table names cannot be empty)") + case r == stringStart || r == rawStringStart: + lx.ignore() + lx.push(lexTableNameEnd) + return lexValue // reuse string lexing + default: + return lexBareTableName + } +} + +// lexBareTableName lexes the name of a table. It assumes that at least one +// valid character for the table has already been read. +func lexBareTableName(lx *lexer) stateFn { + r := lx.next() + if isBareKeyChar(r) { + return lexBareTableName + } + lx.backup() + lx.emit(itemText) + return lexTableNameEnd +} + +// lexTableNameEnd reads the end of a piece of a table name, optionally +// consuming whitespace. +func lexTableNameEnd(lx *lexer) stateFn { + lx.skip(isWhitespace) + switch r := lx.next(); { + case isWhitespace(r): + return lexTableNameEnd + case r == tableSep: + lx.ignore() + return lexTableNameStart + case r == tableEnd: + return lx.pop() + default: + return lx.errorf("expected '.' or ']' to end table name, "+ + "but got %q instead", r) + } +} + +// lexKeyStart consumes a key name up until the first non-whitespace character. +// lexKeyStart will ignore whitespace. +func lexKeyStart(lx *lexer) stateFn { + r := lx.peek() + switch { + case r == keySep: + return lx.errorf("unexpected key separator %q", keySep) + case isWhitespace(r) || isNL(r): + lx.next() + return lexSkip(lx, lexKeyStart) + case r == stringStart || r == rawStringStart: + lx.ignore() + lx.emit(itemKeyStart) + lx.push(lexKeyEnd) + return lexValue // reuse string lexing + default: + lx.ignore() + lx.emit(itemKeyStart) + return lexBareKey + } +} + +// lexBareKey consumes the text of a bare key. Assumes that the first character +// (which is not whitespace) has not yet been consumed. 
+func lexBareKey(lx *lexer) stateFn { + switch r := lx.next(); { + case isBareKeyChar(r): + return lexBareKey + case isWhitespace(r): + lx.backup() + lx.emit(itemText) + return lexKeyEnd + case r == keySep: + lx.backup() + lx.emit(itemText) + return lexKeyEnd + default: + return lx.errorf("bare keys cannot contain %q", r) + } +} + +// lexKeyEnd consumes the end of a key and trims whitespace (up to the key +// separator). +func lexKeyEnd(lx *lexer) stateFn { + switch r := lx.next(); { + case r == keySep: + return lexSkip(lx, lexValue) + case isWhitespace(r): + return lexSkip(lx, lexKeyEnd) + default: + return lx.errorf("expected key separator %q, but got %q instead", + keySep, r) + } +} + +// lexValue starts the consumption of a value anywhere a value is expected. +// lexValue will ignore whitespace. +// After a value is lexed, the last state on the next is popped and returned. +func lexValue(lx *lexer) stateFn { + // We allow whitespace to precede a value, but NOT newlines. + // In array syntax, the array states are responsible for ignoring newlines. + r := lx.next() + switch { + case isWhitespace(r): + return lexSkip(lx, lexValue) + case isDigit(r): + lx.backup() // avoid an extra state and use the same as above + return lexNumberOrDateStart + } + switch r { + case arrayStart: + lx.ignore() + lx.emit(itemArray) + return lexArrayValue + case inlineTableStart: + lx.ignore() + lx.emit(itemInlineTableStart) + return lexInlineTableValue + case stringStart: + if lx.accept(stringStart) { + if lx.accept(stringStart) { + lx.ignore() // Ignore """ + return lexMultilineString + } + lx.backup() + } + lx.ignore() // ignore the '"' + return lexString + case rawStringStart: + if lx.accept(rawStringStart) { + if lx.accept(rawStringStart) { + lx.ignore() // Ignore """ + return lexMultilineRawString + } + lx.backup() + } + lx.ignore() // ignore the "'" + return lexRawString + case '+', '-': + return lexNumberStart + case '.': // special error case, be kind to users + return lx.errorf("floats must start with a digit, not '.'") + } + if unicode.IsLetter(r) { + // Be permissive here; lexBool will give a nice error if the + // user wrote something like + // x = foo + // (i.e. not 'true' or 'false' but is something else word-like.) + lx.backup() + return lexBool + } + return lx.errorf("expected value but found %q instead", r) +} + +// lexArrayValue consumes one value in an array. It assumes that '[' or ',' +// have already been consumed. All whitespace and newlines are ignored. +func lexArrayValue(lx *lexer) stateFn { + r := lx.next() + switch { + case isWhitespace(r) || isNL(r): + return lexSkip(lx, lexArrayValue) + case r == commentStart: + lx.push(lexArrayValue) + return lexCommentStart + case r == comma: + return lx.errorf("unexpected comma") + case r == arrayEnd: + // NOTE(caleb): The spec isn't clear about whether you can have + // a trailing comma or not, so we'll allow it. + return lexArrayEnd + } + + lx.backup() + lx.push(lexArrayValueEnd) + return lexValue +} + +// lexArrayValueEnd consumes everything between the end of an array value and +// the next value (or the end of the array): it ignores whitespace and newlines +// and expects either a ',' or a ']'. 
+func lexArrayValueEnd(lx *lexer) stateFn { + r := lx.next() + switch { + case isWhitespace(r) || isNL(r): + return lexSkip(lx, lexArrayValueEnd) + case r == commentStart: + lx.push(lexArrayValueEnd) + return lexCommentStart + case r == comma: + lx.ignore() + return lexArrayValue // move on to the next value + case r == arrayEnd: + return lexArrayEnd + } + return lx.errorf( + "expected a comma or array terminator %q, but got %q instead", + arrayEnd, r, + ) +} + +// lexArrayEnd finishes the lexing of an array. +// It assumes that a ']' has just been consumed. +func lexArrayEnd(lx *lexer) stateFn { + lx.ignore() + lx.emit(itemArrayEnd) + return lx.pop() +} + +// lexInlineTableValue consumes one key/value pair in an inline table. +// It assumes that '{' or ',' have already been consumed. Whitespace is ignored. +func lexInlineTableValue(lx *lexer) stateFn { + r := lx.next() + switch { + case isWhitespace(r): + return lexSkip(lx, lexInlineTableValue) + case isNL(r): + return lx.errorf("newlines not allowed within inline tables") + case r == commentStart: + lx.push(lexInlineTableValue) + return lexCommentStart + case r == comma: + return lx.errorf("unexpected comma") + case r == inlineTableEnd: + return lexInlineTableEnd + } + lx.backup() + lx.push(lexInlineTableValueEnd) + return lexKeyStart +} + +// lexInlineTableValueEnd consumes everything between the end of an inline table +// key/value pair and the next pair (or the end of the table): +// it ignores whitespace and expects either a ',' or a '}'. +func lexInlineTableValueEnd(lx *lexer) stateFn { + r := lx.next() + switch { + case isWhitespace(r): + return lexSkip(lx, lexInlineTableValueEnd) + case isNL(r): + return lx.errorf("newlines not allowed within inline tables") + case r == commentStart: + lx.push(lexInlineTableValueEnd) + return lexCommentStart + case r == comma: + lx.ignore() + return lexInlineTableValue + case r == inlineTableEnd: + return lexInlineTableEnd + } + return lx.errorf("expected a comma or an inline table terminator %q, "+ + "but got %q instead", inlineTableEnd, r) +} + +// lexInlineTableEnd finishes the lexing of an inline table. +// It assumes that a '}' has just been consumed. +func lexInlineTableEnd(lx *lexer) stateFn { + lx.ignore() + lx.emit(itemInlineTableEnd) + return lx.pop() +} + +// lexString consumes the inner contents of a string. It assumes that the +// beginning '"' has already been consumed and ignored. +func lexString(lx *lexer) stateFn { + r := lx.next() + switch { + case r == eof: + return lx.errorf("unexpected EOF") + case isNL(r): + return lx.errorf("strings cannot contain newlines") + case r == '\\': + lx.push(lexString) + return lexStringEscape + case r == stringEnd: + lx.backup() + lx.emit(itemString) + lx.next() + lx.ignore() + return lx.pop() + } + return lexString +} + +// lexMultilineString consumes the inner contents of a string. It assumes that +// the beginning '"""' has already been consumed and ignored. +func lexMultilineString(lx *lexer) stateFn { + switch lx.next() { + case eof: + return lx.errorf("unexpected EOF") + case '\\': + return lexMultilineStringEscape + case stringEnd: + if lx.accept(stringEnd) { + if lx.accept(stringEnd) { + lx.backup() + lx.backup() + lx.backup() + lx.emit(itemMultilineString) + lx.next() + lx.next() + lx.next() + lx.ignore() + return lx.pop() + } + lx.backup() + } + } + return lexMultilineString +} + +// lexRawString consumes a raw string. Nothing can be escaped in such a string. +// It assumes that the beginning "'" has already been consumed and ignored. 
+func lexRawString(lx *lexer) stateFn { + r := lx.next() + switch { + case r == eof: + return lx.errorf("unexpected EOF") + case isNL(r): + return lx.errorf("strings cannot contain newlines") + case r == rawStringEnd: + lx.backup() + lx.emit(itemRawString) + lx.next() + lx.ignore() + return lx.pop() + } + return lexRawString +} + +// lexMultilineRawString consumes a raw string. Nothing can be escaped in such +// a string. It assumes that the beginning "'''" has already been consumed and +// ignored. +func lexMultilineRawString(lx *lexer) stateFn { + switch lx.next() { + case eof: + return lx.errorf("unexpected EOF") + case rawStringEnd: + if lx.accept(rawStringEnd) { + if lx.accept(rawStringEnd) { + lx.backup() + lx.backup() + lx.backup() + lx.emit(itemRawMultilineString) + lx.next() + lx.next() + lx.next() + lx.ignore() + return lx.pop() + } + lx.backup() + } + } + return lexMultilineRawString +} + +// lexMultilineStringEscape consumes an escaped character. It assumes that the +// preceding '\\' has already been consumed. +func lexMultilineStringEscape(lx *lexer) stateFn { + // Handle the special case first: + if isNL(lx.next()) { + return lexMultilineString + } + lx.backup() + lx.push(lexMultilineString) + return lexStringEscape(lx) +} + +func lexStringEscape(lx *lexer) stateFn { + r := lx.next() + switch r { + case 'b': + fallthrough + case 't': + fallthrough + case 'n': + fallthrough + case 'f': + fallthrough + case 'r': + fallthrough + case '"': + fallthrough + case '\\': + return lx.pop() + case 'u': + return lexShortUnicodeEscape + case 'U': + return lexLongUnicodeEscape + } + return lx.errorf("invalid escape character %q; only the following "+ + "escape characters are allowed: "+ + `\b, \t, \n, \f, \r, \", \\, \uXXXX, and \UXXXXXXXX`, r) +} + +func lexShortUnicodeEscape(lx *lexer) stateFn { + var r rune + for i := 0; i < 4; i++ { + r = lx.next() + if !isHexadecimal(r) { + return lx.errorf(`expected four hexadecimal digits after '\u', `+ + "but got %q instead", lx.current()) + } + } + return lx.pop() +} + +func lexLongUnicodeEscape(lx *lexer) stateFn { + var r rune + for i := 0; i < 8; i++ { + r = lx.next() + if !isHexadecimal(r) { + return lx.errorf(`expected eight hexadecimal digits after '\U', `+ + "but got %q instead", lx.current()) + } + } + return lx.pop() +} + +// lexNumberOrDateStart consumes either an integer, a float, or datetime. +func lexNumberOrDateStart(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexNumberOrDate + } + switch r { + case '_': + return lexNumber + case 'e', 'E': + return lexFloat + case '.': + return lx.errorf("floats must start with a digit, not '.'") + } + return lx.errorf("expected a digit but got %q", r) +} + +// lexNumberOrDate consumes either an integer, float or datetime. +func lexNumberOrDate(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexNumberOrDate + } + switch r { + case '-': + return lexDatetime + case '_': + return lexNumber + case '.', 'e', 'E': + return lexFloat + } + + lx.backup() + lx.emit(itemInteger) + return lx.pop() +} + +// lexDatetime consumes a Datetime, to a first approximation. +// The parser validates that it matches one of the accepted formats. +func lexDatetime(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexDatetime + } + switch r { + case '-', 'T', ':', '.', 'Z', '+': + return lexDatetime + } + + lx.backup() + lx.emit(itemDatetime) + return lx.pop() +} + +// lexNumberStart consumes either an integer or a float. 
It assumes that a sign +// has already been read, but that *no* digits have been consumed. +// lexNumberStart will move to the appropriate integer or float states. +func lexNumberStart(lx *lexer) stateFn { + // We MUST see a digit. Even floats have to start with a digit. + r := lx.next() + if !isDigit(r) { + if r == '.' { + return lx.errorf("floats must start with a digit, not '.'") + } + return lx.errorf("expected a digit but got %q", r) + } + return lexNumber +} + +// lexNumber consumes an integer or a float after seeing the first digit. +func lexNumber(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexNumber + } + switch r { + case '_': + return lexNumber + case '.', 'e', 'E': + return lexFloat + } + + lx.backup() + lx.emit(itemInteger) + return lx.pop() +} + +// lexFloat consumes the elements of a float. It allows any sequence of +// float-like characters, so floats emitted by the lexer are only a first +// approximation and must be validated by the parser. +func lexFloat(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexFloat + } + switch r { + case '_', '.', '-', '+', 'e', 'E': + return lexFloat + } + + lx.backup() + lx.emit(itemFloat) + return lx.pop() +} + +// lexBool consumes a bool string: 'true' or 'false. +func lexBool(lx *lexer) stateFn { + var rs []rune + for { + r := lx.next() + if !unicode.IsLetter(r) { + lx.backup() + break + } + rs = append(rs, r) + } + s := string(rs) + switch s { + case "true", "false": + lx.emit(itemBool) + return lx.pop() + } + return lx.errorf("expected value but found %q instead", s) +} + +// lexCommentStart begins the lexing of a comment. It will emit +// itemCommentStart and consume no characters, passing control to lexComment. +func lexCommentStart(lx *lexer) stateFn { + lx.ignore() + lx.emit(itemCommentStart) + return lexComment +} + +// lexComment lexes an entire comment. It assumes that '#' has been consumed. +// It will consume *up to* the first newline character, and pass control +// back to the last state on the stack. +func lexComment(lx *lexer) stateFn { + r := lx.peek() + if isNL(r) || r == eof { + lx.emit(itemText) + return lx.pop() + } + lx.next() + return lexComment +} + +// lexSkip ignores all slurped input and moves on to the next state. +func lexSkip(lx *lexer, nextState stateFn) stateFn { + return func(lx *lexer) stateFn { + lx.ignore() + return nextState + } +} + +// isWhitespace returns true if `r` is a whitespace character according +// to the spec. 
+func isWhitespace(r rune) bool { + return r == '\t' || r == ' ' +} + +func isNL(r rune) bool { + return r == '\n' || r == '\r' +} + +func isDigit(r rune) bool { + return r >= '0' && r <= '9' +} + +func isHexadecimal(r rune) bool { + return (r >= '0' && r <= '9') || + (r >= 'a' && r <= 'f') || + (r >= 'A' && r <= 'F') +} + +func isBareKeyChar(r rune) bool { + return (r >= 'A' && r <= 'Z') || + (r >= 'a' && r <= 'z') || + (r >= '0' && r <= '9') || + r == '_' || + r == '-' +} + +func (itype itemType) String() string { + switch itype { + case itemError: + return "Error" + case itemNIL: + return "NIL" + case itemEOF: + return "EOF" + case itemText: + return "Text" + case itemString, itemRawString, itemMultilineString, itemRawMultilineString: + return "String" + case itemBool: + return "Bool" + case itemInteger: + return "Integer" + case itemFloat: + return "Float" + case itemDatetime: + return "DateTime" + case itemTableStart: + return "TableStart" + case itemTableEnd: + return "TableEnd" + case itemKeyStart: + return "KeyStart" + case itemArray: + return "Array" + case itemArrayEnd: + return "ArrayEnd" + case itemCommentStart: + return "CommentStart" + } + panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype))) +} + +func (item item) String() string { + return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val) +} diff --git a/vendor/github.com/BurntSushi/toml/parse.go b/vendor/github.com/BurntSushi/toml/parse.go new file mode 100644 index 000000000000..50869ef9266e --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/parse.go @@ -0,0 +1,592 @@ +package toml + +import ( + "fmt" + "strconv" + "strings" + "time" + "unicode" + "unicode/utf8" +) + +type parser struct { + mapping map[string]interface{} + types map[string]tomlType + lx *lexer + + // A list of keys in the order that they appear in the TOML data. + ordered []Key + + // the full key for the current hash in scope + context Key + + // the base key name for everything except hashes + currentKey string + + // rough approximation of line number + approxLine int + + // A map of 'key.group.names' to whether they were created implicitly. 
+ implicits map[string]bool +} + +type parseError string + +func (pe parseError) Error() string { + return string(pe) +} + +func parse(data string) (p *parser, err error) { + defer func() { + if r := recover(); r != nil { + var ok bool + if err, ok = r.(parseError); ok { + return + } + panic(r) + } + }() + + p = &parser{ + mapping: make(map[string]interface{}), + types: make(map[string]tomlType), + lx: lex(data), + ordered: make([]Key, 0), + implicits: make(map[string]bool), + } + for { + item := p.next() + if item.typ == itemEOF { + break + } + p.topLevel(item) + } + + return p, nil +} + +func (p *parser) panicf(format string, v ...interface{}) { + msg := fmt.Sprintf("Near line %d (last key parsed '%s'): %s", + p.approxLine, p.current(), fmt.Sprintf(format, v...)) + panic(parseError(msg)) +} + +func (p *parser) next() item { + it := p.lx.nextItem() + if it.typ == itemError { + p.panicf("%s", it.val) + } + return it +} + +func (p *parser) bug(format string, v ...interface{}) { + panic(fmt.Sprintf("BUG: "+format+"\n\n", v...)) +} + +func (p *parser) expect(typ itemType) item { + it := p.next() + p.assertEqual(typ, it.typ) + return it +} + +func (p *parser) assertEqual(expected, got itemType) { + if expected != got { + p.bug("Expected '%s' but got '%s'.", expected, got) + } +} + +func (p *parser) topLevel(item item) { + switch item.typ { + case itemCommentStart: + p.approxLine = item.line + p.expect(itemText) + case itemTableStart: + kg := p.next() + p.approxLine = kg.line + + var key Key + for ; kg.typ != itemTableEnd && kg.typ != itemEOF; kg = p.next() { + key = append(key, p.keyString(kg)) + } + p.assertEqual(itemTableEnd, kg.typ) + + p.establishContext(key, false) + p.setType("", tomlHash) + p.ordered = append(p.ordered, key) + case itemArrayTableStart: + kg := p.next() + p.approxLine = kg.line + + var key Key + for ; kg.typ != itemArrayTableEnd && kg.typ != itemEOF; kg = p.next() { + key = append(key, p.keyString(kg)) + } + p.assertEqual(itemArrayTableEnd, kg.typ) + + p.establishContext(key, true) + p.setType("", tomlArrayHash) + p.ordered = append(p.ordered, key) + case itemKeyStart: + kname := p.next() + p.approxLine = kname.line + p.currentKey = p.keyString(kname) + + val, typ := p.value(p.next()) + p.setValue(p.currentKey, val) + p.setType(p.currentKey, typ) + p.ordered = append(p.ordered, p.context.add(p.currentKey)) + p.currentKey = "" + default: + p.bug("Unexpected type at top level: %s", item.typ) + } +} + +// Gets a string for a key (or part of a key in a table name). +func (p *parser) keyString(it item) string { + switch it.typ { + case itemText: + return it.val + case itemString, itemMultilineString, + itemRawString, itemRawMultilineString: + s, _ := p.value(it) + return s.(string) + default: + p.bug("Unexpected key type: %s", it.typ) + panic("unreachable") + } +} + +// value translates an expected value from the lexer into a Go value wrapped +// as an empty interface. 
+func (p *parser) value(it item) (interface{}, tomlType) { + switch it.typ { + case itemString: + return p.replaceEscapes(it.val), p.typeOfPrimitive(it) + case itemMultilineString: + trimmed := stripFirstNewline(stripEscapedWhitespace(it.val)) + return p.replaceEscapes(trimmed), p.typeOfPrimitive(it) + case itemRawString: + return it.val, p.typeOfPrimitive(it) + case itemRawMultilineString: + return stripFirstNewline(it.val), p.typeOfPrimitive(it) + case itemBool: + switch it.val { + case "true": + return true, p.typeOfPrimitive(it) + case "false": + return false, p.typeOfPrimitive(it) + } + p.bug("Expected boolean value, but got '%s'.", it.val) + case itemInteger: + if !numUnderscoresOK(it.val) { + p.panicf("Invalid integer %q: underscores must be surrounded by digits", + it.val) + } + val := strings.Replace(it.val, "_", "", -1) + num, err := strconv.ParseInt(val, 10, 64) + if err != nil { + // Distinguish integer values. Normally, it'd be a bug if the lexer + // provides an invalid integer, but it's possible that the number is + // out of range of valid values (which the lexer cannot determine). + // So mark the former as a bug but the latter as a legitimate user + // error. + if e, ok := err.(*strconv.NumError); ok && + e.Err == strconv.ErrRange { + + p.panicf("Integer '%s' is out of the range of 64-bit "+ + "signed integers.", it.val) + } else { + p.bug("Expected integer value, but got '%s'.", it.val) + } + } + return num, p.typeOfPrimitive(it) + case itemFloat: + parts := strings.FieldsFunc(it.val, func(r rune) bool { + switch r { + case '.', 'e', 'E': + return true + } + return false + }) + for _, part := range parts { + if !numUnderscoresOK(part) { + p.panicf("Invalid float %q: underscores must be "+ + "surrounded by digits", it.val) + } + } + if !numPeriodsOK(it.val) { + // As a special case, numbers like '123.' or '1.e2', + // which are valid as far as Go/strconv are concerned, + // must be rejected because TOML says that a fractional + // part consists of '.' followed by 1+ digits. + p.panicf("Invalid float %q: '.' 
must be followed "+ + "by one or more digits", it.val) + } + val := strings.Replace(it.val, "_", "", -1) + num, err := strconv.ParseFloat(val, 64) + if err != nil { + if e, ok := err.(*strconv.NumError); ok && + e.Err == strconv.ErrRange { + + p.panicf("Float '%s' is out of the range of 64-bit "+ + "IEEE-754 floating-point numbers.", it.val) + } else { + p.panicf("Invalid float value: %q", it.val) + } + } + return num, p.typeOfPrimitive(it) + case itemDatetime: + var t time.Time + var ok bool + var err error + for _, format := range []string{ + "2006-01-02T15:04:05Z07:00", + "2006-01-02T15:04:05", + "2006-01-02", + } { + t, err = time.ParseInLocation(format, it.val, time.Local) + if err == nil { + ok = true + break + } + } + if !ok { + p.panicf("Invalid TOML Datetime: %q.", it.val) + } + return t, p.typeOfPrimitive(it) + case itemArray: + array := make([]interface{}, 0) + types := make([]tomlType, 0) + + for it = p.next(); it.typ != itemArrayEnd; it = p.next() { + if it.typ == itemCommentStart { + p.expect(itemText) + continue + } + + val, typ := p.value(it) + array = append(array, val) + types = append(types, typ) + } + return array, p.typeOfArray(types) + case itemInlineTableStart: + var ( + hash = make(map[string]interface{}) + outerContext = p.context + outerKey = p.currentKey + ) + + p.context = append(p.context, p.currentKey) + p.currentKey = "" + for it := p.next(); it.typ != itemInlineTableEnd; it = p.next() { + if it.typ != itemKeyStart { + p.bug("Expected key start but instead found %q, around line %d", + it.val, p.approxLine) + } + if it.typ == itemCommentStart { + p.expect(itemText) + continue + } + + // retrieve key + k := p.next() + p.approxLine = k.line + kname := p.keyString(k) + + // retrieve value + p.currentKey = kname + val, typ := p.value(p.next()) + // make sure we keep metadata up to date + p.setType(kname, typ) + p.ordered = append(p.ordered, p.context.add(p.currentKey)) + hash[kname] = val + } + p.context = outerContext + p.currentKey = outerKey + return hash, tomlHash + } + p.bug("Unexpected value type: %s", it.typ) + panic("unreachable") +} + +// numUnderscoresOK checks whether each underscore in s is surrounded by +// characters that are not underscores. +func numUnderscoresOK(s string) bool { + accept := false + for _, r := range s { + if r == '_' { + if !accept { + return false + } + accept = false + continue + } + accept = true + } + return accept +} + +// numPeriodsOK checks whether every period in s is followed by a digit. +func numPeriodsOK(s string) bool { + period := false + for _, r := range s { + if period && !isDigit(r) { + return false + } + period = r == '.' + } + return !period +} + +// establishContext sets the current context of the parser, +// where the context is either a hash or an array of hashes. Which one is +// set depends on the value of the `array` parameter. +// +// Establishing the context also makes sure that the key isn't a duplicate, and +// will create implicit hashes automatically. +func (p *parser) establishContext(key Key, array bool) { + var ok bool + + // Always start at the top level and drill down for our context. + hashContext := p.mapping + keyContext := make(Key, 0) + + // We only need implicit hashes for key[0:-1] + for _, k := range key[0 : len(key)-1] { + _, ok = hashContext[k] + keyContext = append(keyContext, k) + + // No key? Make an implicit hash and move on. 
+ if !ok { + p.addImplicit(keyContext) + hashContext[k] = make(map[string]interface{}) + } + + // If the hash context is actually an array of tables, then set + // the hash context to the last element in that array. + // + // Otherwise, it better be a table, since this MUST be a key group (by + // virtue of it not being the last element in a key). + switch t := hashContext[k].(type) { + case []map[string]interface{}: + hashContext = t[len(t)-1] + case map[string]interface{}: + hashContext = t + default: + p.panicf("Key '%s' was already created as a hash.", keyContext) + } + } + + p.context = keyContext + if array { + // If this is the first element for this array, then allocate a new + // list of tables for it. + k := key[len(key)-1] + if _, ok := hashContext[k]; !ok { + hashContext[k] = make([]map[string]interface{}, 0, 5) + } + + // Add a new table. But make sure the key hasn't already been used + // for something else. + if hash, ok := hashContext[k].([]map[string]interface{}); ok { + hashContext[k] = append(hash, make(map[string]interface{})) + } else { + p.panicf("Key '%s' was already created and cannot be used as "+ + "an array.", keyContext) + } + } else { + p.setValue(key[len(key)-1], make(map[string]interface{})) + } + p.context = append(p.context, key[len(key)-1]) +} + +// setValue sets the given key to the given value in the current context. +// It will make sure that the key hasn't already been defined, account for +// implicit key groups. +func (p *parser) setValue(key string, value interface{}) { + var tmpHash interface{} + var ok bool + + hash := p.mapping + keyContext := make(Key, 0) + for _, k := range p.context { + keyContext = append(keyContext, k) + if tmpHash, ok = hash[k]; !ok { + p.bug("Context for key '%s' has not been established.", keyContext) + } + switch t := tmpHash.(type) { + case []map[string]interface{}: + // The context is a table of hashes. Pick the most recent table + // defined as the current hash. + hash = t[len(t)-1] + case map[string]interface{}: + hash = t + default: + p.bug("Expected hash to have type 'map[string]interface{}', but "+ + "it has '%T' instead.", tmpHash) + } + } + keyContext = append(keyContext, key) + + if _, ok := hash[key]; ok { + // Typically, if the given key has already been set, then we have + // to raise an error since duplicate keys are disallowed. However, + // it's possible that a key was previously defined implicitly. In this + // case, it is allowed to be redefined concretely. (See the + // `tests/valid/implicit-and-explicit-after.toml` test in `toml-test`.) + // + // But we have to make sure to stop marking it as an implicit. (So that + // another redefinition provokes an error.) + // + // Note that since it has already been defined (as a hash), we don't + // want to overwrite it. So our business is done. + if p.isImplicit(keyContext) { + p.removeImplicit(keyContext) + return + } + + // Otherwise, we have a concrete key trying to override a previous + // key, which is *always* wrong. + p.panicf("Key '%s' has already been defined.", keyContext) + } + hash[key] = value +} + +// setType sets the type of a particular value at a given key. +// It should be called immediately AFTER setValue. +// +// Note that if `key` is empty, then the type given will be applied to the +// current context (which is either a table or an array of tables). 
+func (p *parser) setType(key string, typ tomlType) { + keyContext := make(Key, 0, len(p.context)+1) + for _, k := range p.context { + keyContext = append(keyContext, k) + } + if len(key) > 0 { // allow type setting for hashes + keyContext = append(keyContext, key) + } + p.types[keyContext.String()] = typ +} + +// addImplicit sets the given Key as having been created implicitly. +func (p *parser) addImplicit(key Key) { + p.implicits[key.String()] = true +} + +// removeImplicit stops tagging the given key as having been implicitly +// created. +func (p *parser) removeImplicit(key Key) { + p.implicits[key.String()] = false +} + +// isImplicit returns true if the key group pointed to by the key was created +// implicitly. +func (p *parser) isImplicit(key Key) bool { + return p.implicits[key.String()] +} + +// current returns the full key name of the current context. +func (p *parser) current() string { + if len(p.currentKey) == 0 { + return p.context.String() + } + if len(p.context) == 0 { + return p.currentKey + } + return fmt.Sprintf("%s.%s", p.context, p.currentKey) +} + +func stripFirstNewline(s string) string { + if len(s) == 0 || s[0] != '\n' { + return s + } + return s[1:] +} + +func stripEscapedWhitespace(s string) string { + esc := strings.Split(s, "\\\n") + if len(esc) > 1 { + for i := 1; i < len(esc); i++ { + esc[i] = strings.TrimLeftFunc(esc[i], unicode.IsSpace) + } + } + return strings.Join(esc, "") +} + +func (p *parser) replaceEscapes(str string) string { + var replaced []rune + s := []byte(str) + r := 0 + for r < len(s) { + if s[r] != '\\' { + c, size := utf8.DecodeRune(s[r:]) + r += size + replaced = append(replaced, c) + continue + } + r += 1 + if r >= len(s) { + p.bug("Escape sequence at end of string.") + return "" + } + switch s[r] { + default: + p.bug("Expected valid escape code after \\, but got %q.", s[r]) + return "" + case 'b': + replaced = append(replaced, rune(0x0008)) + r += 1 + case 't': + replaced = append(replaced, rune(0x0009)) + r += 1 + case 'n': + replaced = append(replaced, rune(0x000A)) + r += 1 + case 'f': + replaced = append(replaced, rune(0x000C)) + r += 1 + case 'r': + replaced = append(replaced, rune(0x000D)) + r += 1 + case '"': + replaced = append(replaced, rune(0x0022)) + r += 1 + case '\\': + replaced = append(replaced, rune(0x005C)) + r += 1 + case 'u': + // At this point, we know we have a Unicode escape of the form + // `uXXXX` at [r, r+5). (Because the lexer guarantees this + // for us.) + escaped := p.asciiEscapeToUnicode(s[r+1 : r+5]) + replaced = append(replaced, escaped) + r += 5 + case 'U': + // At this point, we know we have a Unicode escape of the form + // `uXXXX` at [r, r+9). (Because the lexer guarantees this + // for us.) 
+ escaped := p.asciiEscapeToUnicode(s[r+1 : r+9]) + replaced = append(replaced, escaped) + r += 9 + } + } + return string(replaced) +} + +func (p *parser) asciiEscapeToUnicode(bs []byte) rune { + s := string(bs) + hex, err := strconv.ParseUint(strings.ToLower(s), 16, 32) + if err != nil { + p.bug("Could not parse '%s' as a hexadecimal number, but the "+ + "lexer claims it's OK: %s", s, err) + } + if !utf8.ValidRune(rune(hex)) { + p.panicf("Escaped character '\\u%s' is not valid UTF-8.", s) + } + return rune(hex) +} + +func isStringType(ty itemType) bool { + return ty == itemString || ty == itemMultilineString || + ty == itemRawString || ty == itemRawMultilineString +} diff --git a/vendor/github.com/BurntSushi/toml/type_check.go b/vendor/github.com/BurntSushi/toml/type_check.go new file mode 100644 index 000000000000..c73f8afc1a6d --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/type_check.go @@ -0,0 +1,91 @@ +package toml + +// tomlType represents any Go type that corresponds to a TOML type. +// While the first draft of the TOML spec has a simplistic type system that +// probably doesn't need this level of sophistication, we seem to be militating +// toward adding real composite types. +type tomlType interface { + typeString() string +} + +// typeEqual accepts any two types and returns true if they are equal. +func typeEqual(t1, t2 tomlType) bool { + if t1 == nil || t2 == nil { + return false + } + return t1.typeString() == t2.typeString() +} + +func typeIsHash(t tomlType) bool { + return typeEqual(t, tomlHash) || typeEqual(t, tomlArrayHash) +} + +type tomlBaseType string + +func (btype tomlBaseType) typeString() string { + return string(btype) +} + +func (btype tomlBaseType) String() string { + return btype.typeString() +} + +var ( + tomlInteger tomlBaseType = "Integer" + tomlFloat tomlBaseType = "Float" + tomlDatetime tomlBaseType = "Datetime" + tomlString tomlBaseType = "String" + tomlBool tomlBaseType = "Bool" + tomlArray tomlBaseType = "Array" + tomlHash tomlBaseType = "Hash" + tomlArrayHash tomlBaseType = "ArrayHash" +) + +// typeOfPrimitive returns a tomlType of any primitive value in TOML. +// Primitive values are: Integer, Float, Datetime, String and Bool. +// +// Passing a lexer item other than the following will cause a BUG message +// to occur: itemString, itemBool, itemInteger, itemFloat, itemDatetime. +func (p *parser) typeOfPrimitive(lexItem item) tomlType { + switch lexItem.typ { + case itemInteger: + return tomlInteger + case itemFloat: + return tomlFloat + case itemDatetime: + return tomlDatetime + case itemString: + return tomlString + case itemMultilineString: + return tomlString + case itemRawString: + return tomlString + case itemRawMultilineString: + return tomlString + case itemBool: + return tomlBool + } + p.bug("Cannot infer primitive type of lex item '%s'.", lexItem) + panic("unreachable") +} + +// typeOfArray returns a tomlType for an array given a list of types of its +// values. +// +// In the current spec, if an array is homogeneous, then its type is always +// "Array". If the array is not homogeneous, an error is generated. +func (p *parser) typeOfArray(types []tomlType) tomlType { + // Empty arrays are cool. 
+ if len(types) == 0 { + return tomlArray + } + + theType := types[0] + for _, t := range types[1:] { + if !typeEqual(theType, t) { + p.panicf("Array contains values of type '%s' and '%s', but "+ + "arrays must be homogeneous.", theType, t) + } + } + return tomlArray +} diff --git a/vendor/github.com/BurntSushi/toml/type_fields.go b/vendor/github.com/BurntSushi/toml/type_fields.go new file mode 100644 index 000000000000..608997c22f68 --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/type_fields.go @@ -0,0 +1,242 @@ +package toml + +// Struct field handling is adapted from code in encoding/json: +// +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the Go distribution. + +import ( + "reflect" + "sort" + "sync" +) + +// A field represents a single field found in a struct. +type field struct { + name string // the name of the field (`toml` tag included) + tag bool // whether field has a `toml` tag + index []int // represents the depth of an anonymous field + typ reflect.Type // the type of the field +} + +// byName sorts field by name, breaking ties with depth, +// then breaking ties with "name came from toml tag", then +// breaking ties with index sequence. +type byName []field + +func (x byName) Len() int { return len(x) } + +func (x byName) Swap(i, j int) { x[i], x[j] = x[j], x[i] } + +func (x byName) Less(i, j int) bool { + if x[i].name != x[j].name { + return x[i].name < x[j].name + } + if len(x[i].index) != len(x[j].index) { + return len(x[i].index) < len(x[j].index) + } + if x[i].tag != x[j].tag { + return x[i].tag + } + return byIndex(x).Less(i, j) +} + +// byIndex sorts field by index sequence. +type byIndex []field + +func (x byIndex) Len() int { return len(x) } + +func (x byIndex) Swap(i, j int) { x[i], x[j] = x[j], x[i] } + +func (x byIndex) Less(i, j int) bool { + for k, xik := range x[i].index { + if k >= len(x[j].index) { + return false + } + if xik != x[j].index[k] { + return xik < x[j].index[k] + } + } + return len(x[i].index) < len(x[j].index) +} + +// typeFields returns a list of fields that TOML should recognize for the given +// type. The algorithm is breadth-first search over the set of structs to +// include - the top struct and then any reachable anonymous structs. +func typeFields(t reflect.Type) []field { + // Anonymous fields to explore at the current level and the next. + current := []field{} + next := []field{{typ: t}} + + // Count of queued names for current level and the next. + count := map[reflect.Type]int{} + nextCount := map[reflect.Type]int{} + + // Types already visited at an earlier level. + visited := map[reflect.Type]bool{} + + // Fields found. + var fields []field + + for len(next) > 0 { + current, next = next, current[:0] + count, nextCount = nextCount, map[reflect.Type]int{} + + for _, f := range current { + if visited[f.typ] { + continue + } + visited[f.typ] = true + + // Scan f.typ for fields to include. + for i := 0; i < f.typ.NumField(); i++ { + sf := f.typ.Field(i) + if sf.PkgPath != "" && !sf.Anonymous { // unexported + continue + } + opts := getOptions(sf.Tag) + if opts.skip { + continue + } + index := make([]int, len(f.index)+1) + copy(index, f.index) + index[len(f.index)] = i + + ft := sf.Type + if ft.Name() == "" && ft.Kind() == reflect.Ptr { + // Follow pointer. + ft = ft.Elem() + } + + // Record found field and index sequence. 
+ if opts.name != "" || !sf.Anonymous || ft.Kind() != reflect.Struct { + tagged := opts.name != "" + name := opts.name + if name == "" { + name = sf.Name + } + fields = append(fields, field{name, tagged, index, ft}) + if count[f.typ] > 1 { + // If there were multiple instances, add a second, + // so that the annihilation code will see a duplicate. + // It only cares about the distinction between 1 or 2, + // so don't bother generating any more copies. + fields = append(fields, fields[len(fields)-1]) + } + continue + } + + // Record new anonymous struct to explore in next round. + nextCount[ft]++ + if nextCount[ft] == 1 { + f := field{name: ft.Name(), index: index, typ: ft} + next = append(next, f) + } + } + } + } + + sort.Sort(byName(fields)) + + // Delete all fields that are hidden by the Go rules for embedded fields, + // except that fields with TOML tags are promoted. + + // The fields are sorted in primary order of name, secondary order + // of field index length. Loop over names; for each name, delete + // hidden fields by choosing the one dominant field that survives. + out := fields[:0] + for advance, i := 0, 0; i < len(fields); i += advance { + // One iteration per name. + // Find the sequence of fields with the name of this first field. + fi := fields[i] + name := fi.name + for advance = 1; i+advance < len(fields); advance++ { + fj := fields[i+advance] + if fj.name != name { + break + } + } + if advance == 1 { // Only one field with this name + out = append(out, fi) + continue + } + dominant, ok := dominantField(fields[i : i+advance]) + if ok { + out = append(out, dominant) + } + } + + fields = out + sort.Sort(byIndex(fields)) + + return fields +} + +// dominantField looks through the fields, all of which are known to +// have the same name, to find the single field that dominates the +// others using Go's embedding rules, modified by the presence of +// TOML tags. If there are multiple top-level fields, the boolean +// will be false: This condition is an error in Go and we skip all +// the fields. +func dominantField(fields []field) (field, bool) { + // The fields are sorted in increasing index-length order. The winner + // must therefore be one with the shortest index length. Drop all + // longer entries, which is easy: just truncate the slice. + length := len(fields[0].index) + tagged := -1 // Index of first tagged field. + for i, f := range fields { + if len(f.index) > length { + fields = fields[:i] + break + } + if f.tag { + if tagged >= 0 { + // Multiple tagged fields at the same level: conflict. + // Return no field. + return field{}, false + } + tagged = i + } + } + if tagged >= 0 { + return fields[tagged], true + } + // All remaining fields have the same length. If there's more than one, + // we have a conflict (two fields named "X" at the same level) and we + // return no field. + if len(fields) > 1 { + return field{}, false + } + return fields[0], true +} + +var fieldCache struct { + sync.RWMutex + m map[reflect.Type][]field +} + +// cachedTypeFields is like typeFields but uses a cache to avoid repeated work. +func cachedTypeFields(t reflect.Type) []field { + fieldCache.RLock() + f := fieldCache.m[t] + fieldCache.RUnlock() + if f != nil { + return f + } + + // Compute fields without lock. + // Might duplicate effort but won't hold other computations back. 
+ f = typeFields(t) + if f == nil { + f = []field{} + } + + fieldCache.Lock() + if fieldCache.m == nil { + fieldCache.m = map[reflect.Type][]field{} + } + fieldCache.m[t] = f + fieldCache.Unlock() + return f +} diff --git a/vendor/github.com/couchbase/gomemcached/LICENSE b/vendor/github.com/couchbase/gomemcached/LICENSE new file mode 100644 index 000000000000..b01ef80261ec --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2013 Dustin Sallings + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/github.com/couchbase/gomemcached/client/mc.go b/vendor/github.com/couchbase/gomemcached/client/mc.go new file mode 100644 index 000000000000..bd1433ba28fb --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/client/mc.go @@ -0,0 +1,1074 @@ +// Package memcached provides a memcached binary protocol client. 
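+//
+// A minimal usage sketch (illustrative only; the address, vbucket and key
+// below are placeholders, and error handling is elided):
+//
+//	client, err := memcached.Connect("tcp", "localhost:11211")
+//	if err != nil {
+//		// handle the connection error
+//	}
+//	defer client.Close()
+//	res, err := client.Get(0, "some-key")
+//	if err == nil {
+//		fmt.Printf("value: %s\n", res.Body)
+//	}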
+package memcached + +import ( + "encoding/binary" + "fmt" + "github.com/couchbase/gomemcached" + "github.com/couchbase/goutils/logging" + "github.com/couchbase/goutils/scramsha" + "github.com/pkg/errors" + "io" + "math" + "net" + "strings" + "sync" + "sync/atomic" + "time" +) + +type ClientIface interface { + Add(vb uint16, key string, flags int, exp int, body []byte) (*gomemcached.MCResponse, error) + Append(vb uint16, key string, data []byte) (*gomemcached.MCResponse, error) + Auth(user, pass string) (*gomemcached.MCResponse, error) + AuthList() (*gomemcached.MCResponse, error) + AuthPlain(user, pass string) (*gomemcached.MCResponse, error) + AuthScramSha(user, pass string) (*gomemcached.MCResponse, error) + CASNext(vb uint16, k string, exp int, state *CASState) bool + CAS(vb uint16, k string, f CasFunc, initexp int) (*gomemcached.MCResponse, error) + Close() error + Decr(vb uint16, key string, amt, def uint64, exp int) (uint64, error) + Del(vb uint16, key string) (*gomemcached.MCResponse, error) + EnableMutationToken() (*gomemcached.MCResponse, error) + Get(vb uint16, key string) (*gomemcached.MCResponse, error) + GetSubdoc(vb uint16, key string, subPaths []string) (*gomemcached.MCResponse, error) + GetAndTouch(vb uint16, key string, exp int) (*gomemcached.MCResponse, error) + GetBulk(vb uint16, keys []string, rv map[string]*gomemcached.MCResponse, subPaths []string) error + GetMeta(vb uint16, key string) (*gomemcached.MCResponse, error) + GetRandomDoc() (*gomemcached.MCResponse, error) + Hijack() io.ReadWriteCloser + Incr(vb uint16, key string, amt, def uint64, exp int) (uint64, error) + Observe(vb uint16, key string) (result ObserveResult, err error) + ObserveSeq(vb uint16, vbuuid uint64) (result *ObserveSeqResult, err error) + Receive() (*gomemcached.MCResponse, error) + ReceiveWithDeadline(deadline time.Time) (*gomemcached.MCResponse, error) + Send(req *gomemcached.MCRequest) (rv *gomemcached.MCResponse, err error) + Set(vb uint16, key string, flags int, exp int, body []byte) (*gomemcached.MCResponse, error) + SetKeepAliveOptions(interval time.Duration) + SetReadDeadline(t time.Time) + SetDeadline(t time.Time) + SelectBucket(bucket string) (*gomemcached.MCResponse, error) + SetCas(vb uint16, key string, flags int, exp int, cas uint64, body []byte) (*gomemcached.MCResponse, error) + Stats(key string) ([]StatValue, error) + StatsMap(key string) (map[string]string, error) + StatsMapForSpecifiedStats(key string, statsMap map[string]string) error + Transmit(req *gomemcached.MCRequest) error + TransmitWithDeadline(req *gomemcached.MCRequest, deadline time.Time) error + TransmitResponse(res *gomemcached.MCResponse) error + + // UprFeed Related + NewUprFeed() (*UprFeed, error) + NewUprFeedIface() (UprFeedIface, error) + NewUprFeedWithConfig(ackByClient bool) (*UprFeed, error) + NewUprFeedWithConfigIface(ackByClient bool) (UprFeedIface, error) + UprGetFailoverLog(vb []uint16) (map[uint16]*FailoverLog, error) +} + +const bufsize = 1024 + +var UnHealthy uint32 = 0 +var Healthy uint32 = 1 + +type Features []Feature +type Feature uint16 + +const FeatureMutationToken = Feature(0x04) +const FeatureXattr = Feature(0x06) +const FeatureDataType = Feature(0x0b) + +// The Client itself. 
+type Client struct { + conn io.ReadWriteCloser + // use uint32 type so that it can be accessed through atomic APIs + healthy uint32 + opaque uint32 + + hdrBuf []byte +} + +var ( + DefaultDialTimeout = time.Duration(0) // No timeout + + DefaultWriteTimeout = time.Duration(0) // No timeout + + dialFun = func(prot, dest string) (net.Conn, error) { + return net.DialTimeout(prot, dest, DefaultDialTimeout) + } +) + +// Connect to a memcached server. +func Connect(prot, dest string) (rv *Client, err error) { + conn, err := dialFun(prot, dest) + if err != nil { + return nil, err + } + return Wrap(conn) +} + +func SetDefaultTimeouts(dial, read, write time.Duration) { + DefaultDialTimeout = dial + DefaultWriteTimeout = write +} + +func SetDefaultDialTimeout(dial time.Duration) { + DefaultDialTimeout = dial +} + +func (c *Client) SetKeepAliveOptions(interval time.Duration) { + c.conn.(*net.TCPConn).SetKeepAlive(true) + c.conn.(*net.TCPConn).SetKeepAlivePeriod(interval) +} + +func (c *Client) SetReadDeadline(t time.Time) { + c.conn.(*net.TCPConn).SetReadDeadline(t) +} + +func (c *Client) SetDeadline(t time.Time) { + c.conn.(*net.TCPConn).SetDeadline(t) +} + +// Wrap an existing transport. +func Wrap(rwc io.ReadWriteCloser) (rv *Client, err error) { + client := &Client{ + conn: rwc, + hdrBuf: make([]byte, gomemcached.HDR_LEN), + opaque: uint32(1), + } + client.setHealthy(true) + return client, nil +} + +// Close the connection when you're done. +func (c *Client) Close() error { + return c.conn.Close() +} + +// IsHealthy returns true unless the client is belived to have +// difficulty communicating to its server. +// +// This is useful for connection pools where we want to +// non-destructively determine that a connection may be reused. +func (c Client) IsHealthy() bool { + healthyState := atomic.LoadUint32(&c.healthy) + return healthyState == Healthy +} + +// Send a custom request and get the response. +func (c *Client) Send(req *gomemcached.MCRequest) (rv *gomemcached.MCResponse, err error) { + err = c.Transmit(req) + if err != nil { + return + } + resp, _, err := getResponse(c.conn, c.hdrBuf) + c.setHealthy(!gomemcached.IsFatal(err)) + return resp, err +} + +// Transmit send a request, but does not wait for a response. +func (c *Client) Transmit(req *gomemcached.MCRequest) error { + if DefaultWriteTimeout > 0 { + c.conn.(net.Conn).SetWriteDeadline(time.Now().Add(DefaultWriteTimeout)) + } + _, err := transmitRequest(c.conn, req) + // clear write deadline to avoid interference with future write operations + if DefaultWriteTimeout > 0 { + c.conn.(net.Conn).SetWriteDeadline(time.Time{}) + } + if err != nil { + c.setHealthy(false) + } + return err +} + +func (c *Client) TransmitWithDeadline(req *gomemcached.MCRequest, deadline time.Time) error { + c.conn.(net.Conn).SetWriteDeadline(deadline) + + _, err := transmitRequest(c.conn, req) + + // clear write deadline to avoid interference with future write operations + c.conn.(net.Conn).SetWriteDeadline(time.Time{}) + + if err != nil { + c.setHealthy(false) + } + return err +} + +// TransmitResponse send a response, does not wait. 
+func (c *Client) TransmitResponse(res *gomemcached.MCResponse) error { + if DefaultWriteTimeout > 0 { + c.conn.(net.Conn).SetWriteDeadline(time.Now().Add(DefaultWriteTimeout)) + } + _, err := transmitResponse(c.conn, res) + // clear write deadline to avoid interference with future write operations + if DefaultWriteTimeout > 0 { + c.conn.(net.Conn).SetWriteDeadline(time.Time{}) + } + if err != nil { + c.setHealthy(false) + } + return err +} + +// Receive a response +func (c *Client) Receive() (*gomemcached.MCResponse, error) { + resp, _, err := getResponse(c.conn, c.hdrBuf) + if err != nil && resp.Status != gomemcached.KEY_ENOENT && resp.Status != gomemcached.EBUSY { + c.setHealthy(false) + } + return resp, err +} + +func (c *Client) ReceiveWithDeadline(deadline time.Time) (*gomemcached.MCResponse, error) { + c.conn.(net.Conn).SetReadDeadline(deadline) + + resp, _, err := getResponse(c.conn, c.hdrBuf) + + // Clear read deadline to avoid interference with future read operations. + c.conn.(net.Conn).SetReadDeadline(time.Time{}) + + if err != nil && resp.Status != gomemcached.KEY_ENOENT && resp.Status != gomemcached.EBUSY { + c.setHealthy(false) + } + return resp, err +} + +func appendMutationToken(bytes []byte) []byte { + bytes = append(bytes, 0, 0) + binary.BigEndian.PutUint16(bytes[len(bytes)-2:], uint16(0x04)) + return bytes +} + +//Send a hello command to enable MutationTokens +func (c *Client) EnableMutationToken() (*gomemcached.MCResponse, error) { + var payload []byte + payload = appendMutationToken(payload) + + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.HELLO, + Key: []byte("GoMemcached"), + Body: payload, + }) + +} + +//Send a hello command to enable specific features +func (c *Client) EnableFeatures(features Features) (*gomemcached.MCResponse, error) { + var payload []byte + + for _, feature := range features { + payload = append(payload, 0, 0) + binary.BigEndian.PutUint16(payload[len(payload)-2:], uint16(feature)) + } + + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.HELLO, + Key: []byte("GoMemcached"), + Body: payload, + }) + +} + +// Get the value for a key. +func (c *Client) Get(vb uint16, key string) (*gomemcached.MCResponse, error) { + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.GET, + VBucket: vb, + Key: []byte(key), + }) +} + +// Get the xattrs, doc value for the input key +func (c *Client) GetSubdoc(vb uint16, key string, subPaths []string) (*gomemcached.MCResponse, error) { + + extraBuf, valueBuf := GetSubDocVal(subPaths) + res, err := c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.SUBDOC_MULTI_LOOKUP, + VBucket: vb, + Key: []byte(key), + Extras: extraBuf, + Body: valueBuf, + }) + + if err != nil && IfResStatusError(res) { + return res, err + } + return res, nil +} + +// Get the value for a key, and update expiry +func (c *Client) GetAndTouch(vb uint16, key string, exp int) (*gomemcached.MCResponse, error) { + extraBuf := make([]byte, 4) + binary.BigEndian.PutUint32(extraBuf[0:], uint32(exp)) + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.GAT, + VBucket: vb, + Key: []byte(key), + Extras: extraBuf, + }) +} + +// Get metadata for a key +func (c *Client) GetMeta(vb uint16, key string) (*gomemcached.MCResponse, error) { + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.GET_META, + VBucket: vb, + Key: []byte(key), + }) +} + +// Del deletes a key. 
+func (c *Client) Del(vb uint16, key string) (*gomemcached.MCResponse, error) { + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.DELETE, + VBucket: vb, + Key: []byte(key)}) +} + +// Get a random document +func (c *Client) GetRandomDoc() (*gomemcached.MCResponse, error) { + return c.Send(&gomemcached.MCRequest{ + Opcode: 0xB6, + }) +} + +// AuthList lists SASL auth mechanisms. +func (c *Client) AuthList() (*gomemcached.MCResponse, error) { + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.SASL_LIST_MECHS}) +} + +// Auth performs SASL PLAIN authentication against the server. +func (c *Client) Auth(user, pass string) (*gomemcached.MCResponse, error) { + res, err := c.AuthList() + + if err != nil { + return res, err + } + + authMech := string(res.Body) + if strings.Index(authMech, "PLAIN") != -1 { + return c.AuthPlain(user, pass) + } + return nil, fmt.Errorf("auth mechanism PLAIN not supported") +} + +// AuthScramSha performs SCRAM-SHA authentication against the server. +func (c *Client) AuthScramSha(user, pass string) (*gomemcached.MCResponse, error) { + res, err := c.AuthList() + if err != nil { + return nil, errors.Wrap(err, "Unable to obtain list of methods.") + } + + methods := string(res.Body) + method, err := scramsha.BestMethod(methods) + if err != nil { + return nil, errors.Wrap(err, + "Unable to select SCRAM-SHA method.") + } + + s, err := scramsha.NewScramSha(method) + if err != nil { + return nil, errors.Wrap(err, "Unable to initialize scramsha.") + } + + logging.Infof("Using %v authentication for user %v%v%v", method, gomemcached.UdTagBegin, user, gomemcached.UdTagEnd) + + message, err := s.GetStartRequest(user) + if err != nil { + return nil, errors.Wrapf(err, + "Error building start request for user %s.", user) + } + + startRequest := &gomemcached.MCRequest{ + Opcode: 0x21, + Key: []byte(method), + Body: []byte(message)} + + startResponse, err := c.Send(startRequest) + if err != nil { + return nil, errors.Wrap(err, "Error sending start request.") + } + + err = s.HandleStartResponse(string(startResponse.Body)) + if err != nil { + return nil, errors.Wrap(err, "Error handling start response.") + } + + message = s.GetFinalRequest(pass) + + // send step request + finalRequest := &gomemcached.MCRequest{ + Opcode: 0x22, + Key: []byte(method), + Body: []byte(message)} + finalResponse, err := c.Send(finalRequest) + if err != nil { + return nil, errors.Wrap(err, "Error sending final request.") + } + + err = s.HandleFinalResponse(string(finalResponse.Body)) + if err != nil { + return nil, errors.Wrap(err, "Error handling final response.") + } + + return finalResponse, nil +} + +func (c *Client) AuthPlain(user, pass string) (*gomemcached.MCResponse, error) { + logging.Infof("Using plain authentication for user %v%v%v", gomemcached.UdTagBegin, user, gomemcached.UdTagEnd) + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.SASL_AUTH, + Key: []byte("PLAIN"), + Body: []byte(fmt.Sprintf("\x00%s\x00%s", user, pass))}) +} + +// select bucket +func (c *Client) SelectBucket(bucket string) (*gomemcached.MCResponse, error) { + + return c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.SELECT_BUCKET, + Key: []byte(fmt.Sprintf("%s", bucket))}) +} + +func (c *Client) store(opcode gomemcached.CommandCode, vb uint16, + key string, flags int, exp int, body []byte) (*gomemcached.MCResponse, error) { + + req := &gomemcached.MCRequest{ + Opcode: opcode, + VBucket: vb, + Key: []byte(key), + Cas: 0, + Opaque: 0, + Extras: []byte{0, 0, 0, 0, 0, 0, 0, 0}, + Body: body} + + 
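// Pack the item flags into the upper 32 bits and the expiration time into the lower 32 bits of the 8-byte Extras field. +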
binary.BigEndian.PutUint64(req.Extras, uint64(flags)<<32|uint64(exp)) + return c.Send(req) +} + +func (c *Client) storeCas(opcode gomemcached.CommandCode, vb uint16, + key string, flags int, exp int, cas uint64, body []byte) (*gomemcached.MCResponse, error) { + + req := &gomemcached.MCRequest{ + Opcode: opcode, + VBucket: vb, + Key: []byte(key), + Cas: cas, + Opaque: 0, + Extras: []byte{0, 0, 0, 0, 0, 0, 0, 0}, + Body: body} + + binary.BigEndian.PutUint64(req.Extras, uint64(flags)<<32|uint64(exp)) + return c.Send(req) +} + +// Incr increments the value at the given key. +func (c *Client) Incr(vb uint16, key string, + amt, def uint64, exp int) (uint64, error) { + + req := &gomemcached.MCRequest{ + Opcode: gomemcached.INCREMENT, + VBucket: vb, + Key: []byte(key), + Extras: make([]byte, 8+8+4), + } + binary.BigEndian.PutUint64(req.Extras[:8], amt) + binary.BigEndian.PutUint64(req.Extras[8:16], def) + binary.BigEndian.PutUint32(req.Extras[16:20], uint32(exp)) + + resp, err := c.Send(req) + if err != nil { + return 0, err + } + + return binary.BigEndian.Uint64(resp.Body), nil +} + +// Decr decrements the value at the given key. +func (c *Client) Decr(vb uint16, key string, + amt, def uint64, exp int) (uint64, error) { + + req := &gomemcached.MCRequest{ + Opcode: gomemcached.DECREMENT, + VBucket: vb, + Key: []byte(key), + Extras: make([]byte, 8+8+4), + } + binary.BigEndian.PutUint64(req.Extras[:8], amt) + binary.BigEndian.PutUint64(req.Extras[8:16], def) + binary.BigEndian.PutUint32(req.Extras[16:20], uint32(exp)) + + resp, err := c.Send(req) + if err != nil { + return 0, err + } + + return binary.BigEndian.Uint64(resp.Body), nil +} + +// Add a value for a key (store if not exists). +func (c *Client) Add(vb uint16, key string, flags int, exp int, + body []byte) (*gomemcached.MCResponse, error) { + return c.store(gomemcached.ADD, vb, key, flags, exp, body) +} + +// Set the value for a key. +func (c *Client) Set(vb uint16, key string, flags int, exp int, + body []byte) (*gomemcached.MCResponse, error) { + return c.store(gomemcached.SET, vb, key, flags, exp, body) +} + +// SetCas set the value for a key with cas +func (c *Client) SetCas(vb uint16, key string, flags int, exp int, cas uint64, + body []byte) (*gomemcached.MCResponse, error) { + return c.storeCas(gomemcached.SET, vb, key, flags, exp, cas, body) +} + +// Append data to the value of a key. 
+func (c *Client) Append(vb uint16, key string, data []byte) (*gomemcached.MCResponse, error) { + req := &gomemcached.MCRequest{ + Opcode: gomemcached.APPEND, + VBucket: vb, + Key: []byte(key), + Cas: 0, + Opaque: 0, + Body: data} + + return c.Send(req) +} + +// GetBulk gets keys in bulk +func (c *Client) GetBulk(vb uint16, keys []string, rv map[string]*gomemcached.MCResponse, subPaths []string) error { + stopch := make(chan bool) + var wg sync.WaitGroup + + defer func() { + close(stopch) + wg.Wait() + }() + + if (math.MaxInt32 - c.opaque) < (uint32(len(keys)) + 1) { + c.opaque = uint32(1) + } + + opStart := c.opaque + + errch := make(chan error, 2) + + wg.Add(1) + go func() { + defer func() { + if r := recover(); r != nil { + logging.Infof("Recovered in f %v", r) + } + errch <- nil + wg.Done() + }() + + ok := true + for ok { + + select { + case <-stopch: + return + default: + res, err := c.Receive() + + if err != nil && IfResStatusError(res) { + if res == nil || res.Status != gomemcached.KEY_ENOENT { + errch <- err + return + } + // continue receiving in case of KEY_ENOENT + } else if res.Opcode == gomemcached.GET || + res.Opcode == gomemcached.SUBDOC_GET || + res.Opcode == gomemcached.SUBDOC_MULTI_LOOKUP { + opaque := res.Opaque - opStart + if opaque < 0 || opaque >= uint32(len(keys)) { + // Every now and then we seem to be seeing an invalid opaque + // value returned from the server. When this happens log the error + // and the calling function will retry the bulkGet. MB-15140 + logging.Errorf(" Invalid opaque Value. Debug info : Res.opaque : %v(%v), Keys %v, Response received %v \n key list %v this key %v", res.Opaque, opaque, len(keys), res, keys, string(res.Body)) + errch <- fmt.Errorf("Out of Bounds error") + return + } + + rv[keys[opaque]] = res + } + + if res.Opcode == gomemcached.NOOP { + ok = false + } + } + } + }() + + memcachedReqPkt := &gomemcached.MCRequest{ + Opcode: gomemcached.GET, + VBucket: vb, + } + + if len(subPaths) > 0 { + extraBuf, valueBuf := GetSubDocVal(subPaths) + memcachedReqPkt.Opcode = gomemcached.SUBDOC_MULTI_LOOKUP + memcachedReqPkt.Extras = extraBuf + memcachedReqPkt.Body = valueBuf + } + + for _, k := range keys { // Start of Get request + memcachedReqPkt.Key = []byte(k) + memcachedReqPkt.Opaque = c.opaque + + err := c.Transmit(memcachedReqPkt) + if err != nil { + logging.Errorf(" Transmit failed in GetBulkAll %v", err) + return err + } + c.opaque++ + } // End of Get request + + // finally transmit a NOOP + err := c.Transmit(&gomemcached.MCRequest{ + Opcode: gomemcached.NOOP, + VBucket: vb, + Opaque: c.opaque, + }) + + if err != nil { + logging.Errorf(" Transmit of NOOP failed %v", err) + return err + } + c.opaque++ + + return <-errch +} + +func GetSubDocVal(subPaths []string) (extraBuf, valueBuf []byte) { + + var ops []string + totalBytesLen := 0 + num := 1 + + for _, v := range subPaths { + totalBytesLen = totalBytesLen + len([]byte(v)) + ops = append(ops, v) + num = num + 1 + } + + // Xattr retrieval - subdoc multi get + extraBuf = append(extraBuf, uint8(0x04)) + + valueBuf = make([]byte, num*4+totalBytesLen) + + //opcode for subdoc get + op := gomemcached.SUBDOC_GET + + // Calculate path total bytes + // There are 2 ops - get xattrs - both input and $document and get whole doc + valIter := 0 + + for _, v := range ops { + pathBytes := []byte(v) + valueBuf[valIter+0] = uint8(op) + + // SubdocFlagXattrPath indicates that the path refers to + // an Xattr rather than the document body. 
+ valueBuf[valIter+1] = uint8(gomemcached.SUBDOC_FLAG_XATTR) + + // 2 byte key + binary.BigEndian.PutUint16(valueBuf[valIter+2:], uint16(len(pathBytes))) + + // Then n bytes path + copy(valueBuf[valIter+4:], pathBytes) + valIter = valIter + 4 + len(pathBytes) + } + + return +} + +// ObservedStatus is the type reported by the Observe method +type ObservedStatus uint8 + +// Observation status values. +const ( + ObservedNotPersisted = ObservedStatus(0x00) // found, not persisted + ObservedPersisted = ObservedStatus(0x01) // found, persisted + ObservedNotFound = ObservedStatus(0x80) // not found (or a persisted delete) + ObservedLogicallyDeleted = ObservedStatus(0x81) // pending deletion (not persisted yet) +) + +// ObserveResult represents the data obtained by an Observe call +type ObserveResult struct { + Status ObservedStatus // Whether the value has been persisted/deleted + Cas uint64 // Current value's CAS + PersistenceTime time.Duration // Node's average time to persist a value + ReplicationTime time.Duration // Node's average time to replicate a value +} + +// Observe gets the persistence/replication/CAS state of a key +func (c *Client) Observe(vb uint16, key string) (result ObserveResult, err error) { + // http://www.couchbase.com/wiki/display/couchbase/Observe + body := make([]byte, 4+len(key)) + binary.BigEndian.PutUint16(body[0:2], vb) + binary.BigEndian.PutUint16(body[2:4], uint16(len(key))) + copy(body[4:4+len(key)], key) + + res, err := c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.OBSERVE, + VBucket: vb, + Body: body, + }) + if err != nil { + return + } + + // Parse the response data from the body: + if len(res.Body) < 2+2+1 { + err = io.ErrUnexpectedEOF + return + } + outVb := binary.BigEndian.Uint16(res.Body[0:2]) + keyLen := binary.BigEndian.Uint16(res.Body[2:4]) + if len(res.Body) < 2+2+int(keyLen)+1+8 { + err = io.ErrUnexpectedEOF + return + } + outKey := string(res.Body[4 : 4+keyLen]) + if outVb != vb || outKey != key { + err = fmt.Errorf("observe returned wrong vbucket/key: %d/%q", outVb, outKey) + return + } + result.Status = ObservedStatus(res.Body[4+keyLen]) + result.Cas = binary.BigEndian.Uint64(res.Body[5+keyLen:]) + // The response reuses the Cas field to store time statistics: + result.PersistenceTime = time.Duration(res.Cas>>32) * time.Millisecond + result.ReplicationTime = time.Duration(res.Cas&math.MaxUint32) * time.Millisecond + return +} + +// CheckPersistence checks whether a stored value has been persisted to disk yet. 
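+//
+// A typical illustrative check after an Observe call, where cas is the CAS
+// value returned by the original mutation:
+//
+//	persisted, overwritten := result.CheckPersistence(cas, false)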
+func (result ObserveResult) CheckPersistence(cas uint64, deletion bool) (persisted bool, overwritten bool) { + switch { + case result.Status == ObservedNotFound && deletion: + persisted = true + case result.Cas != cas: + overwritten = true + case result.Status == ObservedPersisted: + persisted = true + } + return +} + +// Sequence number based Observe Implementation +type ObserveSeqResult struct { + Failover uint8 // Set to 1 if a failover took place + VbId uint16 // vbucket id + Vbuuid uint64 // vucket uuid + LastPersistedSeqNo uint64 // last persisted sequence number + CurrentSeqNo uint64 // current sequence number + OldVbuuid uint64 // Old bucket vbuuid + LastSeqNo uint64 // last sequence number received before failover +} + +func (c *Client) ObserveSeq(vb uint16, vbuuid uint64) (result *ObserveSeqResult, err error) { + // http://www.couchbase.com/wiki/display/couchbase/Observe + body := make([]byte, 8) + binary.BigEndian.PutUint64(body[0:8], vbuuid) + + res, err := c.Send(&gomemcached.MCRequest{ + Opcode: gomemcached.OBSERVE_SEQNO, + VBucket: vb, + Body: body, + Opaque: 0x01, + }) + if err != nil { + return + } + + if res.Status != gomemcached.SUCCESS { + return nil, fmt.Errorf(" Observe returned error %v", res.Status) + } + + // Parse the response data from the body: + if len(res.Body) < (1 + 2 + 8 + 8 + 8) { + err = io.ErrUnexpectedEOF + return + } + + result = &ObserveSeqResult{} + result.Failover = res.Body[0] + result.VbId = binary.BigEndian.Uint16(res.Body[1:3]) + result.Vbuuid = binary.BigEndian.Uint64(res.Body[3:11]) + result.LastPersistedSeqNo = binary.BigEndian.Uint64(res.Body[11:19]) + result.CurrentSeqNo = binary.BigEndian.Uint64(res.Body[19:27]) + + // in case of failover processing we can have old vbuuid and the last persisted seq number + if result.Failover == 1 && len(res.Body) >= (1+2+8+8+8+8+8) { + result.OldVbuuid = binary.BigEndian.Uint64(res.Body[27:35]) + result.LastSeqNo = binary.BigEndian.Uint64(res.Body[35:43]) + } + + return +} + +// CasOp is the type of operation to perform on this CAS loop. +type CasOp uint8 + +const ( + // CASStore instructs the server to store the new value normally + CASStore = CasOp(iota) + // CASQuit instructs the client to stop attempting to CAS, leaving value untouched + CASQuit + // CASDelete instructs the server to delete the current value + CASDelete +) + +// User specified termination is returned as an error. +func (c CasOp) Error() string { + switch c { + case CASStore: + return "CAS store" + case CASQuit: + return "CAS quit" + case CASDelete: + return "CAS delete" + } + panic("Unhandled value") +} + +//////// CAS TRANSFORM + +// CASState tracks the state of CAS over several operations. +// +// This is used directly by CASNext and indirectly by CAS +type CASState struct { + initialized bool // false on the first call to CASNext, then true + Value []byte // Current value of key; update in place to new value + Cas uint64 // Current CAS value of key + Exists bool // Does a value exist for the key? (If not, Value will be nil) + Err error // Error, if any, after CASNext returns false + resp *gomemcached.MCResponse +} + +// CASNext is a non-callback, loop-based version of CAS method. +// +// Usage is like this: +// +// var state memcached.CASState +// for client.CASNext(vb, key, exp, &state) { +// state.Value = some_mutation(state.Value) +// } +// if state.Err != nil { ... 
} +func (c *Client) CASNext(vb uint16, k string, exp int, state *CASState) bool { + if state.initialized { + if !state.Exists { + // Adding a new key: + if state.Value == nil { + state.Cas = 0 + return false // no-op (delete of non-existent value) + } + state.resp, state.Err = c.Add(vb, k, 0, exp, state.Value) + } else { + // Updating / deleting a key: + req := &gomemcached.MCRequest{ + Opcode: gomemcached.DELETE, + VBucket: vb, + Key: []byte(k), + Cas: state.Cas} + if state.Value != nil { + req.Opcode = gomemcached.SET + req.Opaque = 0 + req.Extras = []byte{0, 0, 0, 0, 0, 0, 0, 0} + req.Body = state.Value + + flags := 0 + binary.BigEndian.PutUint64(req.Extras, uint64(flags)<<32|uint64(exp)) + } + state.resp, state.Err = c.Send(req) + } + + // If the response status is KEY_EEXISTS or NOT_STORED there's a conflict and we'll need to + // get the new value (below). Otherwise, we're done (either success or failure) so return: + if !(state.resp != nil && (state.resp.Status == gomemcached.KEY_EEXISTS || + state.resp.Status == gomemcached.NOT_STORED)) { + state.Cas = state.resp.Cas + return false // either success or fatal error + } + } + + // Initial call, or after a conflict: GET the current value and CAS and return them: + state.initialized = true + if state.resp, state.Err = c.Get(vb, k); state.Err == nil { + state.Exists = true + state.Value = state.resp.Body + state.Cas = state.resp.Cas + } else if state.resp != nil && state.resp.Status == gomemcached.KEY_ENOENT { + state.Err = nil + state.Exists = false + state.Value = nil + state.Cas = 0 + } else { + return false // fatal error + } + return true // keep going... +} + +// CasFunc is type type of function to perform a CAS transform. +// +// Input is the current value, or nil if no value exists. +// The function should return the new value (if any) to set, and the store/quit/delete operation. +type CasFunc func(current []byte) ([]byte, CasOp) + +// CAS performs a CAS transform with the given function. +// +// If the value does not exist, a nil current value will be sent to f. +func (c *Client) CAS(vb uint16, k string, f CasFunc, + initexp int) (*gomemcached.MCResponse, error) { + var state CASState + for c.CASNext(vb, k, initexp, &state) { + newValue, operation := f(state.Value) + if operation == CASQuit || (operation == CASDelete && state.Value == nil) { + return nil, operation + } + state.Value = newValue + } + return state.resp, state.Err +} + +// StatValue is one of the stats returned from the Stats method. +type StatValue struct { + // The stat key + Key string + // The stat value + Val string +} + +// Stats requests server-side stats. +// +// Use "" as the stat key for toplevel stats. +func (c *Client) Stats(key string) ([]StatValue, error) { + rv := make([]StatValue, 0, 128) + + req := &gomemcached.MCRequest{ + Opcode: gomemcached.STAT, + Key: []byte(key), + Opaque: 918494, + } + + err := c.Transmit(req) + if err != nil { + return rv, err + } + + for { + res, _, err := getResponse(c.conn, c.hdrBuf) + if err != nil { + return rv, err + } + k := string(res.Key) + if k == "" { + break + } + rv = append(rv, StatValue{ + Key: k, + Val: string(res.Body), + }) + } + return rv, nil +} + +// StatsMap requests server-side stats similarly to Stats, but returns +// them as a map. +// +// Use "" as the stat key for toplevel stats. 
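+//
+// A small illustrative call (the "memory" stat group is just an example key):
+//
+//	memStats, err := client.StatsMap("memory")
+//	if err == nil {
+//		for k, v := range memStats {
+//			fmt.Printf("%s = %s\n", k, v)
+//		}
+//	}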
+func (c *Client) StatsMap(key string) (map[string]string, error) { + rv := make(map[string]string) + + req := &gomemcached.MCRequest{ + Opcode: gomemcached.STAT, + Key: []byte(key), + Opaque: 918494, + } + + err := c.Transmit(req) + if err != nil { + return rv, err + } + + for { + res, _, err := getResponse(c.conn, c.hdrBuf) + if err != nil { + return rv, err + } + k := string(res.Key) + if k == "" { + break + } + rv[k] = string(res.Body) + } + + return rv, nil +} + +// instead of returning a new statsMap, simply populate passed in statsMap, which contains all the keys +// for which stats needs to be retrieved +func (c *Client) StatsMapForSpecifiedStats(key string, statsMap map[string]string) error { + + // clear statsMap + for key, _ := range statsMap { + statsMap[key] = "" + } + + req := &gomemcached.MCRequest{ + Opcode: gomemcached.STAT, + Key: []byte(key), + Opaque: 918494, + } + + err := c.Transmit(req) + if err != nil { + return err + } + + for { + res, _, err := getResponse(c.conn, c.hdrBuf) + if err != nil { + return err + } + k := string(res.Key) + if k == "" { + break + } + if _, ok := statsMap[k]; ok { + statsMap[k] = string(res.Body) + } + } + + return nil +} + +// Hijack exposes the underlying connection from this client. +// +// It also marks the connection as unhealthy since the client will +// have lost control over the connection and can't otherwise verify +// things are in good shape for connection pools. +func (c *Client) Hijack() io.ReadWriteCloser { + c.setHealthy(false) + return c.conn +} + +func (c *Client) setHealthy(healthy bool) { + healthyState := UnHealthy + if healthy { + healthyState = Healthy + } + atomic.StoreUint32(&c.healthy, healthyState) +} + +func IfResStatusError(response *gomemcached.MCResponse) bool { + return response == nil || + (response.Status != gomemcached.SUBDOC_BAD_MULTI && + response.Status != gomemcached.SUBDOC_PATH_NOT_FOUND && + response.Status != gomemcached.SUBDOC_MULTI_PATH_FAILURE_DELETED) +} diff --git a/vendor/github.com/couchbase/gomemcached/client/tap_feed.go b/vendor/github.com/couchbase/gomemcached/client/tap_feed.go new file mode 100644 index 000000000000..fd628c5de2fb --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/client/tap_feed.go @@ -0,0 +1,333 @@ +package memcached + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "math" + + "github.com/couchbase/gomemcached" + "github.com/couchbase/goutils/logging" +) + +// TAP protocol docs: + +// TapOpcode is the tap operation type (found in TapEvent) +type TapOpcode uint8 + +// Tap opcode values. +const ( + TapBeginBackfill = TapOpcode(iota) + TapEndBackfill + TapMutation + TapDeletion + TapCheckpointStart + TapCheckpointEnd + tapEndStream +) + +const tapMutationExtraLen = 16 + +var tapOpcodeNames map[TapOpcode]string + +func init() { + tapOpcodeNames = map[TapOpcode]string{ + TapBeginBackfill: "BeginBackfill", + TapEndBackfill: "EndBackfill", + TapMutation: "Mutation", + TapDeletion: "Deletion", + TapCheckpointStart: "TapCheckpointStart", + TapCheckpointEnd: "TapCheckpointEnd", + tapEndStream: "EndStream", + } +} + +func (opcode TapOpcode) String() string { + name := tapOpcodeNames[opcode] + if name == "" { + name = fmt.Sprintf("#%d", opcode) + } + return name +} + +// TapEvent is a TAP notification of an operation on the server. 
+type TapEvent struct { + Opcode TapOpcode // Type of event + VBucket uint16 // VBucket this event applies to + Flags uint32 // Item flags + Expiry uint32 // Item expiration time + Key, Value []byte // Item key/value + Cas uint64 +} + +func makeTapEvent(req gomemcached.MCRequest) *TapEvent { + event := TapEvent{ + VBucket: req.VBucket, + } + switch req.Opcode { + case gomemcached.TAP_MUTATION: + event.Opcode = TapMutation + event.Key = req.Key + event.Value = req.Body + event.Cas = req.Cas + case gomemcached.TAP_DELETE: + event.Opcode = TapDeletion + event.Key = req.Key + event.Cas = req.Cas + case gomemcached.TAP_CHECKPOINT_START: + event.Opcode = TapCheckpointStart + case gomemcached.TAP_CHECKPOINT_END: + event.Opcode = TapCheckpointEnd + case gomemcached.TAP_OPAQUE: + if len(req.Extras) < 8+4 { + return nil + } + switch op := int(binary.BigEndian.Uint32(req.Extras[8:])); op { + case gomemcached.TAP_OPAQUE_INITIAL_VBUCKET_STREAM: + event.Opcode = TapBeginBackfill + case gomemcached.TAP_OPAQUE_CLOSE_BACKFILL: + event.Opcode = TapEndBackfill + case gomemcached.TAP_OPAQUE_CLOSE_TAP_STREAM: + event.Opcode = tapEndStream + case gomemcached.TAP_OPAQUE_ENABLE_AUTO_NACK: + return nil + case gomemcached.TAP_OPAQUE_ENABLE_CHECKPOINT_SYNC: + return nil + default: + logging.Infof("TapFeed: Ignoring TAP_OPAQUE/%d", op) + return nil // unknown opaque event + } + case gomemcached.NOOP: + return nil // ignore + default: + logging.Infof("TapFeed: Ignoring %s", req.Opcode) + return nil // unknown event + } + + if len(req.Extras) >= tapMutationExtraLen && + (event.Opcode == TapMutation || event.Opcode == TapDeletion) { + + event.Flags = binary.BigEndian.Uint32(req.Extras[8:]) + event.Expiry = binary.BigEndian.Uint32(req.Extras[12:]) + } + + return &event +} + +func (event TapEvent) String() string { + switch event.Opcode { + case TapBeginBackfill, TapEndBackfill, TapCheckpointStart, TapCheckpointEnd: + return fmt.Sprintf("<TapEvent %s, vbucket=%d>", + event.Opcode, event.VBucket) + default: + return fmt.Sprintf("<TapEvent %s, key=%q (%d bytes) flags=%x, exp=%d>", + event.Opcode, event.Key, len(event.Value), + event.Flags, event.Expiry) + } +} + +// TapArguments are parameters for requesting a TAP feed. +// +// Call DefaultTapArguments to get a default one. +type TapArguments struct { + // Timestamp of oldest item to send. + // + // Use TapNoBackfill to suppress all past items. + Backfill uint64 + // If set, server will disconnect after sending existing items. + Dump bool + // The indices of the vbuckets to watch; empty/nil to watch all. + VBuckets []uint16 + // Transfers ownership of vbuckets during cluster rebalance. + Takeover bool + // If true, server will wait for client ACK after every notification. + SupportAck bool + // If true, client doesn't want values so server shouldn't send them. + KeysOnly bool + // If true, client wants the server to send checkpoint events. + Checkpoint bool + // Optional identifier to use for this client, to allow reconnects + ClientName string + // Registers this client (by name) till explicitly deregistered. + RegisteredClient bool +} + +// Value for TapArguments.Backfill denoting that no past events at all +// should be sent. +const TapNoBackfill = math.MaxUint64 + +// DefaultTapArguments returns a default set of parameter values to +// pass to StartTapFeed.
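+//
+// A rough usage sketch (illustrative only; error handling elided):
+//
+//	args := memcached.DefaultTapArguments()
+//	feed, err := client.StartTapFeed(args)
+//	if err == nil {
+//		for event := range feed.C {
+//			// process the TapEvent
+//		}
+//	}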
+func DefaultTapArguments() TapArguments { + return TapArguments{ + Backfill: TapNoBackfill, + } +} + +func (args *TapArguments) flags() []byte { + var flags gomemcached.TapConnectFlag + if args.Backfill != 0 { + flags |= gomemcached.BACKFILL + } + if args.Dump { + flags |= gomemcached.DUMP + } + if len(args.VBuckets) > 0 { + flags |= gomemcached.LIST_VBUCKETS + } + if args.Takeover { + flags |= gomemcached.TAKEOVER_VBUCKETS + } + if args.SupportAck { + flags |= gomemcached.SUPPORT_ACK + } + if args.KeysOnly { + flags |= gomemcached.REQUEST_KEYS_ONLY + } + if args.Checkpoint { + flags |= gomemcached.CHECKPOINT + } + if args.RegisteredClient { + flags |= gomemcached.REGISTERED_CLIENT + } + encoded := make([]byte, 4) + binary.BigEndian.PutUint32(encoded, uint32(flags)) + return encoded +} + +func must(err error) { + if err != nil { + panic(err) + } +} + +func (args *TapArguments) bytes() (rv []byte) { + buf := bytes.NewBuffer([]byte{}) + + if args.Backfill > 0 { + must(binary.Write(buf, binary.BigEndian, uint64(args.Backfill))) + } + + if len(args.VBuckets) > 0 { + must(binary.Write(buf, binary.BigEndian, uint16(len(args.VBuckets)))) + for i := 0; i < len(args.VBuckets); i++ { + must(binary.Write(buf, binary.BigEndian, uint16(args.VBuckets[i]))) + } + } + return buf.Bytes() +} + +// TapFeed represents a stream of events from a server. +type TapFeed struct { + C <-chan TapEvent + Error error + closer chan bool +} + +// StartTapFeed starts a TAP feed on a client connection. +// +// The events can be read from the returned channel. The connection +// can no longer be used for other purposes; it's now reserved for +// receiving the TAP messages. To stop receiving events, close the +// client connection. +func (mc *Client) StartTapFeed(args TapArguments) (*TapFeed, error) { + rq := &gomemcached.MCRequest{ + Opcode: gomemcached.TAP_CONNECT, + Key: []byte(args.ClientName), + Extras: args.flags(), + Body: args.bytes()} + + err := mc.Transmit(rq) + if err != nil { + return nil, err + } + + ch := make(chan TapEvent) + feed := &TapFeed{ + C: ch, + closer: make(chan bool), + } + go mc.runFeed(ch, feed) + return feed, nil +} + +// TapRecvHook is called after every incoming tap packet is received. +var TapRecvHook func(*gomemcached.MCRequest, int, error) + +// Internal goroutine that reads from the socket and writes events to +// the channel +func (mc *Client) runFeed(ch chan TapEvent, feed *TapFeed) { + defer close(ch) + var headerBuf [gomemcached.HDR_LEN]byte +loop: + for { + // Read the next request from the server. + // + // (Can't call mc.Receive() because it reads a + // _response_ not a request.) + var pkt gomemcached.MCRequest + n, err := pkt.Receive(mc.conn, headerBuf[:]) + if TapRecvHook != nil { + TapRecvHook(&pkt, n, err) + } + + if err != nil { + if err != io.EOF { + feed.Error = err + } + break loop + } + + //logging.Infof("** TapFeed received %#v : %q", pkt, pkt.Body) + + if pkt.Opcode == gomemcached.TAP_CONNECT { + // This is not an event from the server; it's + // an error response to my connect request. 
+ feed.Error = fmt.Errorf("tap connection failed: %s", pkt.Body) + break loop + } + + event := makeTapEvent(pkt) + if event != nil { + if event.Opcode == tapEndStream { + break loop + } + + select { + case ch <- *event: + case <-feed.closer: + break loop + } + } + + if len(pkt.Extras) >= 4 { + reqFlags := binary.BigEndian.Uint16(pkt.Extras[2:]) + if reqFlags&gomemcached.TAP_ACK != 0 { + if _, err := mc.sendAck(&pkt); err != nil { + feed.Error = err + break loop + } + } + } + } + if err := mc.Close(); err != nil { + logging.Errorf("Error closing memcached client: %v", err) + } +} + +func (mc *Client) sendAck(pkt *gomemcached.MCRequest) (int, error) { + res := gomemcached.MCResponse{ + Opcode: pkt.Opcode, + Opaque: pkt.Opaque, + Status: gomemcached.SUCCESS, + } + return res.Transmit(mc.conn) +} + +// Close terminates a TapFeed. +// +// Call this if you stop using a TapFeed before its channel ends. +func (feed *TapFeed) Close() { + close(feed.closer) +} diff --git a/vendor/github.com/couchbase/gomemcached/client/transport.go b/vendor/github.com/couchbase/gomemcached/client/transport.go new file mode 100644 index 000000000000..f4cea17fcaa2 --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/client/transport.go @@ -0,0 +1,67 @@ +package memcached + +import ( + "errors" + "io" + + "github.com/couchbase/gomemcached" +) + +var errNoConn = errors.New("no connection") + +// UnwrapMemcachedError converts memcached errors to normal responses. +// +// If the error is a memcached response, declare the error to be nil +// so a client can handle the status without worrying about whether it +// indicates success or failure. +func UnwrapMemcachedError(rv *gomemcached.MCResponse, + err error) (*gomemcached.MCResponse, error) { + + if rv == err { + return rv, nil + } + return rv, err +} + +// ReceiveHook is called after every packet is received (or attempted to be) +var ReceiveHook func(*gomemcached.MCResponse, int, error) + +func getResponse(s io.Reader, hdrBytes []byte) (rv *gomemcached.MCResponse, n int, err error) { + if s == nil { + return nil, 0, errNoConn + } + + rv = &gomemcached.MCResponse{} + n, err = rv.Receive(s, hdrBytes) + + if ReceiveHook != nil { + ReceiveHook(rv, n, err) + } + + if err == nil && (rv.Status != gomemcached.SUCCESS && rv.Status != gomemcached.AUTH_CONTINUE) { + err = rv + } + return rv, n, err +} + +// TransmitHook is called after each packet is transmitted. +var TransmitHook func(*gomemcached.MCRequest, int, error) + +func transmitRequest(o io.Writer, req *gomemcached.MCRequest) (int, error) { + if o == nil { + return 0, errNoConn + } + n, err := req.Transmit(o) + if TransmitHook != nil { + TransmitHook(req, n, err) + } + return n, err +} + +func transmitResponse(o io.Writer, res *gomemcached.MCResponse) (int, error) { + if o == nil { + return 0, errNoConn + } + n, err := res.Transmit(o) + return n, err +} diff --git a/vendor/github.com/couchbase/gomemcached/client/upr_feed.go b/vendor/github.com/couchbase/gomemcached/client/upr_feed.go new file mode 100644 index 000000000000..dc737e6cc0fd --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/client/upr_feed.go @@ -0,0 +1,1005 @@ +// go implementation of upr client. +// See https://github.com/couchbaselabs/cbupr/blob/master/transport-spec.md +// TODO +// 1. 
Use a pool allocator to avoid garbage +package memcached + +import ( + "encoding/binary" + "errors" + "fmt" + "github.com/couchbase/gomemcached" + "github.com/couchbase/goutils/logging" + "strconv" + "sync" + "sync/atomic" +) + +const uprMutationExtraLen = 30 +const uprDeletetionExtraLen = 18 +const uprDeletetionWithDeletionTimeExtraLen = 21 +const uprSnapshotExtraLen = 20 +const bufferAckThreshold = 0.2 +const opaqueOpen = 0xBEAF0001 +const opaqueFailover = 0xDEADBEEF +const uprDefaultNoopInterval = 120 + +// Counter on top of opaqueOpen that others can draw from for open and control msgs +var opaqueOpenCtrlWell uint32 = opaqueOpen + +// UprEvent memcached events for UPR streams. +type UprEvent struct { + Opcode gomemcached.CommandCode // Type of event + Status gomemcached.Status // Response status + VBucket uint16 // VBucket this event applies to + DataType uint8 // data type + Opaque uint16 // 16 MSB of opaque + VBuuid uint64 // This field is set by downstream + Flags uint32 // Item flags + Expiry uint32 // Item expiration time + Key, Value []byte // Item key/value + OldValue []byte // TODO: TBD: old document value + Cas uint64 // CAS value of the item + Seqno uint64 // sequence number of the mutation + RevSeqno uint64 // rev sequence number : deletions + LockTime uint32 // Lock time + MetadataSize uint16 // Metadata size + SnapstartSeq uint64 // start sequence number of this snapshot + SnapendSeq uint64 // End sequence number of the snapshot + SnapshotType uint32 // 0: disk 1: memory + FailoverLog *FailoverLog // Failover log containing vvuid and sequnce number + Error error // Error value in case of a failure + ExtMeta []byte + AckSize uint32 // The number of bytes that can be Acked to DCP +} + +// UprStream is per stream data structure over an UPR Connection. 
+type UprStream struct { + Vbucket uint16 // Vbucket id + Vbuuid uint64 // vbucket uuid + StartSeq uint64 // start sequence number + EndSeq uint64 // end sequence number + connected bool +} + +const ( + CompressionTypeStartMarker = iota // also means invalid + CompressionTypeNone = iota + CompressionTypeSnappy = iota + CompressionTypeEndMarker = iota // also means invalid +) + +// kv_engine/include/mcbp/protocol/datatype.h +const ( + JSONDataType uint8 = 1 + SnappyDataType uint8 = 2 + XattrDataType uint8 = 4 +) + +type UprFeatures struct { + Xattribute bool + CompressionType int + IncludeDeletionTime bool +} + +/** + * Used to handle multiple concurrent calls UprRequestStream() by UprFeed clients + * It is expected that a client that calls UprRequestStream() more than once should issue + * different "opaque" (version) numbers + */ +type opaqueStreamMap map[uint16]*UprStream // opaque -> stream + +type vbStreamNegotiator struct { + vbHandshakeMap map[uint16]opaqueStreamMap // vbno -> opaqueStreamMap + mutex sync.RWMutex +} + +func (negotiator *vbStreamNegotiator) initialize() { + negotiator.mutex.Lock() + negotiator.vbHandshakeMap = make(map[uint16]opaqueStreamMap) + negotiator.mutex.Unlock() +} + +func (negotiator *vbStreamNegotiator) registerRequest(vbno, opaque uint16, vbuuid, startSequence, endSequence uint64) { + negotiator.mutex.Lock() + defer negotiator.mutex.Unlock() + + var osMap opaqueStreamMap + var ok bool + if osMap, ok = negotiator.vbHandshakeMap[vbno]; !ok { + osMap = make(opaqueStreamMap) + negotiator.vbHandshakeMap[vbno] = osMap + } + + if _, ok = osMap[opaque]; !ok { + osMap[opaque] = &UprStream{ + Vbucket: vbno, + Vbuuid: vbuuid, + StartSeq: startSequence, + EndSeq: endSequence, + } + } +} + +func (negotiator *vbStreamNegotiator) getStreamsCntFromMap(vbno uint16) int { + negotiator.mutex.RLock() + defer negotiator.mutex.RUnlock() + + osmap, ok := negotiator.vbHandshakeMap[vbno] + if !ok { + return 0 + } else { + return len(osmap) + } +} + +func (negotiator *vbStreamNegotiator) getStreamFromMap(vbno, opaque uint16) (*UprStream, error) { + negotiator.mutex.RLock() + defer negotiator.mutex.RUnlock() + + osmap, ok := negotiator.vbHandshakeMap[vbno] + if !ok { + return nil, fmt.Errorf("Error: stream for vb: %v does not exist", vbno) + } + + stream, ok := osmap[opaque] + if !ok { + return nil, fmt.Errorf("Error: stream for vb: %v opaque: %v does not exist", vbno, opaque) + } + return stream, nil +} + +func (negotiator *vbStreamNegotiator) deleteStreamFromMap(vbno, opaque uint16) { + negotiator.mutex.Lock() + defer negotiator.mutex.Unlock() + + osmap, ok := negotiator.vbHandshakeMap[vbno] + if !ok { + return + } + + delete(osmap, opaque) + if len(osmap) == 0 { + delete(negotiator.vbHandshakeMap, vbno) + } +} + +func (negotiator *vbStreamNegotiator) handleStreamRequest(feed *UprFeed, + headerBuf [gomemcached.HDR_LEN]byte, pktPtr *gomemcached.MCRequest, bytesReceivedFromDCP int, + response *gomemcached.MCResponse) (*UprEvent, error) { + var event *UprEvent + + if feed == nil || response == nil || pktPtr == nil { + return nil, errors.New("Invalid inputs") + } + + // Get Stream from negotiator map + vbno := vbOpaque(response.Opaque) + opaque := appOpaque(response.Opaque) + + stream, err := negotiator.getStreamFromMap(vbno, opaque) + if err != nil { + err = fmt.Errorf("Stream not found for vb %d appOpaque %v: %#v", vbno, appOpaque, *pktPtr) + logging.Errorf(err.Error()) + return nil, err + } + + status, rb, flog, err := handleStreamRequest(response, headerBuf[:]) + + if status == 
gomemcached.ROLLBACK { + event = makeUprEvent(*pktPtr, stream, bytesReceivedFromDCP) + event.Status = status + // rollback stream + logging.Infof("UPR_STREAMREQ with rollback %d for vb %d Failed: %v", rb, vbno, err) + negotiator.deleteStreamFromMap(vbno, opaque) + } else if status == gomemcached.SUCCESS { + event = makeUprEvent(*pktPtr, stream, bytesReceivedFromDCP) + event.Seqno = stream.StartSeq + event.FailoverLog = flog + event.Status = status + feed.activateStream(vbno, opaque, stream) + feed.negotiator.deleteStreamFromMap(vbno, opaque) + logging.Infof("UPR_STREAMREQ for vb %d successful", vbno) + + } else if err != nil { + logging.Errorf("UPR_STREAMREQ for vbucket %d erro %s", vbno, err.Error()) + event = &UprEvent{ + Opcode: gomemcached.UPR_STREAMREQ, + Status: status, + VBucket: vbno, + Error: err, + } + negotiator.deleteStreamFromMap(vbno, opaque) + } + return event, nil +} + +func (negotiator *vbStreamNegotiator) cleanUpVbStreams(vbno uint16) { + negotiator.mutex.Lock() + defer negotiator.mutex.Unlock() + + delete(negotiator.vbHandshakeMap, vbno) +} + +// UprFeed represents an UPR feed. A feed contains a connection to a single +// host and multiple vBuckets +type UprFeed struct { + // lock for feed.vbstreams + muVbstreams sync.RWMutex + // lock for feed.closed + muClosed sync.RWMutex + C <-chan *UprEvent // Exported channel for receiving UPR events + negotiator vbStreamNegotiator // Used for pre-vbstreams, concurrent vb stream negotiation + vbstreams map[uint16]*UprStream // official live vb->stream mapping + closer chan bool // closer + conn *Client // connection to UPR producer + Error error // error + bytesRead uint64 // total bytes read on this connection + toAckBytes uint32 // bytes client has read + maxAckBytes uint32 // Max buffer control ack bytes + stats UprStats // Stats for upr client + transmitCh chan *gomemcached.MCRequest // transmit command channel + transmitCl chan bool // closer channel for transmit go-routine + closed bool // flag indicating whether the feed has been closed + // flag indicating whether client of upr feed will send ack to upr feed + // if flag is true, upr feed will use ack from client to determine whether/when to send ack to DCP + // if flag is false, upr feed will track how many bytes it has sent to client + // and use that to determine whether/when to send ack to DCP + ackByClient bool +} + +// Exported interface - to allow for mocking +type UprFeedIface interface { + Close() + Closed() bool + CloseStream(vbno, opaqueMSB uint16) error + GetError() error + GetUprStats() *UprStats + ClientAck(event *UprEvent) error + GetUprEventCh() <-chan *UprEvent + StartFeed() error + StartFeedWithConfig(datachan_len int) error + UprOpen(name string, sequence uint32, bufSize uint32) error + UprOpenWithXATTR(name string, sequence uint32, bufSize uint32) error + UprOpenWithFeatures(name string, sequence uint32, bufSize uint32, features UprFeatures) (error, UprFeatures) + UprRequestStream(vbno, opaqueMSB uint16, flags uint32, vuuid, startSequence, endSequence, snapStart, snapEnd uint64) error +} + +type UprStats struct { + TotalBytes uint64 + TotalMutation uint64 + TotalBufferAckSent uint64 + TotalSnapShot uint64 +} + +// FailoverLog containing vvuid and sequnce number +type FailoverLog [][2]uint64 + +// error codes +var ErrorInvalidLog = errors.New("couchbase.errorInvalidLog") + +func (flogp *FailoverLog) Latest() (vbuuid, seqno uint64, err error) { + if flogp != nil { + flog := *flogp + latest := flog[len(flog)-1] + return latest[0], latest[1], nil + } + 
return vbuuid, seqno, ErrorInvalidLog +} + +func makeUprEvent(rq gomemcached.MCRequest, stream *UprStream, bytesReceivedFromDCP int) *UprEvent { + event := &UprEvent{ + Opcode: rq.Opcode, + VBucket: stream.Vbucket, + VBuuid: stream.Vbuuid, + Key: rq.Key, + Value: rq.Body, + Cas: rq.Cas, + ExtMeta: rq.ExtMeta, + DataType: rq.DataType, + } + + // set AckSize for events that need to be acked to DCP, + // i.e., events with CommandCodes that need to be buffered in DCP + if _, ok := gomemcached.BufferedCommandCodeMap[rq.Opcode]; ok { + event.AckSize = uint32(bytesReceivedFromDCP) + } + + // 16 LSBits are used by client library to encode vbucket number. + // 16 MSBits are left for application to multiplex on opaque value. + event.Opaque = appOpaque(rq.Opaque) + + if len(rq.Extras) >= uprMutationExtraLen && + event.Opcode == gomemcached.UPR_MUTATION { + + event.Seqno = binary.BigEndian.Uint64(rq.Extras[:8]) + event.RevSeqno = binary.BigEndian.Uint64(rq.Extras[8:16]) + event.Flags = binary.BigEndian.Uint32(rq.Extras[16:20]) + event.Expiry = binary.BigEndian.Uint32(rq.Extras[20:24]) + event.LockTime = binary.BigEndian.Uint32(rq.Extras[24:28]) + event.MetadataSize = binary.BigEndian.Uint16(rq.Extras[28:30]) + + } else if len(rq.Extras) >= uprDeletetionWithDeletionTimeExtraLen && + event.Opcode == gomemcached.UPR_DELETION { + + event.Seqno = binary.BigEndian.Uint64(rq.Extras[:8]) + event.RevSeqno = binary.BigEndian.Uint64(rq.Extras[8:16]) + event.Expiry = binary.BigEndian.Uint32(rq.Extras[16:20]) + + } else if len(rq.Extras) >= uprDeletetionExtraLen && + event.Opcode == gomemcached.UPR_DELETION || + event.Opcode == gomemcached.UPR_EXPIRATION { + + event.Seqno = binary.BigEndian.Uint64(rq.Extras[:8]) + event.RevSeqno = binary.BigEndian.Uint64(rq.Extras[8:16]) + event.MetadataSize = binary.BigEndian.Uint16(rq.Extras[16:18]) + + } else if len(rq.Extras) >= uprSnapshotExtraLen && + event.Opcode == gomemcached.UPR_SNAPSHOT { + + event.SnapstartSeq = binary.BigEndian.Uint64(rq.Extras[:8]) + event.SnapendSeq = binary.BigEndian.Uint64(rq.Extras[8:16]) + event.SnapshotType = binary.BigEndian.Uint32(rq.Extras[16:20]) + } + + return event +} + +func (event *UprEvent) String() string { + name := gomemcached.CommandNames[event.Opcode] + if name == "" { + name = fmt.Sprintf("#%d", event.Opcode) + } + return name +} + +func (event *UprEvent) IsSnappyDataType() bool { + return event.Opcode == gomemcached.UPR_MUTATION && (event.DataType&SnappyDataType > 0) +} + +func (feed *UprFeed) sendCommands(mc *Client) { + transmitCh := feed.transmitCh + transmitCl := feed.transmitCl +loop: + for { + select { + case command := <-transmitCh: + if err := mc.Transmit(command); err != nil { + logging.Errorf("Failed to transmit command %s. 
Error %s", command.Opcode.String(), err.Error()) + // get feed to close and runFeed routine to exit + feed.Close() + break loop + } + + case <-transmitCl: + break loop + } + } + + // After sendCommands exits, write to transmitCh will block forever + // when we write to transmitCh, e.g., at CloseStream(), we need to check feed closure to have an exit route + + logging.Infof("sendCommands exiting") +} + +// Sets the specified stream as the connected stream for this vbno, and also cleans up negotiator +func (feed *UprFeed) activateStream(vbno, opaque uint16, stream *UprStream) error { + feed.muVbstreams.Lock() + defer feed.muVbstreams.Unlock() + + // Set this stream as the officially connected stream for this vb + stream.connected = true + feed.vbstreams[vbno] = stream + return nil +} + +func (feed *UprFeed) cleanUpVbStream(vbno uint16) { + feed.muVbstreams.Lock() + defer feed.muVbstreams.Unlock() + + delete(feed.vbstreams, vbno) +} + +// NewUprFeed creates a new UPR Feed. +// TODO: Describe side-effects on bucket instance and its connection pool. +func (mc *Client) NewUprFeed() (*UprFeed, error) { + return mc.NewUprFeedWithConfig(false /*ackByClient*/) +} + +func (mc *Client) NewUprFeedWithConfig(ackByClient bool) (*UprFeed, error) { + + feed := &UprFeed{ + conn: mc, + closer: make(chan bool, 1), + vbstreams: make(map[uint16]*UprStream), + transmitCh: make(chan *gomemcached.MCRequest), + transmitCl: make(chan bool), + ackByClient: ackByClient, + } + + feed.negotiator.initialize() + + go feed.sendCommands(mc) + return feed, nil +} + +func (mc *Client) NewUprFeedIface() (UprFeedIface, error) { + return mc.NewUprFeed() +} + +func (mc *Client) NewUprFeedWithConfigIface(ackByClient bool) (UprFeedIface, error) { + return mc.NewUprFeedWithConfig(ackByClient) +} + +func doUprOpen(mc *Client, name string, sequence uint32, features UprFeatures) error { + rq := &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_OPEN, + Key: []byte(name), + Opaque: getUprOpenCtrlOpaque(), + } + + rq.Extras = make([]byte, 8) + binary.BigEndian.PutUint32(rq.Extras[:4], sequence) + + // opens a producer type connection + flags := gomemcached.DCP_PRODUCER + if features.Xattribute { + flags = flags | gomemcached.DCP_OPEN_INCLUDE_XATTRS + } + if features.IncludeDeletionTime { + flags = flags | gomemcached.DCP_OPEN_INCLUDE_DELETE_TIMES + } + binary.BigEndian.PutUint32(rq.Extras[4:], flags) + + return sendMcRequestSync(mc, rq) +} + +// Synchronously send a memcached request and wait for the response +func sendMcRequestSync(mc *Client, req *gomemcached.MCRequest) error { + if err := mc.Transmit(req); err != nil { + return err + } + + if res, err := mc.Receive(); err != nil { + return err + } else if req.Opcode != res.Opcode { + return fmt.Errorf("unexpected #opcode sent %v received %v", req.Opcode, res.Opaque) + } else if req.Opaque != res.Opaque { + return fmt.Errorf("opaque mismatch, sent %v received %v", req.Opaque, res.Opaque) + } else if res.Status != gomemcached.SUCCESS { + return fmt.Errorf("error %v", res.Status) + } + return nil +} + +// UprOpen to connect with a UPR producer. +// Name: name of te UPR connection +// sequence: sequence number for the connection +// bufsize: max size of the application +func (feed *UprFeed) UprOpen(name string, sequence uint32, bufSize uint32) error { + var allFeaturesDisabled UprFeatures + err, _ := feed.uprOpen(name, sequence, bufSize, allFeaturesDisabled) + return err +} + +// UprOpen with XATTR enabled. 
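An editorial aside, not part of the vendored file: doUprOpen above packs the connection sequence number and a flags word into eight extras bytes, starting from DCP_PRODUCER and OR-ing in the xattr and delete-time bits when the corresponding UprFeatures are set. A small illustrative restatement of that packing; the helper name is mine, not part of the package:

package main

import (
	"encoding/binary"
	"fmt"

	"github.com/couchbase/gomemcached"
)

// openExtras mirrors how doUprOpen fills the UPR_OPEN extras:
// bytes 0-3 carry the sequence, bytes 4-7 the producer/feature flags.
func openExtras(sequence uint32, xattrs, deleteTimes bool) []byte {
	flags := gomemcached.DCP_PRODUCER
	if xattrs {
		flags |= gomemcached.DCP_OPEN_INCLUDE_XATTRS
	}
	if deleteTimes {
		flags |= gomemcached.DCP_OPEN_INCLUDE_DELETE_TIMES
	}
	extras := make([]byte, 8)
	binary.BigEndian.PutUint32(extras[:4], sequence)
	binary.BigEndian.PutUint32(extras[4:], flags)
	return extras
}

func main() {
	fmt.Printf("%x\n", openExtras(1, true, false)) // prints 0000000100000005
}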
+func (feed *UprFeed) UprOpenWithXATTR(name string, sequence uint32, bufSize uint32) error { + var onlyXattrEnabled UprFeatures + onlyXattrEnabled.Xattribute = true + err, _ := feed.uprOpen(name, sequence, bufSize, onlyXattrEnabled) + return err +} + +func (feed *UprFeed) UprOpenWithFeatures(name string, sequence uint32, bufSize uint32, features UprFeatures) (error, UprFeatures) { + return feed.uprOpen(name, sequence, bufSize, features) +} + +func (feed *UprFeed) uprOpen(name string, sequence uint32, bufSize uint32, features UprFeatures) (err error, activatedFeatures UprFeatures) { + mc := feed.conn + + // First set this to an invalid value to state that the method hasn't gotten to executing this control yet + activatedFeatures.CompressionType = CompressionTypeEndMarker + + if err = doUprOpen(mc, name, sequence, features); err != nil { + return + } + + activatedFeatures.Xattribute = features.Xattribute + + // send a UPR control message to set the window size for the this connection + if bufSize > 0 { + rq := &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_CONTROL, + Key: []byte("connection_buffer_size"), + Body: []byte(strconv.Itoa(int(bufSize))), + Opaque: getUprOpenCtrlOpaque(), + } + err = sendMcRequestSync(feed.conn, rq) + if err != nil { + return + } + feed.maxAckBytes = uint32(bufferAckThreshold * float32(bufSize)) + } + + // enable noop and set noop interval + rq := &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_CONTROL, + Key: []byte("enable_noop"), + Body: []byte("true"), + Opaque: getUprOpenCtrlOpaque(), + } + err = sendMcRequestSync(feed.conn, rq) + if err != nil { + return + } + + rq = &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_CONTROL, + Key: []byte("set_noop_interval"), + Body: []byte(strconv.Itoa(int(uprDefaultNoopInterval))), + Opaque: getUprOpenCtrlOpaque(), + } + err = sendMcRequestSync(feed.conn, rq) + if err != nil { + return + } + + if features.CompressionType == CompressionTypeSnappy { + activatedFeatures.CompressionType = CompressionTypeNone + rq = &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_CONTROL, + Key: []byte("force_value_compression"), + Body: []byte("true"), + Opaque: getUprOpenCtrlOpaque(), + } + err = sendMcRequestSync(feed.conn, rq) + } else if features.CompressionType == CompressionTypeEndMarker { + err = fmt.Errorf("UPR_CONTROL Failed - Invalid CompressionType: %v", features.CompressionType) + } + if err != nil { + return + } + activatedFeatures.CompressionType = features.CompressionType + + return +} + +// UprGetFailoverLog for given list of vbuckets. 
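As an illustration outside the patch itself: the open sequence in uprOpen above negotiates the flow-control buffer size, enables noops and optionally asks the producer to snappy-compress values. Below is a hedged usage sketch driving that through the exported UprOpenWithFeatures API; it assumes an already connected and authenticated *memcached.Client, and the function name is illustrative:

package example

import (
	memcached "github.com/couchbase/gomemcached/client"
)

// openExampleFeed is an illustrative sketch: open a client-acked feed
// with xattrs and snappy compression requested, then start delivery.
func openExampleFeed(mc *memcached.Client) (*memcached.UprFeed, error) {
	feed, err := mc.NewUprFeedWithConfig(true /*ackByClient*/)
	if err != nil {
		return nil, err
	}
	features := memcached.UprFeatures{
		Xattribute:      true,
		CompressionType: memcached.CompressionTypeSnappy,
	}
	err, activated := feed.UprOpenWithFeatures("example-feed", 0 /*sequence*/, 1024*1024 /*bufSize*/, features)
	if err != nil {
		return nil, err
	}
	if activated.CompressionType != memcached.CompressionTypeSnappy {
		// The producer declined force_value_compression; values arrive uncompressed.
	}
	if err := feed.StartFeed(); err != nil {
		return nil, err
	}
	return feed, nil
}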
+func (mc *Client) UprGetFailoverLog( + vb []uint16) (map[uint16]*FailoverLog, error) { + + rq := &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_FAILOVERLOG, + Opaque: opaqueFailover, + } + + var allFeaturesDisabled UprFeatures + if err := doUprOpen(mc, "FailoverLog", 0, allFeaturesDisabled); err != nil { + return nil, fmt.Errorf("UPR_OPEN Failed %s", err.Error()) + } + + failoverLogs := make(map[uint16]*FailoverLog) + for _, vBucket := range vb { + rq.VBucket = vBucket + if err := mc.Transmit(rq); err != nil { + return nil, err + } + res, err := mc.Receive() + + if err != nil { + return nil, fmt.Errorf("failed to receive %s", err.Error()) + } else if res.Opcode != gomemcached.UPR_FAILOVERLOG || res.Status != gomemcached.SUCCESS { + return nil, fmt.Errorf("unexpected #opcode %v", res.Opcode) + } + + flog, err := parseFailoverLog(res.Body) + if err != nil { + return nil, fmt.Errorf("unable to parse failover logs for vb %d", vb) + } + failoverLogs[vBucket] = flog + } + + return failoverLogs, nil +} + +// UprRequestStream for a single vbucket. +func (feed *UprFeed) UprRequestStream(vbno, opaqueMSB uint16, flags uint32, + vuuid, startSequence, endSequence, snapStart, snapEnd uint64) error { + + rq := &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_STREAMREQ, + VBucket: vbno, + Opaque: composeOpaque(vbno, opaqueMSB), + } + + rq.Extras = make([]byte, 48) // #Extras + binary.BigEndian.PutUint32(rq.Extras[:4], flags) + binary.BigEndian.PutUint32(rq.Extras[4:8], uint32(0)) + binary.BigEndian.PutUint64(rq.Extras[8:16], startSequence) + binary.BigEndian.PutUint64(rq.Extras[16:24], endSequence) + binary.BigEndian.PutUint64(rq.Extras[24:32], vuuid) + binary.BigEndian.PutUint64(rq.Extras[32:40], snapStart) + binary.BigEndian.PutUint64(rq.Extras[40:48], snapEnd) + + feed.negotiator.registerRequest(vbno, opaqueMSB, vuuid, startSequence, endSequence) + // Any client that has ever called this method, regardless of return code, + // should expect a potential UPR_CLOSESTREAM message due to this new map entry prior to Transmit. + + if err := feed.conn.Transmit(rq); err != nil { + logging.Errorf("Error in StreamRequest %s", err.Error()) + // If an error occurs during transmit, then the UPRFeed will keep the stream + // in the vbstreams map. This is to prevent nil lookup from any previously + // sent stream requests. + return err + } + + return nil +} + +// CloseStream for specified vbucket. 
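A further editorial sketch, not part of the vendored file: UprRequestStream above registers the request with the negotiator and then transmits a 48-byte extras block (flags, a reserved word, start and end sequence numbers, vbucket UUID and the snapshot range). The example below requests full streams for a set of vbuckets on an opened, started feed; the helper name and the choice of vbuuid 0 are illustrative, and real consumers usually take the UUID from UprGetFailoverLog:

package example

import (
	"math"

	memcached "github.com/couchbase/gomemcached/client"
)

// streamEverything asks the producer for the full history of a few
// vbuckets. opaqueMSB is the application half of the 32-bit opaque; the
// low 16 bits always carry the vbucket number (see composeOpaque).
func streamEverything(feed *memcached.UprFeed, vbuckets []uint16) error {
	for i, vbno := range vbuckets {
		opaqueMSB := uint16(i + 1) // any per-stream token the caller likes
		if err := feed.UprRequestStream(
			vbno, opaqueMSB,
			0,              // flags
			0,              // vbuuid: 0 for simplicity; usually from the failover log
			0,              // start seqno
			math.MaxUint64, // end seqno: keep streaming indefinitely
			0, 0,           // snapshot start/end
		); err != nil {
			return err
		}
	}
	return nil
}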
+func (feed *UprFeed) CloseStream(vbno, opaqueMSB uint16) error { + + err := feed.validateCloseStream(vbno) + if err != nil { + logging.Infof("CloseStream for %v has been skipped because of error %v", vbno, err) + return err + } + + closeStream := &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_CLOSESTREAM, + VBucket: vbno, + Opaque: composeOpaque(vbno, opaqueMSB), + } + + feed.writeToTransmitCh(closeStream) + + return nil +} + +func (feed *UprFeed) GetUprEventCh() <-chan *UprEvent { + return feed.C +} + +func (feed *UprFeed) GetError() error { + return feed.Error +} + +func (feed *UprFeed) validateCloseStream(vbno uint16) error { + feed.muVbstreams.RLock() + nilVbStream := feed.vbstreams[vbno] == nil + feed.muVbstreams.RUnlock() + + if nilVbStream && (feed.negotiator.getStreamsCntFromMap(vbno) == 0) { + return fmt.Errorf("Stream for vb %d has not been requested", vbno) + } + + return nil +} + +func (feed *UprFeed) writeToTransmitCh(rq *gomemcached.MCRequest) error { + // write to transmitCh may block forever if sendCommands has exited + // check for feed closure to have an exit route in this case + select { + case <-feed.closer: + errMsg := fmt.Sprintf("Abort sending request to transmitCh because feed has been closed. request=%v", rq) + logging.Infof(errMsg) + return errors.New(errMsg) + case feed.transmitCh <- rq: + } + return nil +} + +// StartFeed to start the upper feed. +func (feed *UprFeed) StartFeed() error { + return feed.StartFeedWithConfig(10) +} + +func (feed *UprFeed) StartFeedWithConfig(datachan_len int) error { + ch := make(chan *UprEvent, datachan_len) + feed.C = ch + go feed.runFeed(ch) + return nil +} + +func parseFailoverLog(body []byte) (*FailoverLog, error) { + + if len(body)%16 != 0 { + err := fmt.Errorf("invalid body length %v, in failover-log", len(body)) + return nil, err + } + log := make(FailoverLog, len(body)/16) + for i, j := 0, 0; i < len(body); i += 16 { + vuuid := binary.BigEndian.Uint64(body[i : i+8]) + seqno := binary.BigEndian.Uint64(body[i+8 : i+16]) + log[j] = [2]uint64{vuuid, seqno} + j++ + } + return &log, nil +} + +func handleStreamRequest( + res *gomemcached.MCResponse, + headerBuf []byte, +) (gomemcached.Status, uint64, *FailoverLog, error) { + + var rollback uint64 + var err error + + switch { + case res.Status == gomemcached.ROLLBACK: + logging.Infof("Rollback response. body=%v, headerBuf=%v\n", res.Body, headerBuf) + rollback = binary.BigEndian.Uint64(res.Body) + logging.Infof("Rollback seqno is %v for response with opaque %v\n", rollback, res.Opaque) + return res.Status, rollback, nil, nil + + case res.Status != gomemcached.SUCCESS: + err = fmt.Errorf("unexpected status %v for response with opaque %v", res.Status, res.Opaque) + return res.Status, 0, nil, err + } + + flog, err := parseFailoverLog(res.Body[:]) + return res.Status, rollback, flog, err +} + +// generate stream end responses for all active vb streams +func (feed *UprFeed) doStreamClose(ch chan *UprEvent) { + feed.muVbstreams.RLock() + + uprEvents := make([]*UprEvent, len(feed.vbstreams)) + index := 0 + for vbno, stream := range feed.vbstreams { + uprEvent := &UprEvent{ + VBucket: vbno, + VBuuid: stream.Vbuuid, + Opcode: gomemcached.UPR_STREAMEND, + } + uprEvents[index] = uprEvent + index++ + } + + // release the lock before sending uprEvents to ch, which may block + feed.muVbstreams.RUnlock() + +loop: + for _, uprEvent := range uprEvents { + select { + case ch <- uprEvent: + case <-feed.closer: + logging.Infof("Feed has been closed. 
Aborting doStreamClose.") + break loop + } + } +} + +func (feed *UprFeed) runFeed(ch chan *UprEvent) { + defer close(ch) + var headerBuf [gomemcached.HDR_LEN]byte + var pkt gomemcached.MCRequest + var event *UprEvent + + mc := feed.conn.Hijack() + uprStats := &feed.stats + +loop: + for { + select { + case <-feed.closer: + logging.Infof("Feed has been closed. Exiting.") + break loop + default: + bytes, err := pkt.Receive(mc, headerBuf[:]) + if err != nil { + logging.Errorf("Error in receive %s", err.Error()) + feed.Error = err + // send all the stream close messages to the client + feed.doStreamClose(ch) + break loop + } else { + event = nil + res := &gomemcached.MCResponse{ + Opcode: pkt.Opcode, + Cas: pkt.Cas, + Opaque: pkt.Opaque, + Status: gomemcached.Status(pkt.VBucket), + Extras: pkt.Extras, + Key: pkt.Key, + Body: pkt.Body, + } + + vb := vbOpaque(pkt.Opaque) + appOpaque := appOpaque(pkt.Opaque) + uprStats.TotalBytes = uint64(bytes) + + feed.muVbstreams.RLock() + stream := feed.vbstreams[vb] + feed.muVbstreams.RUnlock() + + switch pkt.Opcode { + case gomemcached.UPR_STREAMREQ: + event, err = feed.negotiator.handleStreamRequest(feed, headerBuf, &pkt, bytes, res) + if err != nil { + logging.Infof(err.Error()) + break loop + } + case gomemcached.UPR_MUTATION, + gomemcached.UPR_DELETION, + gomemcached.UPR_EXPIRATION: + if stream == nil { + logging.Infof("Stream not found for vb %d: %#v", vb, pkt) + break loop + } + event = makeUprEvent(pkt, stream, bytes) + uprStats.TotalMutation++ + + case gomemcached.UPR_STREAMEND: + if stream == nil { + logging.Infof("Stream not found for vb %d: %#v", vb, pkt) + break loop + } + //stream has ended + event = makeUprEvent(pkt, stream, bytes) + logging.Infof("Stream Ended for vb %d", vb) + + feed.negotiator.deleteStreamFromMap(vb, appOpaque) + feed.cleanUpVbStream(vb) + + case gomemcached.UPR_SNAPSHOT: + if stream == nil { + logging.Infof("Stream not found for vb %d: %#v", vb, pkt) + break loop + } + // snapshot marker + event = makeUprEvent(pkt, stream, bytes) + uprStats.TotalSnapShot++ + + case gomemcached.UPR_FLUSH: + if stream == nil { + logging.Infof("Stream not found for vb %d: %#v", vb, pkt) + break loop + } + // special processing for flush ? + event = makeUprEvent(pkt, stream, bytes) + + case gomemcached.UPR_CLOSESTREAM: + if stream == nil { + logging.Infof("Stream not found for vb %d: %#v", vb, pkt) + break loop + } + event = makeUprEvent(pkt, stream, bytes) + event.Opcode = gomemcached.UPR_STREAMEND // opcode re-write !! + logging.Infof("Stream Closed for vb %d StreamEnd simulated", vb) + + feed.negotiator.deleteStreamFromMap(vb, appOpaque) + feed.cleanUpVbStream(vb) + + case gomemcached.UPR_ADDSTREAM: + logging.Infof("Opcode %v not implemented", pkt.Opcode) + + case gomemcached.UPR_CONTROL, gomemcached.UPR_BUFFERACK: + if res.Status != gomemcached.SUCCESS { + logging.Infof("Opcode %v received status %d", pkt.Opcode.String(), res.Status) + } + + case gomemcached.UPR_NOOP: + // send a NOOP back + noop := &gomemcached.MCResponse{ + Opcode: gomemcached.UPR_NOOP, + Opaque: pkt.Opaque, + } + + if err := feed.conn.TransmitResponse(noop); err != nil { + logging.Warnf("failed to transmit command %s. Error %s", noop.Opcode.String(), err.Error()) + } + default: + logging.Infof("Recived an unknown response for vbucket %d", vb) + } + } + + if event != nil { + select { + case ch <- event: + case <-feed.closer: + logging.Infof("Feed has been closed. Skip sending events. 
Exiting.") + break loop + } + + feed.muVbstreams.RLock() + l := len(feed.vbstreams) + feed.muVbstreams.RUnlock() + + if event.Opcode == gomemcached.UPR_CLOSESTREAM && l == 0 { + logging.Infof("No more streams") + } + } + + if !feed.ackByClient { + // if client does not ack, do the ack check now + feed.sendBufferAckIfNeeded(event) + } + } + } + + // make sure that feed is closed before we signal transmitCl and exit runFeed + feed.Close() + + close(feed.transmitCl) + logging.Infof("runFeed exiting") +} + +// Client, after completing processing of an UprEvent, need to call this API to notify UprFeed, +// so that UprFeed can update its ack bytes stats and send ack to DCP if needed +// Client needs to set ackByClient flag to true in NewUprFeedWithConfig() call as a prerequisite for this call to work +// This API is not thread safe. Caller should NOT have more than one go rountine calling this API +func (feed *UprFeed) ClientAck(event *UprEvent) error { + if !feed.ackByClient { + return errors.New("Upr feed does not have ackByclient flag set") + } + feed.sendBufferAckIfNeeded(event) + return nil +} + +// increment ack bytes if the event needs to be acked to DCP +// send buffer ack if enough ack bytes have been accumulated +func (feed *UprFeed) sendBufferAckIfNeeded(event *UprEvent) { + if event == nil || event.AckSize == 0 { + // this indicates that there is no need to ack to DCP + return + } + + totalBytes := feed.toAckBytes + event.AckSize + if totalBytes > feed.maxAckBytes { + feed.toAckBytes = 0 + feed.sendBufferAck(totalBytes) + } else { + feed.toAckBytes = totalBytes + } +} + +// send buffer ack to dcp +func (feed *UprFeed) sendBufferAck(sendSize uint32) { + bufferAck := &gomemcached.MCRequest{ + Opcode: gomemcached.UPR_BUFFERACK, + } + bufferAck.Extras = make([]byte, 4) + binary.BigEndian.PutUint32(bufferAck.Extras[:4], uint32(sendSize)) + feed.writeToTransmitCh(bufferAck) + feed.stats.TotalBufferAckSent++ +} + +func (feed *UprFeed) GetUprStats() *UprStats { + return &feed.stats +} + +func composeOpaque(vbno, opaqueMSB uint16) uint32 { + return (uint32(opaqueMSB) << 16) | uint32(vbno) +} + +func getUprOpenCtrlOpaque() uint32 { + return atomic.AddUint32(&opaqueOpenCtrlWell, 1) +} + +func appOpaque(opq32 uint32) uint16 { + return uint16((opq32 & 0xFFFF0000) >> 16) +} + +func vbOpaque(opq32 uint32) uint16 { + return uint16(opq32 & 0xFFFF) +} + +// Close this UprFeed. +func (feed *UprFeed) Close() { + feed.muClosed.Lock() + defer feed.muClosed.Unlock() + if !feed.closed { + close(feed.closer) + feed.closed = true + feed.negotiator.initialize() + } +} + +// check if the UprFeed has been closed +func (feed *UprFeed) Closed() bool { + feed.muClosed.RLock() + defer feed.muClosed.RUnlock() + return feed.closed +} diff --git a/vendor/github.com/couchbase/gomemcached/mc_constants.go b/vendor/github.com/couchbase/gomemcached/mc_constants.go new file mode 100644 index 000000000000..1d5027d16c0e --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/mc_constants.go @@ -0,0 +1,335 @@ +// Package gomemcached is binary protocol packet formats and constants. +package gomemcached + +import ( + "fmt" +) + +const ( + REQ_MAGIC = 0x80 + RES_MAGIC = 0x81 +) + +// CommandCode for memcached packets. 
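For reference (not part of the vendored file): the opaque helpers near the end of upr_feed.go put the vbucket number in the low 16 bits of the wire opaque and leave the high 16 bits to the application, which is how runFeed demultiplexes responses. They are unexported, so a consumer that wants the same mapping has to restate it; a minimal sketch:

package example

// The vendored feed packs opaques as (application token << 16) | vbucket.
// These helpers restate that layout for use outside the package.
func composeOpaque(vbno, appToken uint16) uint32 {
	return uint32(appToken)<<16 | uint32(vbno)
}

func splitOpaque(opaque uint32) (vbno, appToken uint16) {
	return uint16(opaque & 0xFFFF), uint16(opaque >> 16)
}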
+type CommandCode uint8 + +const ( + GET = CommandCode(0x00) + SET = CommandCode(0x01) + ADD = CommandCode(0x02) + REPLACE = CommandCode(0x03) + DELETE = CommandCode(0x04) + INCREMENT = CommandCode(0x05) + DECREMENT = CommandCode(0x06) + QUIT = CommandCode(0x07) + FLUSH = CommandCode(0x08) + GETQ = CommandCode(0x09) + NOOP = CommandCode(0x0a) + VERSION = CommandCode(0x0b) + GETK = CommandCode(0x0c) + GETKQ = CommandCode(0x0d) + APPEND = CommandCode(0x0e) + PREPEND = CommandCode(0x0f) + STAT = CommandCode(0x10) + SETQ = CommandCode(0x11) + ADDQ = CommandCode(0x12) + REPLACEQ = CommandCode(0x13) + DELETEQ = CommandCode(0x14) + INCREMENTQ = CommandCode(0x15) + DECREMENTQ = CommandCode(0x16) + QUITQ = CommandCode(0x17) + FLUSHQ = CommandCode(0x18) + APPENDQ = CommandCode(0x19) + AUDIT = CommandCode(0x27) + PREPENDQ = CommandCode(0x1a) + GAT = CommandCode(0x1d) + HELLO = CommandCode(0x1f) + RGET = CommandCode(0x30) + RSET = CommandCode(0x31) + RSETQ = CommandCode(0x32) + RAPPEND = CommandCode(0x33) + RAPPENDQ = CommandCode(0x34) + RPREPEND = CommandCode(0x35) + RPREPENDQ = CommandCode(0x36) + RDELETE = CommandCode(0x37) + RDELETEQ = CommandCode(0x38) + RINCR = CommandCode(0x39) + RINCRQ = CommandCode(0x3a) + RDECR = CommandCode(0x3b) + RDECRQ = CommandCode(0x3c) + + SASL_LIST_MECHS = CommandCode(0x20) + SASL_AUTH = CommandCode(0x21) + SASL_STEP = CommandCode(0x22) + + SET_VBUCKET = CommandCode(0x3d) + + TAP_CONNECT = CommandCode(0x40) // Client-sent request to initiate Tap feed + TAP_MUTATION = CommandCode(0x41) // Notification of a SET/ADD/REPLACE/etc. on the server + TAP_DELETE = CommandCode(0x42) // Notification of a DELETE on the server + TAP_FLUSH = CommandCode(0x43) // Replicates a flush_all command + TAP_OPAQUE = CommandCode(0x44) // Opaque control data from the engine + TAP_VBUCKET_SET = CommandCode(0x45) // Sets state of vbucket in receiver (used in takeover) + TAP_CHECKPOINT_START = CommandCode(0x46) // Notifies start of new checkpoint + TAP_CHECKPOINT_END = CommandCode(0x47) // Notifies end of checkpoint + + UPR_OPEN = CommandCode(0x50) // Open a UPR connection with a name + UPR_ADDSTREAM = CommandCode(0x51) // Sent by ebucketMigrator to UPR Consumer + UPR_CLOSESTREAM = CommandCode(0x52) // Sent by eBucketMigrator to UPR Consumer + UPR_FAILOVERLOG = CommandCode(0x54) // Request failover logs + UPR_STREAMREQ = CommandCode(0x53) // Stream request from consumer to producer + UPR_STREAMEND = CommandCode(0x55) // Sent by producer when it has no more messages to stream + UPR_SNAPSHOT = CommandCode(0x56) // Start of a new snapshot + UPR_MUTATION = CommandCode(0x57) // Key mutation + UPR_DELETION = CommandCode(0x58) // Key deletion + UPR_EXPIRATION = CommandCode(0x59) // Key expiration + UPR_FLUSH = CommandCode(0x5a) // Delete all the data for a vbucket + UPR_NOOP = CommandCode(0x5c) // UPR NOOP + UPR_BUFFERACK = CommandCode(0x5d) // UPR Buffer Acknowledgement + UPR_CONTROL = CommandCode(0x5e) // Set flow control params + + SELECT_BUCKET = CommandCode(0x89) // Select bucket + + OBSERVE_SEQNO = CommandCode(0x91) // Sequence Number based Observe + OBSERVE = CommandCode(0x92) + + GET_META = CommandCode(0xA0) // Get meta. returns with expiry, flags, cas etc + SUBDOC_GET = CommandCode(0xc5) // Get subdoc. Returns with xattrs + SUBDOC_MULTI_LOOKUP = CommandCode(0xd0) // Multi lookup. Doc xattrs and meta. 
+) + +// command codes that are counted toward DCP control buffer +// when DCP clients receive DCP messages with these command codes, they need to provide acknowledgement +var BufferedCommandCodeMap = map[CommandCode]bool{ + SET_VBUCKET: true, + UPR_STREAMEND: true, + UPR_SNAPSHOT: true, + UPR_MUTATION: true, + UPR_DELETION: true, + UPR_EXPIRATION: true} + +// Status field for memcached response. +type Status uint16 + +// Matches with protocol_binary.h as source of truth +const ( + SUCCESS = Status(0x00) + KEY_ENOENT = Status(0x01) + KEY_EEXISTS = Status(0x02) + E2BIG = Status(0x03) + EINVAL = Status(0x04) + NOT_STORED = Status(0x05) + DELTA_BADVAL = Status(0x06) + NOT_MY_VBUCKET = Status(0x07) + NO_BUCKET = Status(0x08) + LOCKED = Status(0x09) + AUTH_STALE = Status(0x1f) + AUTH_ERROR = Status(0x20) + AUTH_CONTINUE = Status(0x21) + ERANGE = Status(0x22) + ROLLBACK = Status(0x23) + EACCESS = Status(0x24) + NOT_INITIALIZED = Status(0x25) + UNKNOWN_COMMAND = Status(0x81) + ENOMEM = Status(0x82) + NOT_SUPPORTED = Status(0x83) + EINTERNAL = Status(0x84) + EBUSY = Status(0x85) + TMPFAIL = Status(0x86) + + // SUBDOC + SUBDOC_PATH_NOT_FOUND = Status(0xc0) + SUBDOC_BAD_MULTI = Status(0xcc) + SUBDOC_MULTI_PATH_FAILURE_DELETED = Status(0xd3) +) + +// for log redaction +const ( + UdTagBegin = "" + UdTagEnd = "" +) + +var isFatal = map[Status]bool{ + DELTA_BADVAL: true, + NO_BUCKET: true, + AUTH_STALE: true, + AUTH_ERROR: true, + ERANGE: true, + ROLLBACK: true, + EACCESS: true, + ENOMEM: true, + NOT_SUPPORTED: true, +} + +// the producer/consumer bit in dcp flags +var DCP_PRODUCER uint32 = 0x01 + +// the include XATTRS bit in dcp flags +var DCP_OPEN_INCLUDE_XATTRS uint32 = 0x04 + +// the include deletion time bit in dcp flags +var DCP_OPEN_INCLUDE_DELETE_TIMES uint32 = 0x20 + +// Datatype to Include XATTRS in SUBDOC GET +var SUBDOC_FLAG_XATTR uint8 = 0x04 + +// MCItem is an internal representation of an item. +type MCItem struct { + Cas uint64 + Flags, Expiration uint32 + Data []byte +} + +// Number of bytes in a binary protocol header. +const HDR_LEN = 24 + +// Mapping of CommandCode -> name of command (not exhaustive) +var CommandNames map[CommandCode]string + +// StatusNames human readable names for memcached response. 
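An editorial aside, not part of the vendored file: BufferedCommandCodeMap above is what makeUprEvent consults to decide whether a message counts against the DCP flow-control buffer, in which case its size is surfaced as UprEvent.AckSize. When a feed is created with ackByClient set, the consumer must hand those sizes back through ClientAck so the feed can send UPR_BUFFERACK once roughly 20% of the negotiated buffer (bufferAckThreshold) has accumulated. A hedged consumer-loop sketch; the function names are illustrative:

package example

import (
	memcached "github.com/couchbase/gomemcached/client"
)

// consume is an illustrative event loop for a feed created with
// NewUprFeedWithConfig(true): every event whose AckSize is non-zero is
// handed back so the feed can ack the producer when the threshold is hit.
func consume(feed *memcached.UprFeed, handle func(*memcached.UprEvent)) error {
	for event := range feed.GetUprEventCh() {
		if event.Error != nil {
			return event.Error
		}
		handle(event)
		if event.AckSize > 0 {
			if err := feed.ClientAck(event); err != nil {
				return err
			}
		}
	}
	return feed.GetError()
}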
+var StatusNames map[Status]string + +func init() { + CommandNames = make(map[CommandCode]string) + CommandNames[GET] = "GET" + CommandNames[SET] = "SET" + CommandNames[ADD] = "ADD" + CommandNames[REPLACE] = "REPLACE" + CommandNames[DELETE] = "DELETE" + CommandNames[INCREMENT] = "INCREMENT" + CommandNames[DECREMENT] = "DECREMENT" + CommandNames[QUIT] = "QUIT" + CommandNames[FLUSH] = "FLUSH" + CommandNames[GETQ] = "GETQ" + CommandNames[NOOP] = "NOOP" + CommandNames[VERSION] = "VERSION" + CommandNames[GETK] = "GETK" + CommandNames[GETKQ] = "GETKQ" + CommandNames[APPEND] = "APPEND" + CommandNames[PREPEND] = "PREPEND" + CommandNames[STAT] = "STAT" + CommandNames[SETQ] = "SETQ" + CommandNames[ADDQ] = "ADDQ" + CommandNames[REPLACEQ] = "REPLACEQ" + CommandNames[DELETEQ] = "DELETEQ" + CommandNames[INCREMENTQ] = "INCREMENTQ" + CommandNames[DECREMENTQ] = "DECREMENTQ" + CommandNames[QUITQ] = "QUITQ" + CommandNames[FLUSHQ] = "FLUSHQ" + CommandNames[APPENDQ] = "APPENDQ" + CommandNames[PREPENDQ] = "PREPENDQ" + CommandNames[RGET] = "RGET" + CommandNames[RSET] = "RSET" + CommandNames[RSETQ] = "RSETQ" + CommandNames[RAPPEND] = "RAPPEND" + CommandNames[RAPPENDQ] = "RAPPENDQ" + CommandNames[RPREPEND] = "RPREPEND" + CommandNames[RPREPENDQ] = "RPREPENDQ" + CommandNames[RDELETE] = "RDELETE" + CommandNames[RDELETEQ] = "RDELETEQ" + CommandNames[RINCR] = "RINCR" + CommandNames[RINCRQ] = "RINCRQ" + CommandNames[RDECR] = "RDECR" + CommandNames[RDECRQ] = "RDECRQ" + + CommandNames[SASL_LIST_MECHS] = "SASL_LIST_MECHS" + CommandNames[SASL_AUTH] = "SASL_AUTH" + CommandNames[SASL_STEP] = "SASL_STEP" + + CommandNames[TAP_CONNECT] = "TAP_CONNECT" + CommandNames[TAP_MUTATION] = "TAP_MUTATION" + CommandNames[TAP_DELETE] = "TAP_DELETE" + CommandNames[TAP_FLUSH] = "TAP_FLUSH" + CommandNames[TAP_OPAQUE] = "TAP_OPAQUE" + CommandNames[TAP_VBUCKET_SET] = "TAP_VBUCKET_SET" + CommandNames[TAP_CHECKPOINT_START] = "TAP_CHECKPOINT_START" + CommandNames[TAP_CHECKPOINT_END] = "TAP_CHECKPOINT_END" + + CommandNames[UPR_OPEN] = "UPR_OPEN" + CommandNames[UPR_ADDSTREAM] = "UPR_ADDSTREAM" + CommandNames[UPR_CLOSESTREAM] = "UPR_CLOSESTREAM" + CommandNames[UPR_FAILOVERLOG] = "UPR_FAILOVERLOG" + CommandNames[UPR_STREAMREQ] = "UPR_STREAMREQ" + CommandNames[UPR_STREAMEND] = "UPR_STREAMEND" + CommandNames[UPR_SNAPSHOT] = "UPR_SNAPSHOT" + CommandNames[UPR_MUTATION] = "UPR_MUTATION" + CommandNames[UPR_DELETION] = "UPR_DELETION" + CommandNames[UPR_EXPIRATION] = "UPR_EXPIRATION" + CommandNames[UPR_FLUSH] = "UPR_FLUSH" + CommandNames[UPR_NOOP] = "UPR_NOOP" + CommandNames[UPR_BUFFERACK] = "UPR_BUFFERACK" + CommandNames[UPR_CONTROL] = "UPR_CONTROL" + CommandNames[SUBDOC_GET] = "SUBDOC_GET" + CommandNames[SUBDOC_MULTI_LOOKUP] = "SUBDOC_MULTI_LOOKUP" + + StatusNames = make(map[Status]string) + StatusNames[SUCCESS] = "SUCCESS" + StatusNames[KEY_ENOENT] = "KEY_ENOENT" + StatusNames[KEY_EEXISTS] = "KEY_EEXISTS" + StatusNames[E2BIG] = "E2BIG" + StatusNames[EINVAL] = "EINVAL" + StatusNames[NOT_STORED] = "NOT_STORED" + StatusNames[DELTA_BADVAL] = "DELTA_BADVAL" + StatusNames[NOT_MY_VBUCKET] = "NOT_MY_VBUCKET" + StatusNames[NO_BUCKET] = "NO_BUCKET" + StatusNames[AUTH_STALE] = "AUTH_STALE" + StatusNames[AUTH_ERROR] = "AUTH_ERROR" + StatusNames[AUTH_CONTINUE] = "AUTH_CONTINUE" + StatusNames[ERANGE] = "ERANGE" + StatusNames[ROLLBACK] = "ROLLBACK" + StatusNames[EACCESS] = "EACCESS" + StatusNames[NOT_INITIALIZED] = "NOT_INITIALIZED" + StatusNames[UNKNOWN_COMMAND] = "UNKNOWN_COMMAND" + StatusNames[ENOMEM] = "ENOMEM" + StatusNames[NOT_SUPPORTED] = "NOT_SUPPORTED" + 
StatusNames[EINTERNAL] = "EINTERNAL" + StatusNames[EBUSY] = "EBUSY" + StatusNames[TMPFAIL] = "TMPFAIL" + StatusNames[SUBDOC_PATH_NOT_FOUND] = "SUBDOC_PATH_NOT_FOUND" + StatusNames[SUBDOC_BAD_MULTI] = "SUBDOC_BAD_MULTI" + +} + +// String an op code. +func (o CommandCode) String() (rv string) { + rv = CommandNames[o] + if rv == "" { + rv = fmt.Sprintf("0x%02x", int(o)) + } + return rv +} + +// String an op code. +func (s Status) String() (rv string) { + rv = StatusNames[s] + if rv == "" { + rv = fmt.Sprintf("0x%02x", int(s)) + } + return rv +} + +// IsQuiet will return true if a command is a "quiet" command. +func (o CommandCode) IsQuiet() bool { + switch o { + case GETQ, + GETKQ, + SETQ, + ADDQ, + REPLACEQ, + DELETEQ, + INCREMENTQ, + DECREMENTQ, + QUITQ, + FLUSHQ, + APPENDQ, + PREPENDQ, + RSETQ, + RAPPENDQ, + RPREPENDQ, + RDELETEQ, + RINCRQ, + RDECRQ: + return true + } + return false +} diff --git a/vendor/github.com/couchbase/gomemcached/mc_req.go b/vendor/github.com/couchbase/gomemcached/mc_req.go new file mode 100644 index 000000000000..3ff67ab9a7a7 --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/mc_req.go @@ -0,0 +1,197 @@ +package gomemcached + +import ( + "encoding/binary" + "fmt" + "io" +) + +// The maximum reasonable body length to expect. +// Anything larger than this will result in an error. +// The current limit, 20MB, is the size limit supported by ep-engine. +var MaxBodyLen = int(20 * 1024 * 1024) + +// MCRequest is memcached Request +type MCRequest struct { + // The command being issued + Opcode CommandCode + // The CAS (if applicable, or 0) + Cas uint64 + // An opaque value to be returned with this request + Opaque uint32 + // The vbucket to which this command belongs + VBucket uint16 + // Command extras, key, and body + Extras, Key, Body, ExtMeta []byte + // Datatype identifier + DataType uint8 +} + +// Size gives the number of bytes this request requires. +func (req *MCRequest) Size() int { + return HDR_LEN + len(req.Extras) + len(req.Key) + len(req.Body) + len(req.ExtMeta) +} + +// A debugging string representation of this request +func (req MCRequest) String() string { + return fmt.Sprintf("{MCRequest opcode=%s, bodylen=%d, key='%s'}", + req.Opcode, len(req.Body), req.Key) +} + +func (req *MCRequest) fillHeaderBytes(data []byte) int { + + pos := 0 + data[pos] = REQ_MAGIC + pos++ + data[pos] = byte(req.Opcode) + pos++ + binary.BigEndian.PutUint16(data[pos:pos+2], + uint16(len(req.Key))) + pos += 2 + + // 4 + data[pos] = byte(len(req.Extras)) + pos++ + // Data type + if req.DataType != 0 { + data[pos] = byte(req.DataType) + } + pos++ + binary.BigEndian.PutUint16(data[pos:pos+2], req.VBucket) + pos += 2 + + // 8 + binary.BigEndian.PutUint32(data[pos:pos+4], + uint32(len(req.Body)+len(req.Key)+len(req.Extras)+len(req.ExtMeta))) + pos += 4 + + // 12 + binary.BigEndian.PutUint32(data[pos:pos+4], req.Opaque) + pos += 4 + + // 16 + if req.Cas != 0 { + binary.BigEndian.PutUint64(data[pos:pos+8], req.Cas) + } + pos += 8 + + if len(req.Extras) > 0 { + copy(data[pos:pos+len(req.Extras)], req.Extras) + pos += len(req.Extras) + } + + if len(req.Key) > 0 { + copy(data[pos:pos+len(req.Key)], req.Key) + pos += len(req.Key) + } + + return pos +} + +// HeaderBytes will return the wire representation of the request header +// (with the extras and key). +func (req *MCRequest) HeaderBytes() []byte { + data := make([]byte, HDR_LEN+len(req.Extras)+len(req.Key)) + + req.fillHeaderBytes(data) + + return data +} + +// Bytes will return the wire representation of this request. 
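As an illustration outside the patch itself: an MCRequest on the wire is a 24-byte header (HDR_LEN) followed by extras, key, body and extended metadata, so Size is simply HDR_LEN plus those lengths, and the header's total-body-length field is their sum. A small runnable check of that arithmetic using the exported API:

package main

import (
	"fmt"

	"github.com/couchbase/gomemcached"
)

func main() {
	req := gomemcached.MCRequest{
		Opcode: gomemcached.GET,
		Key:    []byte("greeting"),
		Opaque: 0xDEAD,
	}
	wire := req.Bytes()
	// 24-byte header + 0 extras + 8-byte key + 0 body = 32 bytes on the wire.
	fmt.Println(req.Size(), len(wire), gomemcached.HDR_LEN+len(req.Key)) // 32 32 32
}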
+func (req *MCRequest) Bytes() []byte { + data := make([]byte, req.Size()) + + pos := req.fillHeaderBytes(data) + + if len(req.Body) > 0 { + copy(data[pos:pos+len(req.Body)], req.Body) + } + + if len(req.ExtMeta) > 0 { + copy(data[pos+len(req.Body):pos+len(req.Body)+len(req.ExtMeta)], req.ExtMeta) + } + + return data +} + +// Transmit will send this request message across a writer. +func (req *MCRequest) Transmit(w io.Writer) (n int, err error) { + if len(req.Body) < 128 { + n, err = w.Write(req.Bytes()) + } else { + n, err = w.Write(req.HeaderBytes()) + if err == nil { + m := 0 + m, err = w.Write(req.Body) + n += m + } + } + return +} + +// Receive will fill this MCRequest with the data from a reader. +func (req *MCRequest) Receive(r io.Reader, hdrBytes []byte) (int, error) { + if len(hdrBytes) < HDR_LEN { + hdrBytes = []byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0} + } + n, err := io.ReadFull(r, hdrBytes) + if err != nil { + return n, err + } + + if hdrBytes[0] != RES_MAGIC && hdrBytes[0] != REQ_MAGIC { + return n, fmt.Errorf("bad magic: 0x%02x", hdrBytes[0]) + } + + klen := int(binary.BigEndian.Uint16(hdrBytes[2:])) + elen := int(hdrBytes[4]) + // Data type at 5 + req.DataType = uint8(hdrBytes[5]) + + req.Opcode = CommandCode(hdrBytes[1]) + // Vbucket at 6:7 + req.VBucket = binary.BigEndian.Uint16(hdrBytes[6:]) + totalBodyLen := int(binary.BigEndian.Uint32(hdrBytes[8:])) + + req.Opaque = binary.BigEndian.Uint32(hdrBytes[12:]) + req.Cas = binary.BigEndian.Uint64(hdrBytes[16:]) + + if totalBodyLen > 0 { + buf := make([]byte, totalBodyLen) + m, err := io.ReadFull(r, buf) + n += m + if err == nil { + if req.Opcode >= TAP_MUTATION && + req.Opcode <= TAP_CHECKPOINT_END && + len(buf) > 1 { + // In these commands there is "engine private" + // data at the end of the extras. The first 2 + // bytes of extra data give its length. + elen += int(binary.BigEndian.Uint16(buf)) + } + + req.Extras = buf[0:elen] + req.Key = buf[elen : klen+elen] + + // get the length of extended metadata + extMetaLen := 0 + if elen > 29 { + extMetaLen = int(binary.BigEndian.Uint16(req.Extras[28:30])) + } + + bodyLen := totalBodyLen - klen - elen - extMetaLen + if bodyLen > MaxBodyLen { + return n, fmt.Errorf("%d is too big (max %d)", + bodyLen, MaxBodyLen) + } + + req.Body = buf[klen+elen : klen+elen+bodyLen] + req.ExtMeta = buf[klen+elen+bodyLen:] + } + } + return n, err +} diff --git a/vendor/github.com/couchbase/gomemcached/mc_res.go b/vendor/github.com/couchbase/gomemcached/mc_res.go new file mode 100644 index 000000000000..2b4cfe13495e --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/mc_res.go @@ -0,0 +1,267 @@ +package gomemcached + +import ( + "encoding/binary" + "fmt" + "io" + "sync" +) + +// MCResponse is memcached response +type MCResponse struct { + // The command opcode of the command that sent the request + Opcode CommandCode + // The status of the response + Status Status + // The opaque sent in the request + Opaque uint32 + // The CAS identifier (if applicable) + Cas uint64 + // Extras, key, and body for this response + Extras, Key, Body []byte + // If true, this represents a fatal condition and we should hang up + Fatal bool + // Datatype identifier + DataType uint8 +} + +// A debugging string representation of this response +func (res MCResponse) String() string { + return fmt.Sprintf("{MCResponse status=%v keylen=%d, extralen=%d, bodylen=%d}", + res.Status, len(res.Key), len(res.Extras), len(res.Body)) +} + +// Response as an error. 
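A further editorial sketch, not part of the vendored file: Transmit below writes small requests with a single Write and splits header and body once the body reaches 128 bytes, while Receive reads the 24-byte header first and then the variable part. A round-trip over an in-memory buffer; the key and value are arbitrary:

package main

import (
	"bytes"
	"fmt"

	"github.com/couchbase/gomemcached"
)

func main() {
	out := gomemcached.MCRequest{
		Opcode: gomemcached.SET,
		Key:    []byte("k"),
		Body:   []byte("v"),
	}

	var wire bytes.Buffer
	if _, err := out.Transmit(&wire); err != nil {
		panic(err)
	}

	var in gomemcached.MCRequest
	hdr := make([]byte, gomemcached.HDR_LEN)
	if _, err := in.Receive(&wire, hdr); err != nil {
		panic(err)
	}
	fmt.Printf("%s %s=%s\n", in.Opcode, in.Key, in.Body) // SET k=v
}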
+func (res *MCResponse) Error() string { + return fmt.Sprintf("MCResponse status=%v, opcode=%v, opaque=%v, msg: %s", + res.Status, res.Opcode, res.Opaque, string(res.Body)) +} + +func errStatus(e error) Status { + status := Status(0xffff) + if res, ok := e.(*MCResponse); ok { + status = res.Status + } + return status +} + +// IsNotFound is true if this error represents a "not found" response. +func IsNotFound(e error) bool { + return errStatus(e) == KEY_ENOENT +} + +// IsFatal is false if this error isn't believed to be fatal to a connection. +func IsFatal(e error) bool { + if e == nil { + return false + } + _, ok := isFatal[errStatus(e)] + if ok { + return true + } + return false +} + +// Size is number of bytes this response consumes on the wire. +func (res *MCResponse) Size() int { + return HDR_LEN + len(res.Extras) + len(res.Key) + len(res.Body) +} + +func (res *MCResponse) fillHeaderBytes(data []byte) int { + pos := 0 + data[pos] = RES_MAGIC + pos++ + data[pos] = byte(res.Opcode) + pos++ + binary.BigEndian.PutUint16(data[pos:pos+2], + uint16(len(res.Key))) + pos += 2 + + // 4 + data[pos] = byte(len(res.Extras)) + pos++ + // Data type + if res.DataType != 0 { + data[pos] = byte(res.DataType) + } else { + data[pos] = 0 + } + pos++ + binary.BigEndian.PutUint16(data[pos:pos+2], uint16(res.Status)) + pos += 2 + + // 8 + binary.BigEndian.PutUint32(data[pos:pos+4], + uint32(len(res.Body)+len(res.Key)+len(res.Extras))) + pos += 4 + + // 12 + binary.BigEndian.PutUint32(data[pos:pos+4], res.Opaque) + pos += 4 + + // 16 + binary.BigEndian.PutUint64(data[pos:pos+8], res.Cas) + pos += 8 + + if len(res.Extras) > 0 { + copy(data[pos:pos+len(res.Extras)], res.Extras) + pos += len(res.Extras) + } + + if len(res.Key) > 0 { + copy(data[pos:pos+len(res.Key)], res.Key) + pos += len(res.Key) + } + + return pos +} + +// HeaderBytes will get just the header bytes for this response. +func (res *MCResponse) HeaderBytes() []byte { + data := make([]byte, HDR_LEN+len(res.Extras)+len(res.Key)) + + res.fillHeaderBytes(data) + + return data +} + +// Bytes will return the actual bytes transmitted for this response. +func (res *MCResponse) Bytes() []byte { + data := make([]byte, res.Size()) + + pos := res.fillHeaderBytes(data) + + copy(data[pos:pos+len(res.Body)], res.Body) + + return data +} + +// Transmit will send this response message across a writer. +func (res *MCResponse) Transmit(w io.Writer) (n int, err error) { + if len(res.Body) < 128 { + n, err = w.Write(res.Bytes()) + } else { + n, err = w.Write(res.HeaderBytes()) + if err == nil { + m := 0 + m, err = w.Write(res.Body) + m += n + } + } + return +} + +// Receive will fill this MCResponse with the data from this reader. 
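For reference (not part of the vendored file): because *MCResponse implements error, getResponse in transport.go returns a non-SUCCESS response directly as the error value, and IsNotFound and IsFatal then let callers branch on the embedded status without type assertions. An illustrative classification helper; the name and return values are mine:

package example

import (
	"github.com/couchbase/gomemcached"
)

// classify sketches one way to react to an error from a gomemcached call.
func classify(err error) string {
	switch {
	case err == nil:
		return "ok"
	case gomemcached.IsNotFound(err):
		return "miss" // KEY_ENOENT: a cache miss, not a failure
	case gomemcached.IsFatal(err):
		return "reconnect" // e.g. AUTH_ERROR or NO_BUCKET: drop the connection
	default:
		return "retry"
	}
}

Under this sketch, classify(&gomemcached.MCResponse{Status: gomemcached.KEY_ENOENT}) reports a miss rather than a hard failure.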
+func (res *MCResponse) Receive(r io.Reader, hdrBytes []byte) (n int, err error) { + if len(hdrBytes) < HDR_LEN { + hdrBytes = []byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0} + } + n, err = io.ReadFull(r, hdrBytes) + if err != nil { + return n, err + } + + if hdrBytes[0] != RES_MAGIC && hdrBytes[0] != REQ_MAGIC { + return n, fmt.Errorf("bad magic: 0x%02x", hdrBytes[0]) + } + + klen := int(binary.BigEndian.Uint16(hdrBytes[2:4])) + elen := int(hdrBytes[4]) + + res.Opcode = CommandCode(hdrBytes[1]) + res.DataType = uint8(hdrBytes[5]) + res.Status = Status(binary.BigEndian.Uint16(hdrBytes[6:8])) + res.Opaque = binary.BigEndian.Uint32(hdrBytes[12:16]) + res.Cas = binary.BigEndian.Uint64(hdrBytes[16:24]) + + bodyLen := int(binary.BigEndian.Uint32(hdrBytes[8:12])) - (klen + elen) + + //defer function to debug the panic seen with MB-15557 + defer func() { + if e := recover(); e != nil { + err = fmt.Errorf(`Panic in Receive. Response %v \n + key len %v extra len %v bodylen %v`, res, klen, elen, bodyLen) + } + }() + + buf := make([]byte, klen+elen+bodyLen) + m, err := io.ReadFull(r, buf) + if err == nil { + res.Extras = buf[0:elen] + res.Key = buf[elen : klen+elen] + res.Body = buf[klen+elen:] + } + + return n + m, err +} + +type MCResponsePool struct { + pool *sync.Pool +} + +func NewMCResponsePool() *MCResponsePool { + rv := &MCResponsePool{ + pool: &sync.Pool{ + New: func() interface{} { + return &MCResponse{} + }, + }, + } + + return rv +} + +func (this *MCResponsePool) Get() *MCResponse { + return this.pool.Get().(*MCResponse) +} + +func (this *MCResponsePool) Put(r *MCResponse) { + if r == nil { + return + } + + r.Extras = nil + r.Key = nil + r.Body = nil + r.Fatal = false + + this.pool.Put(r) +} + +type StringMCResponsePool struct { + pool *sync.Pool + size int +} + +func NewStringMCResponsePool(size int) *StringMCResponsePool { + rv := &StringMCResponsePool{ + pool: &sync.Pool{ + New: func() interface{} { + return make(map[string]*MCResponse, size) + }, + }, + size: size, + } + + return rv +} + +func (this *StringMCResponsePool) Get() map[string]*MCResponse { + return this.pool.Get().(map[string]*MCResponse) +} + +func (this *StringMCResponsePool) Put(m map[string]*MCResponse) { + if m == nil || len(m) > 2*this.size { + return + } + + for k := range m { + m[k] = nil + delete(m, k) + } + + this.pool.Put(m) +} diff --git a/vendor/github.com/couchbase/gomemcached/tap.go b/vendor/github.com/couchbase/gomemcached/tap.go new file mode 100644 index 000000000000..e48623281b03 --- /dev/null +++ b/vendor/github.com/couchbase/gomemcached/tap.go @@ -0,0 +1,168 @@ +package gomemcached + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "io/ioutil" + "strings" +) + +type TapConnectFlag uint32 + +// Tap connect option flags +const ( + BACKFILL = TapConnectFlag(0x01) + DUMP = TapConnectFlag(0x02) + LIST_VBUCKETS = TapConnectFlag(0x04) + TAKEOVER_VBUCKETS = TapConnectFlag(0x08) + SUPPORT_ACK = TapConnectFlag(0x10) + REQUEST_KEYS_ONLY = TapConnectFlag(0x20) + CHECKPOINT = TapConnectFlag(0x40) + REGISTERED_CLIENT = TapConnectFlag(0x80) + FIX_FLAG_BYTEORDER = TapConnectFlag(0x100) +) + +// Tap opaque event subtypes +const ( + TAP_OPAQUE_ENABLE_AUTO_NACK = 0 + TAP_OPAQUE_INITIAL_VBUCKET_STREAM = 1 + TAP_OPAQUE_ENABLE_CHECKPOINT_SYNC = 2 + TAP_OPAQUE_CLOSE_TAP_STREAM = 7 + TAP_OPAQUE_CLOSE_BACKFILL = 8 +) + +// Tap item flags +const ( + TAP_ACK = 1 + TAP_NO_VALUE = 2 + TAP_FLAG_NETWORK_BYTE_ORDER = 4 +) + +// TapConnectFlagNames for TapConnectFlag +var 
TapConnectFlagNames = map[TapConnectFlag]string{ + BACKFILL: "BACKFILL", + DUMP: "DUMP", + LIST_VBUCKETS: "LIST_VBUCKETS", + TAKEOVER_VBUCKETS: "TAKEOVER_VBUCKETS", + SUPPORT_ACK: "SUPPORT_ACK", + REQUEST_KEYS_ONLY: "REQUEST_KEYS_ONLY", + CHECKPOINT: "CHECKPOINT", + REGISTERED_CLIENT: "REGISTERED_CLIENT", + FIX_FLAG_BYTEORDER: "FIX_FLAG_BYTEORDER", +} + +// TapItemParser is a function to parse a single tap extra. +type TapItemParser func(io.Reader) (interface{}, error) + +// TapParseUint64 is a function to parse a single tap uint64. +func TapParseUint64(r io.Reader) (interface{}, error) { + var rv uint64 + err := binary.Read(r, binary.BigEndian, &rv) + return rv, err +} + +// TapParseUint16 is a function to parse a single tap uint16. +func TapParseUint16(r io.Reader) (interface{}, error) { + var rv uint16 + err := binary.Read(r, binary.BigEndian, &rv) + return rv, err +} + +// TapParseBool is a function to parse a single tap boolean. +func TapParseBool(r io.Reader) (interface{}, error) { + return true, nil +} + +// TapParseVBList parses a list of vBucket numbers as []uint16. +func TapParseVBList(r io.Reader) (interface{}, error) { + num, err := TapParseUint16(r) + if err != nil { + return nil, err + } + n := int(num.(uint16)) + + rv := make([]uint16, n) + for i := 0; i < n; i++ { + x, err := TapParseUint16(r) + if err != nil { + return nil, err + } + rv[i] = x.(uint16) + } + + return rv, err +} + +// TapFlagParsers parser functions for TAP fields. +var TapFlagParsers = map[TapConnectFlag]TapItemParser{ + BACKFILL: TapParseUint64, + LIST_VBUCKETS: TapParseVBList, +} + +// SplitFlags will split the ORed flags into the individual bit flags. +func (f TapConnectFlag) SplitFlags() []TapConnectFlag { + rv := []TapConnectFlag{} + for i := uint32(1); f != 0; i = i << 1 { + if uint32(f)&i == i { + rv = append(rv, TapConnectFlag(i)) + } + f = TapConnectFlag(uint32(f) & (^i)) + } + return rv +} + +func (f TapConnectFlag) String() string { + parts := []string{} + for _, x := range f.SplitFlags() { + p := TapConnectFlagNames[x] + if p == "" { + p = fmt.Sprintf("0x%x", int(x)) + } + parts = append(parts, p) + } + return strings.Join(parts, "|") +} + +type TapConnect struct { + Flags map[TapConnectFlag]interface{} + RemainingBody []byte + Name string +} + +// ParseTapCommands parse the tap request into the interesting bits we may +// need to do something with. +func (req *MCRequest) ParseTapCommands() (TapConnect, error) { + rv := TapConnect{ + Flags: map[TapConnectFlag]interface{}{}, + Name: string(req.Key), + } + + if len(req.Extras) < 4 { + return rv, fmt.Errorf("not enough extra bytes: %x", req.Extras) + } + + flags := TapConnectFlag(binary.BigEndian.Uint32(req.Extras)) + + r := bytes.NewReader(req.Body) + + for _, f := range flags.SplitFlags() { + fun := TapFlagParsers[f] + if fun == nil { + fun = TapParseBool + } + + val, err := fun(r) + if err != nil { + return rv, err + } + + rv.Flags[f] = val + } + + var err error + rv.RemainingBody, err = ioutil.ReadAll(r) + + return rv, err +} diff --git a/vendor/github.com/couchbase/goutils/LICENSE.md b/vendor/github.com/couchbase/goutils/LICENSE.md new file mode 100644 index 000000000000..a572e246e635 --- /dev/null +++ b/vendor/github.com/couchbase/goutils/LICENSE.md @@ -0,0 +1,47 @@ +COUCHBASE INC. 
COMMUNITY EDITION LICENSE AGREEMENT + +IMPORTANT-READ CAREFULLY: BY CLICKING THE "I ACCEPT" BOX OR INSTALLING, +DOWNLOADING OR OTHERWISE USING THIS SOFTWARE AND ANY ASSOCIATED +DOCUMENTATION, YOU, ON BEHALF OF YOURSELF OR AS AN AUTHORIZED +REPRESENTATIVE ON BEHALF OF AN ENTITY ("LICENSEE") AGREE TO ALL THE +TERMS OF THIS COMMUNITY EDITION LICENSE AGREEMENT (THE "AGREEMENT") +REGARDING YOUR USE OF THE SOFTWARE. YOU REPRESENT AND WARRANT THAT YOU +HAVE FULL LEGAL AUTHORITY TO BIND THE LICENSEE TO THIS AGREEMENT. IF YOU +DO NOT AGREE WITH ALL OF THESE TERMS, DO NOT SELECT THE "I ACCEPT" BOX +AND DO NOT INSTALL, DOWNLOAD OR OTHERWISE USE THE SOFTWARE. THE +EFFECTIVE DATE OF THIS AGREEMENT IS THE DATE ON WHICH YOU CLICK "I +ACCEPT" OR OTHERWISE INSTALL, DOWNLOAD OR USE THE SOFTWARE. + +1. License Grant. Couchbase Inc. hereby grants Licensee, free of charge, +the non-exclusive right to use, copy, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to +whom the Software is furnished to do so, subject to Licensee including +the following copyright notice in all copies or substantial portions of +the Software: + +Couchbase (r) http://www.Couchbase.com Copyright 2016 Couchbase, Inc. + +As used in this Agreement, "Software" means the object code version of +the applicable elastic data management server software provided by +Couchbase Inc. + +2. Restrictions. Licensee will not reverse engineer, disassemble, or +decompile the Software (except to the extent such restrictions are +prohibited by law). + +3. Support. Couchbase, Inc. will provide Licensee with access to, and +use of, the Couchbase, Inc. support forum available at the following +URL: http://www.couchbase.org/forums/. Couchbase, Inc. may, at its +discretion, modify, suspend or terminate support at any time upon notice +to Licensee. + +4. Warranty Disclaimer and Limitation of Liability. THE SOFTWARE IS +PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +COUCHBASE INC. OR THE AUTHORS OR COPYRIGHT HOLDERS IN THE SOFTWARE BE +LIABLE FOR ANY CLAIM, DAMAGES (IINCLUDING, WITHOUT LIMITATION, DIRECT, +INDIRECT OR CONSEQUENTIAL DAMAGES) OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/couchbase/goutils/logging/logger.go b/vendor/github.com/couchbase/goutils/logging/logger.go new file mode 100644 index 000000000000..b9948f9b2ef8 --- /dev/null +++ b/vendor/github.com/couchbase/goutils/logging/logger.go @@ -0,0 +1,481 @@ +// Copyright (c) 2016 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+ +package logging + +import ( + "os" + "runtime" + "strings" + "sync" +) + +type Level int + +const ( + NONE = Level(iota) // Disable all logging + FATAL // System is in severe error state and has to abort + SEVERE // System is in severe error state and cannot recover reliably + ERROR // System is in error state but can recover and continue reliably + WARN // System approaching error state, or is in a correct but undesirable state + INFO // System-level events and status, in correct states + REQUEST // Request-level events, with request-specific rlevel + TRACE // Trace detailed system execution, e.g. function entry / exit + DEBUG // Debug +) + +type LogEntryFormatter int + +const ( + TEXTFORMATTER = LogEntryFormatter(iota) + JSONFORMATTER + KVFORMATTER +) + +func (level Level) String() string { + return _LEVEL_NAMES[level] +} + +var _LEVEL_NAMES = []string{ + DEBUG: "DEBUG", + TRACE: "TRACE", + REQUEST: "REQUEST", + INFO: "INFO", + WARN: "WARN", + ERROR: "ERROR", + SEVERE: "SEVERE", + FATAL: "FATAL", + NONE: "NONE", +} + +var _LEVEL_MAP = map[string]Level{ + "debug": DEBUG, + "trace": TRACE, + "request": REQUEST, + "info": INFO, + "warn": WARN, + "error": ERROR, + "severe": SEVERE, + "fatal": FATAL, + "none": NONE, +} + +func ParseLevel(name string) (level Level, ok bool) { + level, ok = _LEVEL_MAP[strings.ToLower(name)] + return +} + +/* + +Pair supports logging of key-value pairs. Keys beginning with _ are +reserved for the logger, e.g. _time, _level, _msg, and _rlevel. The +Pair APIs are designed to avoid heap allocation and garbage +collection. + +*/ +type Pairs []Pair +type Pair struct { + Name string + Value interface{} +} + +/* + +Map allows key-value pairs to be specified using map literals or data +structures. For example: + +Errorm(msg, Map{...}) + +Map incurs heap allocation and garbage collection, so the Pair APIs +should be preferred. + +*/ +type Map map[string]interface{} + +// Logger provides a common interface for logging libraries +type Logger interface { + /* + These APIs write all the given pairs in addition to standard logger keys. + */ + Logp(level Level, msg string, kv ...Pair) + + Debugp(msg string, kv ...Pair) + + Tracep(msg string, kv ...Pair) + + Requestp(rlevel Level, msg string, kv ...Pair) + + Infop(msg string, kv ...Pair) + + Warnp(msg string, kv ...Pair) + + Errorp(msg string, kv ...Pair) + + Severep(msg string, kv ...Pair) + + Fatalp(msg string, kv ...Pair) + + /* + These APIs write the fields in the given kv Map in addition to standard logger keys. + */ + Logm(level Level, msg string, kv Map) + + Debugm(msg string, kv Map) + + Tracem(msg string, kv Map) + + Requestm(rlevel Level, msg string, kv Map) + + Infom(msg string, kv Map) + + Warnm(msg string, kv Map) + + Errorm(msg string, kv Map) + + Severem(msg string, kv Map) + + Fatalm(msg string, kv Map) + + /* + + These APIs only write _msg, _time, _level, and other logger keys. If + the msg contains other fields, use the Pair or Map APIs instead. 
+ + */ + Logf(level Level, fmt string, args ...interface{}) + + Debugf(fmt string, args ...interface{}) + + Tracef(fmt string, args ...interface{}) + + Requestf(rlevel Level, fmt string, args ...interface{}) + + Infof(fmt string, args ...interface{}) + + Warnf(fmt string, args ...interface{}) + + Errorf(fmt string, args ...interface{}) + + Severef(fmt string, args ...interface{}) + + Fatalf(fmt string, args ...interface{}) + + /* + These APIs control the logging level + */ + + SetLevel(Level) // Set the logging level + + Level() Level // Get the current logging level +} + +var logger Logger = nil +var curLevel Level = DEBUG // initially set to never skip + +var loggerMutex sync.RWMutex + +// All the methods below first acquire the mutex (mostly in exclusive mode) +// and only then check if logging at the current level is enabled. +// This introduces a fair bottleneck for those log entries that should be +// skipped (the majority, at INFO or below levels) +// We try to predict here if we should lock the mutex at all by caching +// the current log level: while dynamically changing logger, there might +// be the odd entry skipped as the new level is cached. +// Since we seem to never change the logger, this is not an issue. +func skipLogging(level Level) bool { + if logger == nil { + return true + } + return level > curLevel +} + +func SetLogger(newLogger Logger) { + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger = newLogger + if logger == nil { + curLevel = NONE + } else { + curLevel = newLogger.Level() + } +} + +func Logp(level Level, msg string, kv ...Pair) { + if skipLogging(level) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Logp(level, msg, kv...) +} + +func Debugp(msg string, kv ...Pair) { + if skipLogging(DEBUG) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Debugp(msg, kv...) +} + +func Tracep(msg string, kv ...Pair) { + if skipLogging(TRACE) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Tracep(msg, kv...) +} + +func Requestp(rlevel Level, msg string, kv ...Pair) { + if skipLogging(REQUEST) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Requestp(rlevel, msg, kv...) +} + +func Infop(msg string, kv ...Pair) { + if skipLogging(INFO) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Infop(msg, kv...) +} + +func Warnp(msg string, kv ...Pair) { + if skipLogging(WARN) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Warnp(msg, kv...) +} + +func Errorp(msg string, kv ...Pair) { + if skipLogging(ERROR) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Errorp(msg, kv...) +} + +func Severep(msg string, kv ...Pair) { + if skipLogging(SEVERE) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Severep(msg, kv...) +} + +func Fatalp(msg string, kv ...Pair) { + if skipLogging(FATAL) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Fatalp(msg, kv...) 
+} + +func Logm(level Level, msg string, kv Map) { + if skipLogging(level) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Logm(level, msg, kv) +} + +func Debugm(msg string, kv Map) { + if skipLogging(DEBUG) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Debugm(msg, kv) +} + +func Tracem(msg string, kv Map) { + if skipLogging(TRACE) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Tracem(msg, kv) +} + +func Requestm(rlevel Level, msg string, kv Map) { + if skipLogging(REQUEST) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Requestm(rlevel, msg, kv) +} + +func Infom(msg string, kv Map) { + if skipLogging(INFO) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Infom(msg, kv) +} + +func Warnm(msg string, kv Map) { + if skipLogging(WARN) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Warnm(msg, kv) +} + +func Errorm(msg string, kv Map) { + if skipLogging(ERROR) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Errorm(msg, kv) +} + +func Severem(msg string, kv Map) { + if skipLogging(SEVERE) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Severem(msg, kv) +} + +func Fatalm(msg string, kv Map) { + if skipLogging(FATAL) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Fatalm(msg, kv) +} + +func Logf(level Level, fmt string, args ...interface{}) { + if skipLogging(level) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Logf(level, fmt, args...) +} + +func Debugf(fmt string, args ...interface{}) { + if skipLogging(DEBUG) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Debugf(fmt, args...) +} + +func Tracef(fmt string, args ...interface{}) { + if skipLogging(TRACE) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Tracef(fmt, args...) +} + +func Requestf(rlevel Level, fmt string, args ...interface{}) { + if skipLogging(REQUEST) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Requestf(rlevel, fmt, args...) +} + +func Infof(fmt string, args ...interface{}) { + if skipLogging(INFO) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Infof(fmt, args...) +} + +func Warnf(fmt string, args ...interface{}) { + if skipLogging(WARN) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Warnf(fmt, args...) +} + +func Errorf(fmt string, args ...interface{}) { + if skipLogging(ERROR) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Errorf(fmt, args...) +} + +func Severef(fmt string, args ...interface{}) { + if skipLogging(SEVERE) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Severef(fmt, args...) +} + +func Fatalf(fmt string, args ...interface{}) { + if skipLogging(FATAL) { + return + } + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Fatalf(fmt, args...) +} + +func SetLevel(level Level) { + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.SetLevel(level) + curLevel = level +} + +func LogLevel() Level { + loggerMutex.RLock() + defer loggerMutex.RUnlock() + return logger.Level() +} + +func Stackf(level Level, fmt string, args ...interface{}) { + if skipLogging(level) { + return + } + buf := make([]byte, 1<<16) + n := runtime.Stack(buf, false) + s := string(buf[0:n]) + loggerMutex.Lock() + defer loggerMutex.Unlock() + logger.Logf(level, fmt, args...) 
+ logger.Logf(level, s) +} + +func init() { + logger = NewLogger(os.Stderr, INFO, TEXTFORMATTER) + SetLogger(logger) +} diff --git a/vendor/github.com/couchbase/goutils/logging/logger_golog.go b/vendor/github.com/couchbase/goutils/logging/logger_golog.go new file mode 100644 index 000000000000..eec432a51369 --- /dev/null +++ b/vendor/github.com/couchbase/goutils/logging/logger_golog.go @@ -0,0 +1,318 @@ +// Copyright (c) 2016 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package logging + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "log" + "time" +) + +type goLogger struct { + logger *log.Logger + level Level + entryFormatter formatter +} + +const ( + _LEVEL = "_level" + _MSG = "_msg" + _TIME = "_time" + _RLEVEL = "_rlevel" +) + +func NewLogger(out io.Writer, lvl Level, fmtLogging LogEntryFormatter) *goLogger { + logger := &goLogger{ + logger: log.New(out, "", 0), + level: lvl, + } + if fmtLogging == JSONFORMATTER { + logger.entryFormatter = &jsonFormatter{} + } else if fmtLogging == KVFORMATTER { + logger.entryFormatter = &keyvalueFormatter{} + } else { + logger.entryFormatter = &textFormatter{} + } + return logger +} + +func (gl *goLogger) Logp(level Level, msg string, kv ...Pair) { + if gl.logger == nil { + return + } + if level <= gl.level { + e := newLogEntry(msg, level) + copyPairs(e, kv) + gl.log(e) + } +} + +func (gl *goLogger) Debugp(msg string, kv ...Pair) { + gl.Logp(DEBUG, msg, kv...) +} + +func (gl *goLogger) Tracep(msg string, kv ...Pair) { + gl.Logp(TRACE, msg, kv...) +} + +func (gl *goLogger) Requestp(rlevel Level, msg string, kv ...Pair) { + if gl.logger == nil { + return + } + if REQUEST <= gl.level { + e := newLogEntry(msg, REQUEST) + e.Rlevel = rlevel + copyPairs(e, kv) + gl.log(e) + } +} + +func (gl *goLogger) Infop(msg string, kv ...Pair) { + gl.Logp(INFO, msg, kv...) +} + +func (gl *goLogger) Warnp(msg string, kv ...Pair) { + gl.Logp(WARN, msg, kv...) +} + +func (gl *goLogger) Errorp(msg string, kv ...Pair) { + gl.Logp(ERROR, msg, kv...) +} + +func (gl *goLogger) Severep(msg string, kv ...Pair) { + gl.Logp(SEVERE, msg, kv...) +} + +func (gl *goLogger) Fatalp(msg string, kv ...Pair) { + gl.Logp(FATAL, msg, kv...) 
+} + +func (gl *goLogger) Logm(level Level, msg string, kv Map) { + if gl.logger == nil { + return + } + if level <= gl.level { + e := newLogEntry(msg, level) + e.Data = kv + gl.log(e) + } +} + +func (gl *goLogger) Debugm(msg string, kv Map) { + gl.Logm(DEBUG, msg, kv) +} + +func (gl *goLogger) Tracem(msg string, kv Map) { + gl.Logm(TRACE, msg, kv) +} + +func (gl *goLogger) Requestm(rlevel Level, msg string, kv Map) { + if gl.logger == nil { + return + } + if REQUEST <= gl.level { + e := newLogEntry(msg, REQUEST) + e.Rlevel = rlevel + e.Data = kv + gl.log(e) + } +} + +func (gl *goLogger) Infom(msg string, kv Map) { + gl.Logm(INFO, msg, kv) +} + +func (gl *goLogger) Warnm(msg string, kv Map) { + gl.Logm(WARN, msg, kv) +} + +func (gl *goLogger) Errorm(msg string, kv Map) { + gl.Logm(ERROR, msg, kv) +} + +func (gl *goLogger) Severem(msg string, kv Map) { + gl.Logm(SEVERE, msg, kv) +} + +func (gl *goLogger) Fatalm(msg string, kv Map) { + gl.Logm(FATAL, msg, kv) +} + +func (gl *goLogger) Logf(level Level, format string, args ...interface{}) { + if gl.logger == nil { + return + } + if level <= gl.level { + e := newLogEntry(fmt.Sprintf(format, args...), level) + gl.log(e) + } +} + +func (gl *goLogger) Debugf(format string, args ...interface{}) { + gl.Logf(DEBUG, format, args...) +} + +func (gl *goLogger) Tracef(format string, args ...interface{}) { + gl.Logf(TRACE, format, args...) +} + +func (gl *goLogger) Requestf(rlevel Level, format string, args ...interface{}) { + if gl.logger == nil { + return + } + if REQUEST <= gl.level { + e := newLogEntry(fmt.Sprintf(format, args...), REQUEST) + e.Rlevel = rlevel + gl.log(e) + } +} + +func (gl *goLogger) Infof(format string, args ...interface{}) { + gl.Logf(INFO, format, args...) +} + +func (gl *goLogger) Warnf(format string, args ...interface{}) { + gl.Logf(WARN, format, args...) +} + +func (gl *goLogger) Errorf(format string, args ...interface{}) { + gl.Logf(ERROR, format, args...) +} + +func (gl *goLogger) Severef(format string, args ...interface{}) { + gl.Logf(SEVERE, format, args...) +} + +func (gl *goLogger) Fatalf(format string, args ...interface{}) { + gl.Logf(FATAL, format, args...) +} + +func (gl *goLogger) Level() Level { + return gl.level +} + +func (gl *goLogger) SetLevel(level Level) { + gl.level = level +} + +func (gl *goLogger) log(newEntry *logEntry) { + s := gl.entryFormatter.format(newEntry) + gl.logger.Print(s) +} + +type logEntry struct { + Time string + Level Level + Rlevel Level + Message string + Data Map +} + +func newLogEntry(msg string, level Level) *logEntry { + return &logEntry{ + Time: time.Now().Format("2006-01-02T15:04:05.000-07:00"), // time.RFC3339 with milliseconds + Level: level, + Rlevel: NONE, + Message: msg, + } +} + +func copyPairs(newEntry *logEntry, pairs []Pair) { + newEntry.Data = make(Map, len(pairs)) + for _, p := range pairs { + newEntry.Data[p.Name] = p.Value + } +} + +type formatter interface { + format(*logEntry) string +} + +type textFormatter struct { +} + +// ex. 
2016-02-10T09:15:25.498-08:00 [INFO] This is a message from test in text format + +func (*textFormatter) format(newEntry *logEntry) string { + b := &bytes.Buffer{} + appendValue(b, newEntry.Time) + if newEntry.Rlevel != NONE { + fmt.Fprintf(b, "[%s,%s] ", newEntry.Level.String(), newEntry.Rlevel.String()) + } else { + fmt.Fprintf(b, "[%s] ", newEntry.Level.String()) + } + appendValue(b, newEntry.Message) + for key, value := range newEntry.Data { + appendKeyValue(b, key, value) + } + b.WriteByte('\n') + s := bytes.NewBuffer(b.Bytes()) + return s.String() +} + +func appendValue(b *bytes.Buffer, value interface{}) { + if _, ok := value.(string); ok { + fmt.Fprintf(b, "%s ", value) + } else { + fmt.Fprintf(b, "%v ", value) + } +} + +type keyvalueFormatter struct { +} + +// ex. _time=2016-02-10T09:15:25.498-08:00 _level=INFO _msg=This is a message from test in key-value format + +func (*keyvalueFormatter) format(newEntry *logEntry) string { + b := &bytes.Buffer{} + appendKeyValue(b, _TIME, newEntry.Time) + appendKeyValue(b, _LEVEL, newEntry.Level.String()) + if newEntry.Rlevel != NONE { + appendKeyValue(b, _RLEVEL, newEntry.Rlevel.String()) + } + appendKeyValue(b, _MSG, newEntry.Message) + for key, value := range newEntry.Data { + appendKeyValue(b, key, value) + } + b.WriteByte('\n') + s := bytes.NewBuffer(b.Bytes()) + return s.String() +} + +func appendKeyValue(b *bytes.Buffer, key, value interface{}) { + if _, ok := value.(string); ok { + fmt.Fprintf(b, "%v=%s ", key, value) + } else { + fmt.Fprintf(b, "%v=%v ", key, value) + } +} + +type jsonFormatter struct { +} + +// ex. {"_level":"INFO","_msg":"This is a message from test in json format","_time":"2016-02-10T09:12:59.518-08:00"} + +func (*jsonFormatter) format(newEntry *logEntry) string { + if newEntry.Data == nil { + newEntry.Data = make(Map, 5) + } + newEntry.Data[_TIME] = newEntry.Time + newEntry.Data[_LEVEL] = newEntry.Level.String() + if newEntry.Rlevel != NONE { + newEntry.Data[_RLEVEL] = newEntry.Rlevel.String() + } + newEntry.Data[_MSG] = newEntry.Message + serialized, _ := json.Marshal(newEntry.Data) + s := bytes.NewBuffer(append(serialized, '\n')) + return s.String() +} diff --git a/vendor/github.com/couchbase/goutils/scramsha/scramsha.go b/vendor/github.com/couchbase/goutils/scramsha/scramsha.go new file mode 100644 index 000000000000..b234bfc8a96f --- /dev/null +++ b/vendor/github.com/couchbase/goutils/scramsha/scramsha.go @@ -0,0 +1,207 @@ +// @author Couchbase +// @copyright 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
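The goutils logging package vendored above exposes Pair-, Map-, and printf-style entry points, and its doc comments note that the Pair form avoids the per-call heap allocation the Map form incurs. A minimal usage sketch, assuming only the API shown above (the message text and field names are made up; this snippet is not part of the vendored files):

package main

import (
	"os"

	"github.com/couchbase/goutils/logging"
)

func main() {
	// Install a text-format logger at INFO; the package-level helpers
	// (Infop, Errorm, Debugf, ...) all route through it.
	logging.SetLogger(logging.NewLogger(os.Stderr, logging.INFO, logging.TEXTFORMATTER))

	// Pair-based API: avoids the heap allocation the doc comments warn
	// about for the Map form.
	logging.Infop("bucket refreshed", logging.Pair{Name: "bucket", Value: "default"})

	// Map-based API: more convenient, but allocates a map per call.
	logging.Errorm("request failed", logging.Map{"status": 500, "elapsed_ms": 42})

	// Raise the level at runtime; DEBUG entries are skipped before this
	// because skipLogging compares against the cached current level.
	logging.SetLevel(logging.DEBUG)
	logging.Debugf("retrying %d of %d", 1, 5)
}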
+ +// Package scramsha provides implementation of client side SCRAM-SHA +// according to https://tools.ietf.org/html/rfc5802 +package scramsha + +import ( + "crypto/hmac" + "crypto/rand" + "crypto/sha1" + "crypto/sha256" + "crypto/sha512" + "encoding/base64" + "fmt" + "github.com/pkg/errors" + "golang.org/x/crypto/pbkdf2" + "hash" + "strconv" + "strings" +) + +func hmacHash(message []byte, secret []byte, hashFunc func() hash.Hash) []byte { + h := hmac.New(hashFunc, secret) + h.Write(message) + return h.Sum(nil) +} + +func shaHash(message []byte, hashFunc func() hash.Hash) []byte { + h := hashFunc() + h.Write(message) + return h.Sum(nil) +} + +func generateClientNonce(size int) (string, error) { + randomBytes := make([]byte, size) + _, err := rand.Read(randomBytes) + if err != nil { + return "", errors.Wrap(err, "Unable to generate nonce") + } + return base64.StdEncoding.EncodeToString(randomBytes), nil +} + +// ScramSha provides context for SCRAM-SHA handling +type ScramSha struct { + hashSize int + hashFunc func() hash.Hash + clientNonce string + serverNonce string + salt []byte + i int + saltedPassword []byte + authMessage string +} + +var knownMethods = []string{"SCRAM-SHA512", "SCRAM-SHA256", "SCRAM-SHA1"} + +// BestMethod returns SCRAM-SHA method we consider the best out of suggested +// by server +func BestMethod(methods string) (string, error) { + for _, m := range knownMethods { + if strings.Index(methods, m) != -1 { + return m, nil + } + } + return "", errors.Errorf( + "None of the server suggested methods [%s] are supported", + methods) +} + +// NewScramSha creates context for SCRAM-SHA handling +func NewScramSha(method string) (*ScramSha, error) { + s := &ScramSha{} + + if method == knownMethods[0] { + s.hashFunc = sha512.New + s.hashSize = 64 + } else if method == knownMethods[1] { + s.hashFunc = sha256.New + s.hashSize = 32 + } else if method == knownMethods[2] { + s.hashFunc = sha1.New + s.hashSize = 20 + } else { + return nil, errors.Errorf("Unsupported method %s", method) + } + return s, nil +} + +// GetStartRequest builds start SCRAM-SHA request to be sent to server +func (s *ScramSha) GetStartRequest(user string) (string, error) { + var err error + s.clientNonce, err = generateClientNonce(24) + if err != nil { + return "", errors.Wrapf(err, "Unable to generate SCRAM-SHA "+ + "start request for user %s", user) + } + + message := fmt.Sprintf("n,,n=%s,r=%s", user, s.clientNonce) + s.authMessage = message[3:] + return message, nil +} + +// HandleStartResponse handles server response on start SCRAM-SHA request +func (s *ScramSha) HandleStartResponse(response string) error { + parts := strings.Split(response, ",") + if len(parts) != 3 { + return errors.Errorf("expected 3 fields in first SCRAM-SHA-1 "+ + "server message %s", response) + } + if !strings.HasPrefix(parts[0], "r=") || len(parts[0]) < 3 { + return errors.Errorf("Server sent an invalid nonce %s", + parts[0]) + } + if !strings.HasPrefix(parts[1], "s=") || len(parts[1]) < 3 { + return errors.Errorf("Server sent an invalid salt %s", parts[1]) + } + if !strings.HasPrefix(parts[2], "i=") || len(parts[2]) < 3 { + return errors.Errorf("Server sent an invalid iteration count %s", + parts[2]) + } + + s.serverNonce = parts[0][2:] + encodedSalt := parts[1][2:] + var err error + s.i, err = strconv.Atoi(parts[2][2:]) + if err != nil { + return errors.Errorf("Iteration count %s must be integer.", + parts[2][2:]) + } + + if s.i < 1 { + return errors.New("Iteration count should be positive") + } + + if 
!strings.HasPrefix(s.serverNonce, s.clientNonce) { + return errors.Errorf("Server nonce %s doesn't contain client"+ + " nonce %s", s.serverNonce, s.clientNonce) + } + + s.salt, err = base64.StdEncoding.DecodeString(encodedSalt) + if err != nil { + return errors.Wrapf(err, "Unable to decode salt %s", + encodedSalt) + } + + s.authMessage = s.authMessage + "," + response + return nil +} + +// GetFinalRequest builds final SCRAM-SHA request to be sent to server +func (s *ScramSha) GetFinalRequest(pass string) string { + clientFinalMessageBare := "c=biws,r=" + s.serverNonce + s.authMessage = s.authMessage + "," + clientFinalMessageBare + + s.saltedPassword = pbkdf2.Key([]byte(pass), s.salt, s.i, + s.hashSize, s.hashFunc) + + clientKey := hmacHash([]byte("Client Key"), s.saltedPassword, s.hashFunc) + storedKey := shaHash(clientKey, s.hashFunc) + clientSignature := hmacHash([]byte(s.authMessage), storedKey, s.hashFunc) + + clientProof := make([]byte, len(clientSignature)) + for i := 0; i < len(clientSignature); i++ { + clientProof[i] = clientKey[i] ^ clientSignature[i] + } + + return clientFinalMessageBare + ",p=" + + base64.StdEncoding.EncodeToString(clientProof) +} + +// HandleFinalResponse handles server's response on final SCRAM-SHA request +func (s *ScramSha) HandleFinalResponse(response string) error { + if strings.Contains(response, ",") || + !strings.HasPrefix(response, "v=") { + return errors.Errorf("Server sent an invalid final message %s", + response) + } + + decodedMessage, err := base64.StdEncoding.DecodeString(response[2:]) + if err != nil { + return errors.Wrapf(err, "Unable to decode server message %s", + response[2:]) + } + serverKey := hmacHash([]byte("Server Key"), s.saltedPassword, + s.hashFunc) + serverSignature := hmacHash([]byte(s.authMessage), serverKey, + s.hashFunc) + if string(decodedMessage) != string(serverSignature) { + return errors.Errorf("Server proof %s doesn't match "+ + "the expected: %s", + string(decodedMessage), string(serverSignature)) + } + return nil +} diff --git a/vendor/github.com/couchbase/goutils/scramsha/scramsha_http.go b/vendor/github.com/couchbase/goutils/scramsha/scramsha_http.go new file mode 100644 index 000000000000..19f32b31347d --- /dev/null +++ b/vendor/github.com/couchbase/goutils/scramsha/scramsha_http.go @@ -0,0 +1,252 @@ +// @author Couchbase +// @copyright 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
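The ScramSha type above is a small state machine: GetStartRequest produces the client-first message, HandleStartResponse consumes the server-first message, GetFinalRequest produces the client proof, and HandleFinalResponse verifies the server signature. A minimal sketch of a caller, with send standing in as a hypothetical transport (the DoScramSha helper below drives the same exchange over HTTP); the function and parameter names here are illustrative, not part of the vendored code:

package clientauth

import (
	"github.com/couchbase/goutils/scramsha"
)

// authenticate walks the client side of the RFC 5802 exchange. send is a
// hypothetical stand-in for whatever transport carries the SASL messages.
func authenticate(user, pass, serverMethods string, send func(string) (string, error)) error {
	// Pick the strongest mechanism out of those the server suggested.
	method, err := scramsha.BestMethod(serverMethods)
	if err != nil {
		return err
	}
	s, err := scramsha.NewScramSha(method)
	if err != nil {
		return err
	}

	// client-first message
	start, err := s.GetStartRequest(user)
	if err != nil {
		return err
	}
	// server-first message: nonce, salt, iteration count
	resp, err := send(start)
	if err != nil {
		return err
	}
	if err := s.HandleStartResponse(resp); err != nil {
		return err
	}
	// client-final message carries the proof; server-final carries the
	// signature that HandleFinalResponse checks.
	resp, err = send(s.GetFinalRequest(pass))
	if err != nil {
		return err
	}
	return s.HandleFinalResponse(resp)
}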
+ +// Package scramsha provides implementation of client side SCRAM-SHA +// via Http according to https://tools.ietf.org/html/rfc7804 +package scramsha + +import ( + "encoding/base64" + "github.com/pkg/errors" + "io" + "io/ioutil" + "net/http" + "strings" +) + +// consts used to parse scramsha response from target +const ( + WWWAuthenticate = "WWW-Authenticate" + AuthenticationInfo = "Authentication-Info" + Authorization = "Authorization" + DataPrefix = "data=" + SidPrefix = "sid=" +) + +// Request provides implementation of http request that can be retried +type Request struct { + body io.ReadSeeker + + // Embed an HTTP request directly. This makes a *Request act exactly + // like an *http.Request so that all meta methods are supported. + *http.Request +} + +type lenReader interface { + Len() int +} + +// NewRequest creates http request that can be retried +func NewRequest(method, url string, body io.ReadSeeker) (*Request, error) { + // Wrap the body in a noop ReadCloser if non-nil. This prevents the + // reader from being closed by the HTTP client. + var rcBody io.ReadCloser + if body != nil { + rcBody = ioutil.NopCloser(body) + } + + // Make the request with the noop-closer for the body. + httpReq, err := http.NewRequest(method, url, rcBody) + if err != nil { + return nil, err + } + + // Check if we can set the Content-Length automatically. + if lr, ok := body.(lenReader); ok { + httpReq.ContentLength = int64(lr.Len()) + } + + return &Request{body, httpReq}, nil +} + +func encode(str string) string { + return base64.StdEncoding.EncodeToString([]byte(str)) +} + +func decode(str string) (string, error) { + bytes, err := base64.StdEncoding.DecodeString(str) + if err != nil { + return "", errors.Errorf("Cannot base64 decode %s", + str) + } + return string(bytes), err +} + +func trimPrefix(s, prefix string) (string, error) { + l := len(s) + trimmed := strings.TrimPrefix(s, prefix) + if l == len(trimmed) { + return trimmed, errors.Errorf("Prefix %s not found in %s", + prefix, s) + } + return trimmed, nil +} + +func drainBody(resp *http.Response) { + defer resp.Body.Close() + io.Copy(ioutil.Discard, resp.Body) +} + +// DoScramSha performs SCRAM-SHA handshake via Http +func DoScramSha(req *Request, + username string, + password string, + client *http.Client) (*http.Response, error) { + + method := "SCRAM-SHA-512" + s, err := NewScramSha("SCRAM-SHA512") + if err != nil { + return nil, errors.Wrap(err, + "Unable to initialize SCRAM-SHA handler") + } + + message, err := s.GetStartRequest(username) + if err != nil { + return nil, err + } + + encodedMessage := method + " " + DataPrefix + encode(message) + + req.Header.Set(Authorization, encodedMessage) + + res, err := client.Do(req.Request) + if err != nil { + return nil, errors.Wrap(err, "Problem sending SCRAM-SHA start"+ + "request") + } + + if res.StatusCode != http.StatusUnauthorized { + return res, nil + } + + authHeader := res.Header.Get(WWWAuthenticate) + if authHeader == "" { + drainBody(res) + return nil, errors.Errorf("Header %s is not populated in "+ + "SCRAM-SHA start response", WWWAuthenticate) + } + + authHeader, err = trimPrefix(authHeader, method+" ") + if err != nil { + if strings.HasPrefix(authHeader, "Basic ") { + // user not found + return res, nil + } + drainBody(res) + return nil, errors.Wrapf(err, "Error while parsing SCRAM-SHA "+ + "start response %s", authHeader) + } + + drainBody(res) + + sid, response, err := parseSidAndData(authHeader) + if err != nil { + return nil, errors.Wrapf(err, "Error while parsing SCRAM-SHA "+ + 
"start response %s", authHeader) + } + + err = s.HandleStartResponse(response) + if err != nil { + return nil, errors.Wrapf(err, "Error parsing SCRAM-SHA start "+ + "response %s", response) + } + + message = s.GetFinalRequest(password) + encodedMessage = method + " " + SidPrefix + sid + "," + DataPrefix + + encode(message) + + req.Header.Set(Authorization, encodedMessage) + + // rewind request body so it can be resent again + if req.body != nil { + if _, err = req.body.Seek(0, 0); err != nil { + return nil, errors.Errorf("Failed to seek body: %v", + err) + } + } + + res, err = client.Do(req.Request) + if err != nil { + return nil, errors.Wrap(err, "Problem sending SCRAM-SHA final"+ + "request") + } + + if res.StatusCode == http.StatusUnauthorized { + // TODO retrieve and return error + return res, nil + } + + if res.StatusCode >= http.StatusInternalServerError { + // in this case we cannot expect server to set headers properly + return res, nil + } + + authHeader = res.Header.Get(AuthenticationInfo) + if authHeader == "" { + drainBody(res) + return nil, errors.Errorf("Header %s is not populated in "+ + "SCRAM-SHA final response", AuthenticationInfo) + } + + finalSid, response, err := parseSidAndData(authHeader) + if err != nil { + drainBody(res) + return nil, errors.Wrapf(err, "Error while parsing SCRAM-SHA "+ + "final response %s", authHeader) + } + + if finalSid != sid { + drainBody(res) + return nil, errors.Errorf("Sid %s returned by server "+ + "doesn't match the original sid %s", finalSid, sid) + } + + err = s.HandleFinalResponse(response) + if err != nil { + drainBody(res) + return nil, errors.Wrapf(err, + "Error handling SCRAM-SHA final server response %s", + response) + } + return res, nil +} + +func parseSidAndData(authHeader string) (string, string, error) { + sidIndex := strings.Index(authHeader, SidPrefix) + if sidIndex < 0 { + return "", "", errors.Errorf("Cannot find %s in %s", + SidPrefix, authHeader) + } + + sidEndIndex := strings.Index(authHeader, ",") + if sidEndIndex < 0 { + return "", "", errors.Errorf("Cannot find ',' in %s", + authHeader) + } + + sid := authHeader[sidIndex+len(SidPrefix) : sidEndIndex] + + dataIndex := strings.Index(authHeader, DataPrefix) + if dataIndex < 0 { + return "", "", errors.Errorf("Cannot find %s in %s", + DataPrefix, authHeader) + } + + data, err := decode(authHeader[dataIndex+len(DataPrefix):]) + if err != nil { + return "", "", err + } + return sid, data, nil +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/LICENSE b/vendor/github.com/couchbaselabs/go-couchbase/LICENSE new file mode 100644 index 000000000000..0b23ef358e11 --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2013 Couchbase, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/couchbaselabs/go-couchbase/audit.go b/vendor/github.com/couchbaselabs/go-couchbase/audit.go new file mode 100644 index 000000000000..3db7d9f9ff2b --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/audit.go @@ -0,0 +1,32 @@ +package couchbase + +import () + +// Sample data: +// {"disabled":["12333", "22244"],"uid":"132492431","auditdEnabled":true, +// "disabledUsers":[{"name":"bill","domain":"local"},{"name":"bob","domain":"local"}], +// "logPath":"/Users/johanlarson/Library/Application Support/Couchbase/var/lib/couchbase/logs", +// "rotateInterval":86400,"rotateSize":20971520} +type AuditSpec struct { + Disabled []uint32 `json:"disabled"` + Uid string `json:"uid"` + AuditdEnabled bool `json:"auditdEnabled` + DisabledUsers []AuditUser `json:"disabledUsers"` + LogPath string `json:"logPath"` + RotateInterval int64 `json:"rotateInterval"` + RotateSize int64 `json:"rotateSize"` +} + +type AuditUser struct { + Name string `json:"name"` + Domain string `json:"domain"` +} + +func (c *Client) GetAuditSpec() (*AuditSpec, error) { + ret := &AuditSpec{} + err := c.parseURLResponse("/settings/audit", ret) + if err != nil { + return nil, err + } + return ret, nil +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/client.go b/vendor/github.com/couchbaselabs/go-couchbase/client.go new file mode 100644 index 000000000000..43c338296018 --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/client.go @@ -0,0 +1,1385 @@ +/* +Package couchbase provides a smart client for go. + +Usage: + + client, err := couchbase.Connect("http://myserver:8091/") + handleError(err) + pool, err := client.GetPool("default") + handleError(err) + bucket, err := pool.GetBucket("MyAwesomeBucket") + handleError(err) + ... + +or a shortcut for the bucket directly + + bucket, err := couchbase.GetBucket("http://myserver:8091/", "default", "default") + +in any case, you can specify authentication credentials using +standard URL userinfo syntax: + + b, err := couchbase.GetBucket("http://bucketname:bucketpass@myserver:8091/", + "default", "bucket") +*/ +package couchbase + +import ( + "encoding/binary" + "encoding/json" + "errors" + "fmt" + "io" + "runtime" + "strconv" + "strings" + "sync" + "time" + "unsafe" + + "github.com/couchbase/gomemcached" + "github.com/couchbase/gomemcached/client" // package name is 'memcached' + "github.com/couchbase/goutils/logging" +) + +// Mutation Token +type MutationToken struct { + VBid uint16 // vbucket id + Guard uint64 // vbuuid + Value uint64 // sequence number +} + +// Maximum number of times to retry a chunk of a bulk get on error. +var MaxBulkRetries = 5000 +var backOffDuration time.Duration = 100 * time.Millisecond +var MaxBackOffRetries = 25 // exponentail backOff result in over 30sec (25*13*0.1s) + +// If this is set to a nonzero duration, Do() and ViewCustom() will log a warning if the call +// takes longer than that. 
+var SlowServerCallWarningThreshold time.Duration + +func slowLog(startTime time.Time, format string, args ...interface{}) { + if elapsed := time.Now().Sub(startTime); elapsed > SlowServerCallWarningThreshold { + pc, _, _, _ := runtime.Caller(2) + caller := runtime.FuncForPC(pc).Name() + logging.Infof("go-couchbase: "+format+" in "+caller+" took "+elapsed.String(), args...) + } +} + +// Return true if error is KEY_ENOENT. Required by cbq-engine +func IsKeyEExistsError(err error) bool { + + res, ok := err.(*gomemcached.MCResponse) + if ok && res.Status == gomemcached.KEY_EEXISTS { + return true + } + + return false +} + +// Return true if error is KEY_ENOENT. Required by cbq-engine +func IsKeyNoEntError(err error) bool { + + res, ok := err.(*gomemcached.MCResponse) + if ok && res.Status == gomemcached.KEY_ENOENT { + return true + } + + return false +} + +// Return true if error suggests a bucket refresh is required. Required by cbq-engine +func IsRefreshRequired(err error) bool { + + res, ok := err.(*gomemcached.MCResponse) + if ok && (res.Status == gomemcached.NO_BUCKET || res.Status == gomemcached.NOT_MY_VBUCKET) { + return true + } + + return false +} + +// ClientOpCallback is called for each invocation of Do. +var ClientOpCallback func(opname, k string, start time.Time, err error) + +// Do executes a function on a memcached connection to the node owning key "k" +// +// Note that this automatically handles transient errors by replaying +// your function on a "not-my-vbucket" error, so don't assume +// your command will only be executed only once. +func (b *Bucket) Do(k string, f func(mc *memcached.Client, vb uint16) error) (err error) { + return b.Do2(k, f, true) +} + +func (b *Bucket) Do2(k string, f func(mc *memcached.Client, vb uint16) error, deadline bool) (err error) { + if SlowServerCallWarningThreshold > 0 { + defer slowLog(time.Now(), "call to Do(%q)", k) + } + + vb := b.VBHash(k) + maxTries := len(b.Nodes()) * 2 + for i := 0; i < maxTries; i++ { + conn, pool, err := b.getConnectionToVBucket(vb) + if err != nil { + if isConnError(err) && backOff(i, maxTries, backOffDuration, true) { + b.Refresh() + continue + } + return err + } + + if deadline && DefaultTimeout > 0 { + conn.SetDeadline(getDeadline(noDeadline, DefaultTimeout)) + err = f(conn, uint16(vb)) + conn.SetDeadline(noDeadline) + } else { + err = f(conn, uint16(vb)) + } + + var retry bool + discard := isOutOfBoundsError(err) + + // MB-30967 / MB-31001 implement back off for transient errors + if resp, ok := err.(*gomemcached.MCResponse); ok { + switch resp.Status { + case gomemcached.NOT_MY_VBUCKET: + b.Refresh() + // MB-28842: in case of NMVB, check if the node is still part of the map + // and ditch the connection if it isn't. 
+ discard = b.checkVBmap(pool.Node()) + retry = true + case gomemcached.NOT_SUPPORTED: + discard = true + retry = true + case gomemcached.ENOMEM: + fallthrough + case gomemcached.TMPFAIL: + retry = backOff(i, maxTries, backOffDuration, true) + default: + retry = false + } + } else if err != nil && isConnError(err) && backOff(i, maxTries, backOffDuration, true) { + retry = true + } + + if discard { + pool.Discard(conn) + } else { + pool.Return(conn) + } + + if !retry { + return err + } + } + + return fmt.Errorf("unable to complete action after %v attemps", maxTries) +} + +type GatheredStats struct { + Server string + Stats map[string]string + Err error +} + +func getStatsParallel(sn string, b *Bucket, offset int, which string, + ch chan<- GatheredStats) { + pool := b.getConnPool(offset) + var gatheredStats GatheredStats + + conn, err := pool.Get() + defer func() { + pool.Return(conn) + ch <- gatheredStats + }() + + if err != nil { + gatheredStats = GatheredStats{Server: sn, Err: err} + } else { + sm, err := conn.StatsMap(which) + gatheredStats = GatheredStats{Server: sn, Stats: sm, Err: err} + } +} + +// GetStats gets a set of stats from all servers. +// +// Returns a map of server ID -> map of stat key to map value. +func (b *Bucket) GetStats(which string) map[string]map[string]string { + rv := map[string]map[string]string{} + for server, gs := range b.GatherStats(which) { + if len(gs.Stats) > 0 { + rv[server] = gs.Stats + } + } + return rv +} + +// GatherStats returns a map of server ID -> GatheredStats from all servers. +func (b *Bucket) GatherStats(which string) map[string]GatheredStats { + vsm := b.VBServerMap() + if vsm.ServerList == nil { + return nil + } + + // Go grab all the things at once. + ch := make(chan GatheredStats, len(vsm.ServerList)) + for i, sn := range vsm.ServerList { + go getStatsParallel(sn, b, i, which, ch) + } + + // Gather the results + rv := map[string]GatheredStats{} + for range vsm.ServerList { + gs := <-ch + rv[gs.Server] = gs + } + return rv +} + +// Get bucket count through the bucket stats +func (b *Bucket) GetCount(refresh bool) (count int64, err error) { + if refresh { + b.Refresh() + } + + var cnt int64 + for _, gs := range b.GatherStats("") { + if len(gs.Stats) > 0 { + cnt, err = strconv.ParseInt(gs.Stats["curr_items"], 10, 64) + if err != nil { + return 0, err + } + count += cnt + } + } + + return count, nil +} + +func isAuthError(err error) bool { + estr := err.Error() + return strings.Contains(estr, "Auth failure") +} + +func IsReadTimeOutError(err error) bool { + estr := err.Error() + return strings.Contains(estr, "read tcp") || + strings.Contains(estr, "i/o timeout") +} + +func isTimeoutError(err error) bool { + estr := err.Error() + return strings.Contains(estr, "i/o timeout") || + strings.Contains(estr, "connection timed out") || + strings.Contains(estr, "no route to host") +} + +// Errors that are not considered fatal for our fetch loop +func isConnError(err error) bool { + if err == io.EOF { + return true + } + estr := err.Error() + return strings.Contains(estr, "broken pipe") || + strings.Contains(estr, "connection reset") || + strings.Contains(estr, "connection refused") || + strings.Contains(estr, "connection pool is closed") +} + +func isOutOfBoundsError(err error) bool { + return err != nil && strings.Contains(err.Error(), "Out of Bounds error") + +} + +func getDeadline(reqDeadline time.Time, duration time.Duration) time.Time { + if reqDeadline.IsZero() && duration > 0 { + return time.Now().Add(duration) + } + return reqDeadline +} + 
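Do and Do2 above hide the vbucket routing, deadlines, and retry/backoff handling from callers, which only supply a per-connection callback; as the Do doc comment warns, that callback may be replayed after a not-my-vbucket response, so it should be idempotent. A minimal sketch of such a caller, reusing the placeholder endpoint from the package doc (the key name is made up; this is illustrative, not part of the vendored file):

package main

import (
	"log"

	memcached "github.com/couchbase/gomemcached/client"
	"github.com/couchbaselabs/go-couchbase"
)

func main() {
	// Placeholder endpoint, as in the package doc.
	bucket, err := couchbase.GetBucket("http://myserver:8091/", "default", "default")
	if err != nil {
		log.Fatal(err)
	}

	// Do routes the callback to the node owning the key's vbucket and may
	// replay it on NOT_MY_VBUCKET or transient errors, so keep it idempotent.
	var body []byte
	err = bucket.Do("greeting", func(mc *memcached.Client, vb uint16) error {
		res, err := mc.Get(vb, "greeting")
		if err != nil {
			return err
		}
		body = res.Body
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("%s", body)
}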
+func backOff(attempt, maxAttempts int, duration time.Duration, exponential bool) bool { + if attempt < maxAttempts { + // 0th attempt return immediately + if attempt > 0 { + if exponential { + duration = time.Duration(attempt) * duration + } + time.Sleep(duration) + } + return true + } + + return false +} + +func (b *Bucket) doBulkGet(vb uint16, keys []string, reqDeadline time.Time, + ch chan<- map[string]*gomemcached.MCResponse, ech chan<- error, subPaths []string, + eStatus *errorStatus) { + if SlowServerCallWarningThreshold > 0 { + defer slowLog(time.Now(), "call to doBulkGet(%d, %d keys)", vb, len(keys)) + } + + rv := _STRING_MCRESPONSE_POOL.Get() + attempts := 0 + backOffAttempts := 0 + done := false + bname := b.Name + for ; attempts < MaxBulkRetries && !done && !eStatus.errStatus; attempts++ { + + if len(b.VBServerMap().VBucketMap) < int(vb) { + //fatal + err := fmt.Errorf("vbmap smaller than requested for %v", bname) + logging.Errorf("go-couchbase: %v vb %d vbmap len %d", err.Error(), vb, len(b.VBServerMap().VBucketMap)) + ech <- err + return + } + + masterID := b.VBServerMap().VBucketMap[vb][0] + + if masterID < 0 { + // fatal + err := fmt.Errorf("No master node available for %v vb %d", bname, vb) + logging.Errorf("%v", err.Error()) + ech <- err + return + } + + // This stack frame exists to ensure we can clean up + // connection at a reasonable time. + err := func() error { + pool := b.getConnPool(masterID) + conn, err := pool.Get() + if err != nil { + if isAuthError(err) || isTimeoutError(err) { + logging.Errorf("Fatal Error %v : %v", bname, err) + ech <- err + return err + } else if isConnError(err) { + if !backOff(backOffAttempts, MaxBackOffRetries, backOffDuration, true) { + logging.Errorf("Connection Error %v : %v", bname, err) + ech <- err + return err + } + b.Refresh() + backOffAttempts++ + } + logging.Infof("Pool Get returned %v: %v", bname, err) + // retry + return nil + } + + conn.SetDeadline(getDeadline(reqDeadline, DefaultTimeout)) + err = conn.GetBulk(vb, keys, rv, subPaths) + conn.SetDeadline(noDeadline) + + discard := false + defer func() { + if discard { + pool.Discard(conn) + } else { + pool.Return(conn) + } + }() + + switch err.(type) { + case *gomemcached.MCResponse: + notSMaxTries := len(b.Nodes()) * 2 + st := err.(*gomemcached.MCResponse).Status + if st == gomemcached.NOT_MY_VBUCKET || (st == gomemcached.NOT_SUPPORTED && attempts < notSMaxTries) { + b.Refresh() + discard = b.checkVBmap(pool.Node()) + return nil // retry + } else if st == gomemcached.EBUSY || st == gomemcached.LOCKED { + if (attempts % (MaxBulkRetries / 100)) == 0 { + logging.Infof("Retrying Memcached error (%v) FOR %v(vbid:%d, keys:%v)", + err.Error(), bname, vb, keys) + } + return nil // retry + } else if (st == gomemcached.ENOMEM || st == gomemcached.TMPFAIL) && backOff(backOffAttempts, MaxBackOffRetries, backOffDuration, true) { + // MB-30967 / MB-31001 use backoff for TMPFAIL too + backOffAttempts++ + logging.Infof("Retrying Memcached error (%v) FOR %v(vbid:%d, keys:%v)", + err.Error(), bname, vb, keys) + return nil // retry + } + ech <- err + return err + case error: + if isOutOfBoundsError(err) { + // We got an out of bound error, retry the operation + discard = true + return nil + } else if isConnError(err) && backOff(backOffAttempts, MaxBackOffRetries, backOffDuration, true) { + backOffAttempts++ + logging.Errorf("Connection Error: %s. 
Refreshing bucket %v (vbid:%v,keys:%v)", + err.Error(), bname, vb, keys) + discard = true + b.Refresh() + return nil // retry + } + ech <- err + ch <- rv + return err + } + + done = true + return nil + }() + + if err != nil { + return + } + } + + if attempts >= MaxBulkRetries { + err := fmt.Errorf("bulkget exceeded MaxBulkRetries for %v(vbid:%d,keys:%v)", bname, vb, keys) + logging.Errorf("%v", err.Error()) + ech <- err + } + + ch <- rv +} + +type errorStatus struct { + errStatus bool +} + +type vbBulkGet struct { + b *Bucket + ch chan<- map[string]*gomemcached.MCResponse + ech chan<- error + k uint16 + keys []string + reqDeadline time.Time + wg *sync.WaitGroup + subPaths []string + groupError *errorStatus +} + +const _NUM_CHANNELS = 5 + +var _NUM_CHANNEL_WORKERS = (runtime.NumCPU() + 1) / 2 +var DefaultDialTimeout = time.Duration(0) +var DefaultTimeout = time.Duration(0) +var noDeadline = time.Time{} + +// Buffer 4k requests per worker +var _VB_BULK_GET_CHANNELS []chan *vbBulkGet + +func InitBulkGet() { + + DefaultDialTimeout = 20 * time.Second + DefaultTimeout = 120 * time.Second + + memcached.SetDefaultDialTimeout(DefaultDialTimeout) + + _VB_BULK_GET_CHANNELS = make([]chan *vbBulkGet, _NUM_CHANNELS) + + for i := 0; i < _NUM_CHANNELS; i++ { + channel := make(chan *vbBulkGet, 16*1024*_NUM_CHANNEL_WORKERS) + _VB_BULK_GET_CHANNELS[i] = channel + + for j := 0; j < _NUM_CHANNEL_WORKERS; j++ { + go vbBulkGetWorker(channel) + } + } +} + +func vbBulkGetWorker(ch chan *vbBulkGet) { + defer func() { + // Workers cannot panic and die + recover() + go vbBulkGetWorker(ch) + }() + + for vbg := range ch { + vbDoBulkGet(vbg) + } +} + +func vbDoBulkGet(vbg *vbBulkGet) { + defer vbg.wg.Done() + defer func() { + // Workers cannot panic and die + recover() + }() + vbg.b.doBulkGet(vbg.k, vbg.keys, vbg.reqDeadline, vbg.ch, vbg.ech, vbg.subPaths, vbg.groupError) +} + +var _ERR_CHAN_FULL = fmt.Errorf("Data request queue full, aborting query.") + +func (b *Bucket) processBulkGet(kdm map[uint16][]string, reqDeadline time.Time, + ch chan<- map[string]*gomemcached.MCResponse, ech chan<- error, subPaths []string, + eStatus *errorStatus) { + + defer close(ch) + defer close(ech) + + wg := &sync.WaitGroup{} + + for k, keys := range kdm { + + // GetBulk() group has error donot Queue items for this group + if eStatus.errStatus { + break + } + + vbg := &vbBulkGet{ + b: b, + ch: ch, + ech: ech, + k: k, + keys: keys, + reqDeadline: reqDeadline, + wg: wg, + subPaths: subPaths, + groupError: eStatus, + } + + wg.Add(1) + + // Random int + // Right shift to avoid 8-byte alignment, and take low bits + c := (uintptr(unsafe.Pointer(vbg)) >> 4) % _NUM_CHANNELS + + select { + case _VB_BULK_GET_CHANNELS[c] <- vbg: + // No-op + default: + // Buffer full, abandon the bulk get + ech <- _ERR_CHAN_FULL + wg.Add(-1) + } + } + + // Wait for my vb bulk gets + wg.Wait() +} + +type multiError []error + +func (m multiError) Error() string { + if len(m) == 0 { + panic("Error of none") + } + + return fmt.Sprintf("{%v errors, starting with %v}", len(m), m[0].Error()) +} + +// Convert a stream of errors from ech into a multiError (or nil) and +// send down eout. +// +// At least one send is guaranteed on eout, but two is possible, so +// buffer the out channel appropriately. 
+func errorCollector(ech <-chan error, eout chan<- error, eStatus *errorStatus) { + defer func() { eout <- nil }() + var errs multiError + for e := range ech { + if !eStatus.errStatus && !IsKeyNoEntError(e) { + eStatus.errStatus = true + } + + errs = append(errs, e) + } + + if len(errs) > 0 { + eout <- errs + } +} + +// Fetches multiple keys concurrently, with []byte values +// +// This is a wrapper around GetBulk which converts all values returned +// by GetBulk from raw memcached responses into []byte slices. +// Returns one document for duplicate keys +func (b *Bucket) GetBulkRaw(keys []string) (map[string][]byte, error) { + + resp, eout := b.getBulk(keys, noDeadline, nil) + + rv := make(map[string][]byte, len(keys)) + for k, av := range resp { + rv[k] = av.Body + } + + b.ReleaseGetBulkPools(resp) + return rv, eout + +} + +// GetBulk fetches multiple keys concurrently. +// +// Unlike more convenient GETs, the entire response is returned in the +// map array for each key. Keys that were not found will not be included in +// the map. + +func (b *Bucket) GetBulk(keys []string, reqDeadline time.Time, subPaths []string) (map[string]*gomemcached.MCResponse, error) { + return b.getBulk(keys, reqDeadline, subPaths) +} + +func (b *Bucket) ReleaseGetBulkPools(rv map[string]*gomemcached.MCResponse) { + _STRING_MCRESPONSE_POOL.Put(rv) +} + +func (b *Bucket) getBulk(keys []string, reqDeadline time.Time, subPaths []string) (map[string]*gomemcached.MCResponse, error) { + kdm := _VB_STRING_POOL.Get() + defer _VB_STRING_POOL.Put(kdm) + for _, k := range keys { + if k != "" { + vb := uint16(b.VBHash(k)) + a, ok1 := kdm[vb] + if !ok1 { + a = _STRING_POOL.Get() + } + kdm[vb] = append(a, k) + } + } + + eout := make(chan error, 2) + groupErrorStatus := &errorStatus{} + + // processBulkGet will own both of these channels and + // guarantee they're closed before it returns. + ch := make(chan map[string]*gomemcached.MCResponse) + ech := make(chan error) + + go errorCollector(ech, eout, groupErrorStatus) + go b.processBulkGet(kdm, reqDeadline, ch, ech, subPaths, groupErrorStatus) + + var rv map[string]*gomemcached.MCResponse + + for m := range ch { + if rv == nil { + rv = m + continue + } + + for k, v := range m { + rv[k] = v + } + _STRING_MCRESPONSE_POOL.Put(m) + } + + return rv, <-eout +} + +// WriteOptions is the set of option flags availble for the Write +// method. They are ORed together to specify the desired request. +type WriteOptions int + +const ( + // Raw specifies that the value is raw []byte or nil; don't + // JSON-encode it. + Raw = WriteOptions(1 << iota) + // AddOnly indicates an item should only be written if it + // doesn't exist, otherwise ErrKeyExists is returned. + AddOnly + // Persist causes the operation to block until the server + // confirms the item is persisted. + Persist + // Indexable causes the operation to block until it's availble via the index. + Indexable + // Append indicates the given value should be appended to the + // existing value for the given key. 
+ Append +) + +var optNames = []struct { + opt WriteOptions + name string +}{ + {Raw, "raw"}, + {AddOnly, "addonly"}, {Persist, "persist"}, + {Indexable, "indexable"}, {Append, "append"}, +} + +// String representation of WriteOptions +func (w WriteOptions) String() string { + f := []string{} + for _, on := range optNames { + if w&on.opt != 0 { + f = append(f, on.name) + w &= ^on.opt + } + } + if len(f) == 0 || w != 0 { + f = append(f, fmt.Sprintf("0x%x", int(w))) + } + return strings.Join(f, "|") +} + +// Error returned from Write with AddOnly flag, when key already exists in the bucket. +var ErrKeyExists = errors.New("key exists") + +// General-purpose value setter. +// +// The Set, Add and Delete methods are just wrappers around this. The +// interpretation of `v` depends on whether the `Raw` option is +// given. If it is, v must be a byte array or nil. (A nil value causes +// a delete.) If `Raw` is not given, `v` will be marshaled as JSON +// before being written. It must be JSON-marshalable and it must not +// be nil. +func (b *Bucket) Write(k string, flags, exp int, v interface{}, + opt WriteOptions) (err error) { + + if ClientOpCallback != nil { + defer func(t time.Time) { + ClientOpCallback(fmt.Sprintf("Write(%v)", opt), k, t, err) + }(time.Now()) + } + + var data []byte + if opt&Raw == 0 { + data, err = json.Marshal(v) + if err != nil { + return err + } + } else if v != nil { + data = v.([]byte) + } + + var res *gomemcached.MCResponse + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + if opt&AddOnly != 0 { + res, err = memcached.UnwrapMemcachedError( + mc.Add(vb, k, flags, exp, data)) + if err == nil && res.Status != gomemcached.SUCCESS { + if res.Status == gomemcached.KEY_EEXISTS { + err = ErrKeyExists + } else { + err = res + } + } + } else if opt&Append != 0 { + res, err = mc.Append(vb, k, data) + } else if data == nil { + res, err = mc.Del(vb, k) + } else { + res, err = mc.Set(vb, k, flags, exp, data) + } + + return err + }) + + if err == nil && (opt&(Persist|Indexable) != 0) { + err = b.WaitForPersistence(k, res.Cas, data == nil) + } + + return err +} + +func (b *Bucket) WriteWithMT(k string, flags, exp int, v interface{}, + opt WriteOptions) (mt *MutationToken, err error) { + + if ClientOpCallback != nil { + defer func(t time.Time) { + ClientOpCallback(fmt.Sprintf("WriteWithMT(%v)", opt), k, t, err) + }(time.Now()) + } + + var data []byte + if opt&Raw == 0 { + data, err = json.Marshal(v) + if err != nil { + return nil, err + } + } else if v != nil { + data = v.([]byte) + } + + var res *gomemcached.MCResponse + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + if opt&AddOnly != 0 { + res, err = memcached.UnwrapMemcachedError( + mc.Add(vb, k, flags, exp, data)) + if err == nil && res.Status != gomemcached.SUCCESS { + if res.Status == gomemcached.KEY_EEXISTS { + err = ErrKeyExists + } else { + err = res + } + } + } else if opt&Append != 0 { + res, err = mc.Append(vb, k, data) + } else if data == nil { + res, err = mc.Del(vb, k) + } else { + res, err = mc.Set(vb, k, flags, exp, data) + } + + if len(res.Extras) >= 16 { + vbuuid := uint64(binary.BigEndian.Uint64(res.Extras[0:8])) + seqNo := uint64(binary.BigEndian.Uint64(res.Extras[8:16])) + mt = &MutationToken{VBid: vb, Guard: vbuuid, Value: seqNo} + } + + return err + }) + + if err == nil && (opt&(Persist|Indexable) != 0) { + err = b.WaitForPersistence(k, res.Cas, data == nil) + } + + return mt, err +} + +// Set a value in this bucket with Cas and return the new Cas value +func (b *Bucket) Cas(k string, 
exp int, cas uint64, v interface{}) (uint64, error) { + return b.WriteCas(k, 0, exp, cas, v, 0) +} + +// Set a value in this bucket with Cas without json encoding it +func (b *Bucket) CasRaw(k string, exp int, cas uint64, v interface{}) (uint64, error) { + return b.WriteCas(k, 0, exp, cas, v, Raw) +} + +func (b *Bucket) WriteCas(k string, flags, exp int, cas uint64, v interface{}, + opt WriteOptions) (newCas uint64, err error) { + + if ClientOpCallback != nil { + defer func(t time.Time) { + ClientOpCallback(fmt.Sprintf("Write(%v)", opt), k, t, err) + }(time.Now()) + } + + var data []byte + if opt&Raw == 0 { + data, err = json.Marshal(v) + if err != nil { + return 0, err + } + } else if v != nil { + data = v.([]byte) + } + + var res *gomemcached.MCResponse + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + res, err = mc.SetCas(vb, k, flags, exp, cas, data) + return err + }) + + if err == nil && (opt&(Persist|Indexable) != 0) { + err = b.WaitForPersistence(k, res.Cas, data == nil) + } + + return res.Cas, err +} + +// Extended CAS operation. These functions will return the mutation token, i.e vbuuid & guard +func (b *Bucket) CasWithMeta(k string, flags int, exp int, cas uint64, v interface{}) (uint64, *MutationToken, error) { + return b.WriteCasWithMT(k, flags, exp, cas, v, 0) +} + +func (b *Bucket) CasWithMetaRaw(k string, flags int, exp int, cas uint64, v interface{}) (uint64, *MutationToken, error) { + return b.WriteCasWithMT(k, flags, exp, cas, v, Raw) +} + +func (b *Bucket) WriteCasWithMT(k string, flags, exp int, cas uint64, v interface{}, + opt WriteOptions) (newCas uint64, mt *MutationToken, err error) { + + if ClientOpCallback != nil { + defer func(t time.Time) { + ClientOpCallback(fmt.Sprintf("Write(%v)", opt), k, t, err) + }(time.Now()) + } + + var data []byte + if opt&Raw == 0 { + data, err = json.Marshal(v) + if err != nil { + return 0, nil, err + } + } else if v != nil { + data = v.([]byte) + } + + var res *gomemcached.MCResponse + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + res, err = mc.SetCas(vb, k, flags, exp, cas, data) + return err + }) + + if err != nil { + return 0, nil, err + } + + // check for extras + if len(res.Extras) >= 16 { + vbuuid := uint64(binary.BigEndian.Uint64(res.Extras[0:8])) + seqNo := uint64(binary.BigEndian.Uint64(res.Extras[8:16])) + vb := b.VBHash(k) + mt = &MutationToken{VBid: uint16(vb), Guard: vbuuid, Value: seqNo} + } + + if err == nil && (opt&(Persist|Indexable) != 0) { + err = b.WaitForPersistence(k, res.Cas, data == nil) + } + + return res.Cas, mt, err +} + +// Set a value in this bucket. +// The value will be serialized into a JSON document. +func (b *Bucket) Set(k string, exp int, v interface{}) error { + return b.Write(k, 0, exp, v, 0) +} + +// Set a value in this bucket with with flags +func (b *Bucket) SetWithMeta(k string, flags int, exp int, v interface{}) (*MutationToken, error) { + return b.WriteWithMT(k, flags, exp, v, 0) +} + +// SetRaw sets a value in this bucket without JSON encoding it. +func (b *Bucket) SetRaw(k string, exp int, v []byte) error { + return b.Write(k, 0, exp, v, Raw) +} + +// Add adds a value to this bucket; like Set except that nothing +// happens if the key exists. The value will be serialized into a +// JSON document. 
+func (b *Bucket) Add(k string, exp int, v interface{}) (added bool, err error) { + err = b.Write(k, 0, exp, v, AddOnly) + if err == ErrKeyExists { + return false, nil + } + return (err == nil), err +} + +// AddRaw adds a value to this bucket; like SetRaw except that nothing +// happens if the key exists. The value will be stored as raw bytes. +func (b *Bucket) AddRaw(k string, exp int, v []byte) (added bool, err error) { + err = b.Write(k, 0, exp, v, AddOnly|Raw) + if err == ErrKeyExists { + return false, nil + } + return (err == nil), err +} + +// Add adds a value to this bucket; like Set except that nothing +// happens if the key exists. The value will be serialized into a +// JSON document. +func (b *Bucket) AddWithMT(k string, exp int, v interface{}) (added bool, mt *MutationToken, err error) { + mt, err = b.WriteWithMT(k, 0, exp, v, AddOnly) + if err == ErrKeyExists { + return false, mt, nil + } + return (err == nil), mt, err +} + +// AddRaw adds a value to this bucket; like SetRaw except that nothing +// happens if the key exists. The value will be stored as raw bytes. +func (b *Bucket) AddRawWithMT(k string, exp int, v []byte) (added bool, mt *MutationToken, err error) { + mt, err = b.WriteWithMT(k, 0, exp, v, AddOnly|Raw) + if err == ErrKeyExists { + return false, mt, nil + } + return (err == nil), mt, err +} + +// Append appends raw data to an existing item. +func (b *Bucket) Append(k string, data []byte) error { + return b.Write(k, 0, 0, data, Append|Raw) +} + +// Get a value straight from Memcached +func (b *Bucket) GetsMC(key string, reqDeadline time.Time) (*gomemcached.MCResponse, error) { + var err error + var response *gomemcached.MCResponse + + if key == "" { + return nil, nil + } + + if ClientOpCallback != nil { + defer func(t time.Time) { ClientOpCallback("GetsMC", key, t, err) }(time.Now()) + } + + err = b.Do2(key, func(mc *memcached.Client, vb uint16) error { + var err1 error + + mc.SetDeadline(getDeadline(reqDeadline, DefaultTimeout)) + response, err1 = mc.Get(vb, key) + mc.SetDeadline(noDeadline) + if err1 != nil { + return err1 + } + return nil + }, false) + return response, err +} + +// Get a value through the subdoc API +func (b *Bucket) GetsSubDoc(key string, reqDeadline time.Time, subPaths []string) (*gomemcached.MCResponse, error) { + var err error + var response *gomemcached.MCResponse + + if key == "" { + return nil, nil + } + + if ClientOpCallback != nil { + defer func(t time.Time) { ClientOpCallback("GetsSubDoc", key, t, err) }(time.Now()) + } + + err = b.Do2(key, func(mc *memcached.Client, vb uint16) error { + var err1 error + + mc.SetDeadline(getDeadline(reqDeadline, DefaultTimeout)) + response, err1 = mc.GetSubdoc(vb, key, subPaths) + mc.SetDeadline(noDeadline) + if err1 != nil { + return err1 + } + return nil + }, false) + return response, err +} + +// GetsRaw gets a raw value from this bucket including its CAS +// counter and flags. +func (b *Bucket) GetsRaw(k string) (data []byte, flags int, + cas uint64, err error) { + + if ClientOpCallback != nil { + defer func(t time.Time) { ClientOpCallback("GetsRaw", k, t, err) }(time.Now()) + } + + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + res, err := mc.Get(vb, k) + if err != nil { + return err + } + cas = res.Cas + if len(res.Extras) >= 4 { + flags = int(binary.BigEndian.Uint32(res.Extras)) + } + data = res.Body + return nil + }) + return +} + +// Gets gets a value from this bucket, including its CAS counter. 
The +// value is expected to be a JSON stream and will be deserialized into +// rv. +func (b *Bucket) Gets(k string, rv interface{}, caso *uint64) error { + data, _, cas, err := b.GetsRaw(k) + if err != nil { + return err + } + if caso != nil { + *caso = cas + } + return json.Unmarshal(data, rv) +} + +// Get a value from this bucket. +// The value is expected to be a JSON stream and will be deserialized +// into rv. +func (b *Bucket) Get(k string, rv interface{}) error { + return b.Gets(k, rv, nil) +} + +// GetRaw gets a raw value from this bucket. No marshaling is performed. +func (b *Bucket) GetRaw(k string) ([]byte, error) { + d, _, _, err := b.GetsRaw(k) + return d, err +} + +// GetAndTouchRaw gets a raw value from this bucket including its CAS +// counter and flags, and updates the expiry on the doc. +func (b *Bucket) GetAndTouchRaw(k string, exp int) (data []byte, + cas uint64, err error) { + + if ClientOpCallback != nil { + defer func(t time.Time) { ClientOpCallback("GetsRaw", k, t, err) }(time.Now()) + } + + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + res, err := mc.GetAndTouch(vb, k, exp) + if err != nil { + return err + } + cas = res.Cas + data = res.Body + return nil + }) + return data, cas, err +} + +// GetMeta returns the meta values for a key +func (b *Bucket) GetMeta(k string, flags *int, expiry *int, cas *uint64, seqNo *uint64) (err error) { + + if ClientOpCallback != nil { + defer func(t time.Time) { ClientOpCallback("GetsMeta", k, t, err) }(time.Now()) + } + + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + res, err := mc.GetMeta(vb, k) + if err != nil { + return err + } + + *cas = res.Cas + if len(res.Extras) >= 8 { + *flags = int(binary.BigEndian.Uint32(res.Extras[4:])) + } + + if len(res.Extras) >= 12 { + *expiry = int(binary.BigEndian.Uint32(res.Extras[8:])) + } + + if len(res.Extras) >= 20 { + *seqNo = uint64(binary.BigEndian.Uint64(res.Extras[12:])) + } + + return nil + }) + + return err +} + +// Delete a key from this bucket. +func (b *Bucket) Delete(k string) error { + return b.Write(k, 0, 0, nil, Raw) +} + +// Incr increments the value at a given key by amt and defaults to def if no value present. 
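+// For example, a hypothetical page-view counter (the key name and values are
+// illustrative):
+//
+//	// start at 1 if "counter:home" is missing, otherwise add 1; never expire
+//	views, err := b.Incr("counter:home", 1, 1, 0)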
+func (b *Bucket) Incr(k string, amt, def uint64, exp int) (val uint64, err error) { + if ClientOpCallback != nil { + defer func(t time.Time) { ClientOpCallback("Incr", k, t, err) }(time.Now()) + } + + var rv uint64 + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + res, err := mc.Incr(vb, k, amt, def, exp) + if err != nil { + return err + } + rv = res + return nil + }) + return rv, err +} + +// Decr decrements the value at a given key by amt and defaults to def if no value present +func (b *Bucket) Decr(k string, amt, def uint64, exp int) (val uint64, err error) { + if ClientOpCallback != nil { + defer func(t time.Time) { ClientOpCallback("Decr", k, t, err) }(time.Now()) + } + + var rv uint64 + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + res, err := mc.Decr(vb, k, amt, def, exp) + if err != nil { + return err + } + rv = res + return nil + }) + return rv, err +} + +// Wrapper around memcached.CASNext() +func (b *Bucket) casNext(k string, exp int, state *memcached.CASState) bool { + if ClientOpCallback != nil { + defer func(t time.Time) { + ClientOpCallback("casNext", k, t, state.Err) + }(time.Now()) + } + + keepGoing := false + state.Err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + keepGoing = mc.CASNext(vb, k, exp, state) + return state.Err + }) + return keepGoing && state.Err == nil +} + +// An UpdateFunc is a callback function to update a document +type UpdateFunc func(current []byte) (updated []byte, err error) + +// Return this as the error from an UpdateFunc to cancel the Update +// operation. +const UpdateCancel = memcached.CASQuit + +// Update performs a Safe update of a document, avoiding conflicts by +// using CAS. +// +// The callback function will be invoked with the current raw document +// contents (or nil if the document doesn't exist); it should return +// the updated raw contents (or nil to delete.) If it decides not to +// change anything it can return UpdateCancel as the error. +// +// If another writer modifies the document between the get and the +// set, the callback will be invoked again with the newer value. +func (b *Bucket) Update(k string, exp int, callback UpdateFunc) error { + _, err := b.update(k, exp, callback) + return err +} + +// internal version of Update that returns a CAS value +func (b *Bucket) update(k string, exp int, callback UpdateFunc) (newCas uint64, err error) { + var state memcached.CASState + for b.casNext(k, exp, &state) { + var err error + if state.Value, err = callback(state.Value); err != nil { + return 0, err + } + } + return state.Cas, state.Err +} + +// A WriteUpdateFunc is a callback function to update a document +type WriteUpdateFunc func(current []byte) (updated []byte, opt WriteOptions, err error) + +// WriteUpdate performs a Safe update of a document, avoiding +// conflicts by using CAS. WriteUpdate is like Update, except that +// the callback can return a set of WriteOptions, of which Persist and +// Indexable are recognized: these cause the call to wait until the +// document update has been persisted to disk and/or become available +// to index. 
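+// A sketch of a WriteUpdateFunc that rewrites the raw JSON and asks for the
+// write to be persisted (the document key and replacement are illustrative):
+//
+//	err := b.WriteUpdate("doc::42", 0, func(current []byte) ([]byte, WriteOptions, error) {
+//		if current == nil {
+//			return nil, 0, UpdateCancel // nothing to update
+//		}
+//		updated := bytes.Replace(current, []byte(`"old"`), []byte(`"new"`), 1)
+//		return updated, Persist, nil
+//	})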
+func (b *Bucket) WriteUpdate(k string, exp int, callback WriteUpdateFunc) error { + var writeOpts WriteOptions + var deletion bool + // Wrap the callback in an UpdateFunc we can pass to Update: + updateCallback := func(current []byte) (updated []byte, err error) { + update, opt, err := callback(current) + writeOpts = opt + deletion = (update == nil) + return update, err + } + cas, err := b.update(k, exp, updateCallback) + if err != nil { + return err + } + // If callback asked, wait for persistence or indexability: + if writeOpts&(Persist|Indexable) != 0 { + err = b.WaitForPersistence(k, cas, deletion) + } + return err +} + +// Observe observes the current state of a document. +func (b *Bucket) Observe(k string) (result memcached.ObserveResult, err error) { + if ClientOpCallback != nil { + defer func(t time.Time) { ClientOpCallback("Observe", k, t, err) }(time.Now()) + } + + err = b.Do(k, func(mc *memcached.Client, vb uint16) error { + result, err = mc.Observe(vb, k) + return err + }) + return +} + +// Returned from WaitForPersistence (or Write, if the Persistent or Indexable flag is used) +// if the value has been overwritten by another before being persisted. +var ErrOverwritten = errors.New("overwritten") + +// Returned from WaitForPersistence (or Write, if the Persistent or Indexable flag is used) +// if the value hasn't been persisted by the timeout interval +var ErrTimeout = errors.New("timeout") + +// WaitForPersistence waits for an item to be considered durable. +// +// Besides transport errors, ErrOverwritten may be returned if the +// item is overwritten before it reaches durability. ErrTimeout may +// occur if the item isn't found durable in a reasonable amount of +// time. +func (b *Bucket) WaitForPersistence(k string, cas uint64, deletion bool) error { + timeout := 10 * time.Second + sleepDelay := 5 * time.Millisecond + start := time.Now() + for { + time.Sleep(sleepDelay) + sleepDelay += sleepDelay / 2 // multiply delay by 1.5 every time + + result, err := b.Observe(k) + if err != nil { + return err + } + if persisted, overwritten := result.CheckPersistence(cas, deletion); overwritten { + return ErrOverwritten + } else if persisted { + return nil + } + + if result.PersistenceTime > 0 { + timeout = 2 * result.PersistenceTime + } + if time.Since(start) >= timeout-sleepDelay { + return ErrTimeout + } + } +} + +var _STRING_MCRESPONSE_POOL = gomemcached.NewStringMCResponsePool(16) + +type stringPool struct { + pool *sync.Pool + size int +} + +func newStringPool(size int) *stringPool { + rv := &stringPool{ + pool: &sync.Pool{ + New: func() interface{} { + return make([]string, 0, size) + }, + }, + size: size, + } + + return rv +} + +func (this *stringPool) Get() []string { + return this.pool.Get().([]string) +} + +func (this *stringPool) Put(s []string) { + if s == nil || cap(s) < this.size || cap(s) > 2*this.size { + return + } + + this.pool.Put(s[0:0]) +} + +var _STRING_POOL = newStringPool(16) + +type vbStringPool struct { + pool *sync.Pool + strPool *stringPool +} + +func newVBStringPool(size int, sp *stringPool) *vbStringPool { + rv := &vbStringPool{ + pool: &sync.Pool{ + New: func() interface{} { + return make(map[uint16][]string, size) + }, + }, + strPool: sp, + } + + return rv +} + +func (this *vbStringPool) Get() map[uint16][]string { + return this.pool.Get().(map[uint16][]string) +} + +func (this *vbStringPool) Put(s map[uint16][]string) { + if s == nil { + return + } + + for k, v := range s { + delete(s, k) + this.strPool.Put(v) + } + + this.pool.Put(s) +} + +var 
_VB_STRING_POOL = newVBStringPool(16, _STRING_POOL) diff --git a/vendor/github.com/couchbaselabs/go-couchbase/conn_pool.go b/vendor/github.com/couchbaselabs/go-couchbase/conn_pool.go new file mode 100644 index 000000000000..babd3adb6a6c --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/conn_pool.go @@ -0,0 +1,387 @@ +package couchbase + +import ( + "errors" + "sync/atomic" + "time" + + "github.com/couchbase/gomemcached" + "github.com/couchbase/gomemcached/client" + "github.com/couchbase/goutils/logging" +) + +// GenericMcdAuthHandler is a kind of AuthHandler that performs +// special auth exchange (like non-standard auth, possibly followed by +// select-bucket). +type GenericMcdAuthHandler interface { + AuthHandler + AuthenticateMemcachedConn(host string, conn *memcached.Client) error +} + +// Error raised when a connection can't be retrieved from a pool. +var TimeoutError = errors.New("timeout waiting to build connection") +var errClosedPool = errors.New("the connection pool is closed") +var errNoPool = errors.New("no connection pool") + +// Default timeout for retrieving a connection from the pool. +var ConnPoolTimeout = time.Hour * 24 * 30 + +// overflow connection closer cycle time +var ConnCloserInterval = time.Second * 30 + +// ConnPoolAvailWaitTime is the amount of time to wait for an existing +// connection from the pool before considering the creation of a new +// one. +var ConnPoolAvailWaitTime = time.Millisecond + +type connectionPool struct { + host string + mkConn func(host string, ah AuthHandler) (*memcached.Client, error) + auth AuthHandler + connections chan *memcached.Client + createsem chan bool + bailOut chan bool + poolSize int + connCount uint64 + inUse bool +} + +func newConnectionPool(host string, ah AuthHandler, closer bool, poolSize, poolOverflow int) *connectionPool { + connSize := poolSize + if closer { + connSize += poolOverflow + } + rv := &connectionPool{ + host: host, + connections: make(chan *memcached.Client, connSize), + createsem: make(chan bool, poolSize+poolOverflow), + mkConn: defaultMkConn, + auth: ah, + poolSize: poolSize, + } + if closer { + rv.bailOut = make(chan bool, 1) + go rv.connCloser() + } + return rv +} + +// ConnPoolTimeout is notified whenever connections are acquired from a pool. 
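+// The hook below is ConnPoolCallback; assigning it is optional. A minimal
+// sketch that logs slow acquisitions (the threshold and logger are
+// illustrative):
+//
+//	couchbase.ConnPoolCallback = func(host, source string, start time.Time, err error) {
+//		if d := time.Since(start); d > 50*time.Millisecond {
+//			log.Printf("slow pool acquire from %s via %s: %v (err=%v)", host, source, d, err)
+//		}
+//	}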
+var ConnPoolCallback func(host string, source string, start time.Time, err error) + +func defaultMkConn(host string, ah AuthHandler) (*memcached.Client, error) { + var features memcached.Features + + conn, err := memcached.Connect("tcp", host) + if err != nil { + return nil, err + } + + if TCPKeepalive == true { + conn.SetKeepAliveOptions(time.Duration(TCPKeepaliveInterval) * time.Second) + } + + if EnableMutationToken == true { + features = append(features, memcached.FeatureMutationToken) + } + if EnableDataType == true { + features = append(features, memcached.FeatureDataType) + } + + if EnableXattr == true { + features = append(features, memcached.FeatureXattr) + } + + if len(features) > 0 { + if DefaultTimeout > 0 { + conn.SetDeadline(getDeadline(noDeadline, DefaultTimeout)) + } + + res, err := conn.EnableFeatures(features) + + if DefaultTimeout > 0 { + conn.SetDeadline(noDeadline) + } + + if err != nil && isTimeoutError(err) { + conn.Close() + return nil, err + } + + if err != nil || res.Status != gomemcached.SUCCESS { + logging.Warnf("Unable to enable features %v", err) + } + } + + if gah, ok := ah.(GenericMcdAuthHandler); ok { + err = gah.AuthenticateMemcachedConn(host, conn) + if err != nil { + conn.Close() + return nil, err + } + return conn, nil + } + name, pass, bucket := ah.GetCredentials() + if name != "default" { + _, err = conn.Auth(name, pass) + if err != nil { + conn.Close() + return nil, err + } + // Select bucket (Required for cb_auth creds) + // Required when doing auth with _admin credentials + if bucket != "" && bucket != name { + _, err = conn.SelectBucket(bucket) + if err != nil { + conn.Close() + return nil, err + } + } + } + return conn, nil +} + +func (cp *connectionPool) Close() (err error) { + defer func() { + if recover() != nil { + err = errors.New("connectionPool.Close error") + } + }() + if cp.bailOut != nil { + + // defensively, we won't wait if the channel is full + select { + case cp.bailOut <- false: + default: + } + } + close(cp.connections) + for c := range cp.connections { + c.Close() + } + return +} + +func (cp *connectionPool) Node() string { + return cp.host +} + +func (cp *connectionPool) GetWithTimeout(d time.Duration) (rv *memcached.Client, err error) { + if cp == nil { + return nil, errNoPool + } + + path := "" + + if ConnPoolCallback != nil { + defer func(path *string, start time.Time) { + ConnPoolCallback(cp.host, *path, start, err) + }(&path, time.Now()) + } + + path = "short-circuit" + + // short-circuit available connetions. + select { + case rv, isopen := <-cp.connections: + if !isopen { + return nil, errClosedPool + } + atomic.AddUint64(&cp.connCount, 1) + return rv, nil + default: + } + + t := time.NewTimer(ConnPoolAvailWaitTime) + defer t.Stop() + + // Try to grab an available connection within 1ms + select { + case rv, isopen := <-cp.connections: + path = "avail1" + if !isopen { + return nil, errClosedPool + } + atomic.AddUint64(&cp.connCount, 1) + return rv, nil + case <-t.C: + // No connection came around in time, let's see + // whether we can get one or build a new one first. + t.Reset(d) // Reuse the timer for the full timeout. + select { + case rv, isopen := <-cp.connections: + path = "avail2" + if !isopen { + return nil, errClosedPool + } + atomic.AddUint64(&cp.connCount, 1) + return rv, nil + case cp.createsem <- true: + path = "create" + // Build a connection if we can't get a real one. + // This can potentially be an overflow connection, or + // a pooled connection. 
+ rv, err := cp.mkConn(cp.host, cp.auth) + if err != nil { + // On error, release our create hold + <-cp.createsem + } else { + atomic.AddUint64(&cp.connCount, 1) + } + return rv, err + case <-t.C: + return nil, ErrTimeout + } + } +} + +func (cp *connectionPool) Get() (*memcached.Client, error) { + return cp.GetWithTimeout(ConnPoolTimeout) +} + +func (cp *connectionPool) Return(c *memcached.Client) { + if c == nil { + return + } + + if cp == nil { + c.Close() + } + + if c.IsHealthy() { + defer func() { + if recover() != nil { + // This happens when the pool has already been + // closed and we're trying to return a + // connection to it anyway. Just close the + // connection. + c.Close() + } + }() + + select { + case cp.connections <- c: + default: + <-cp.createsem + c.Close() + } + } else { + <-cp.createsem + c.Close() + } +} + +// give the ability to discard a connection from a pool +// useful for ditching connections to the wrong node after a rebalance +func (cp *connectionPool) Discard(c *memcached.Client) { + <-cp.createsem + c.Close() +} + +// asynchronous connection closer +func (cp *connectionPool) connCloser() { + var connCount uint64 + + t := time.NewTimer(ConnCloserInterval) + defer t.Stop() + + for { + connCount = cp.connCount + + // we don't exist anymore! bail out! + select { + case <-cp.bailOut: + return + case <-t.C: + } + t.Reset(ConnCloserInterval) + + // no overflow connections open or sustained requests for connections + // nothing to do until the next cycle + if len(cp.connections) <= cp.poolSize || + ConnCloserInterval/ConnPoolAvailWaitTime < time.Duration(cp.connCount-connCount) { + continue + } + + // close overflow connections now that they are not needed + for c := range cp.connections { + select { + case <-cp.bailOut: + return + default: + } + + // bail out if close did not work out + if !cp.connCleanup(c) { + return + } + if len(cp.connections) <= cp.poolSize { + break + } + } + } +} + +// close connection with recovery on error +func (cp *connectionPool) connCleanup(c *memcached.Client) (rv bool) { + + // just in case we are closing a connection after + // bailOut has been sent but we haven't yet read it + defer func() { + if recover() != nil { + rv = false + } + }() + rv = true + + c.Close() + <-cp.createsem + return +} + +func (cp *connectionPool) StartTapFeed(args *memcached.TapArguments) (*memcached.TapFeed, error) { + if cp == nil { + return nil, errNoPool + } + mc, err := cp.Get() + if err != nil { + return nil, err + } + + // A connection can't be used after TAP; Dont' count it against the + // connection pool capacity + <-cp.createsem + + return mc.StartTapFeed(*args) +} + +const DEFAULT_WINDOW_SIZE = 20 * 1024 * 1024 // 20 Mb + +func (cp *connectionPool) StartUprFeed(name string, sequence uint32, dcp_buffer_size uint32, data_chan_size int) (*memcached.UprFeed, error) { + if cp == nil { + return nil, errNoPool + } + mc, err := cp.Get() + if err != nil { + return nil, err + } + + // A connection can't be used after it has been allocated to UPR; + // Dont' count it against the connection pool capacity + <-cp.createsem + + uf, err := mc.NewUprFeed() + if err != nil { + return nil, err + } + + if err := uf.UprOpen(name, sequence, dcp_buffer_size); err != nil { + return nil, err + } + + if err := uf.StartFeedWithConfig(data_chan_size); err != nil { + return nil, err + } + + return uf, nil +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/ddocs.go b/vendor/github.com/couchbaselabs/go-couchbase/ddocs.go new file mode 100644 index 
000000000000..f9cc343aa8ed --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/ddocs.go @@ -0,0 +1,288 @@ +package couchbase + +import ( + "bytes" + "encoding/json" + "fmt" + "github.com/couchbase/goutils/logging" + "io/ioutil" + "net/http" +) + +// ViewDefinition represents a single view within a design document. +type ViewDefinition struct { + Map string `json:"map"` + Reduce string `json:"reduce,omitempty"` +} + +// DDoc is the document body of a design document specifying a view. +type DDoc struct { + Language string `json:"language,omitempty"` + Views map[string]ViewDefinition `json:"views"` +} + +// DDocsResult represents the result from listing the design +// documents. +type DDocsResult struct { + Rows []struct { + DDoc struct { + Meta map[string]interface{} + JSON DDoc + } `json:"doc"` + } `json:"rows"` +} + +// GetDDocs lists all design documents +func (b *Bucket) GetDDocs() (DDocsResult, error) { + var ddocsResult DDocsResult + b.RLock() + pool := b.pool + uri := b.DDocs.URI + b.RUnlock() + + // MB-23555 ephemeral buckets have no ddocs + if uri == "" { + return DDocsResult{}, nil + } + + err := pool.client.parseURLResponse(uri, &ddocsResult) + if err != nil { + return DDocsResult{}, err + } + return ddocsResult, nil +} + +func (b *Bucket) GetDDocWithRetry(docname string, into interface{}) error { + ddocURI := fmt.Sprintf("/%s/_design/%s", b.GetName(), docname) + err := b.parseAPIResponse(ddocURI, &into) + if err != nil { + return err + } + return nil +} + +func (b *Bucket) GetDDocsWithRetry() (DDocsResult, error) { + var ddocsResult DDocsResult + b.RLock() + uri := b.DDocs.URI + b.RUnlock() + + // MB-23555 ephemeral buckets have no ddocs + if uri == "" { + return DDocsResult{}, nil + } + + err := b.parseURLResponse(uri, &ddocsResult) + if err != nil { + return DDocsResult{}, err + } + return ddocsResult, nil +} + +func (b *Bucket) ddocURL(docname string) (string, error) { + u, err := b.randomBaseURL() + if err != nil { + return "", err + } + u.Path = fmt.Sprintf("/%s/_design/%s", b.GetName(), docname) + return u.String(), nil +} + +func (b *Bucket) ddocURLNext(nodeId int, docname string) (string, int, error) { + u, selected, err := b.randomNextURL(nodeId) + if err != nil { + return "", -1, err + } + u.Path = fmt.Sprintf("/%s/_design/%s", b.GetName(), docname) + return u.String(), selected, nil +} + +const ABS_MAX_RETRIES = 10 +const ABS_MIN_RETRIES = 3 + +func (b *Bucket) getMaxRetries() (int, error) { + + maxRetries := len(b.Nodes()) + + if maxRetries == 0 { + return 0, fmt.Errorf("No available Couch rest URLs") + } + + if maxRetries > ABS_MAX_RETRIES { + maxRetries = ABS_MAX_RETRIES + } else if maxRetries < ABS_MIN_RETRIES { + maxRetries = ABS_MIN_RETRIES + } + + return maxRetries, nil +} + +// PutDDoc installs a design document. 
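+// A minimal sketch of installing a design document with one view (the design
+// document name, view name and map function are illustrative):
+//
+//	ddoc := DDoc{
+//		Language: "javascript",
+//		Views: map[string]ViewDefinition{
+//			"by_type": {Map: `function(doc, meta) { emit(doc.type, null); }`},
+//		},
+//	}
+//	if err := b.PutDDoc("app", ddoc); err != nil {
+//		// handle installation error
+//	}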
+func (b *Bucket) PutDDoc(docname string, value interface{}) error { + + var Err error + + maxRetries, err := b.getMaxRetries() + if err != nil { + return err + } + + lastNode := START_NODE_ID + + for retryCount := 0; retryCount < maxRetries; retryCount++ { + + Err = nil + + ddocU, selectedNode, err := b.ddocURLNext(lastNode, docname) + if err != nil { + return err + } + + lastNode = selectedNode + + logging.Infof(" Trying with selected node %d", selectedNode) + j, err := json.Marshal(value) + if err != nil { + return err + } + + req, err := http.NewRequest("PUT", ddocU, bytes.NewReader(j)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + err = maybeAddAuth(req, b.authHandler(false /* bucket not yet locked */)) + if err != nil { + return err + } + + res, err := doHTTPRequest(req) + if err != nil { + return err + } + + if res.StatusCode != 201 { + body, _ := ioutil.ReadAll(res.Body) + Err = fmt.Errorf("error installing view: %v / %s", + res.Status, body) + logging.Errorf(" Error in PutDDOC %v. Retrying...", Err) + res.Body.Close() + b.Refresh() + continue + } + + res.Body.Close() + break + } + + return Err +} + +// GetDDoc retrieves a specific a design doc. +func (b *Bucket) GetDDoc(docname string, into interface{}) error { + var Err error + var res *http.Response + + maxRetries, err := b.getMaxRetries() + if err != nil { + return err + } + + lastNode := START_NODE_ID + for retryCount := 0; retryCount < maxRetries; retryCount++ { + + Err = nil + ddocU, selectedNode, err := b.ddocURLNext(lastNode, docname) + if err != nil { + return err + } + + lastNode = selectedNode + logging.Infof(" Trying with selected node %d", selectedNode) + + req, err := http.NewRequest("GET", ddocU, nil) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + err = maybeAddAuth(req, b.authHandler(false /* bucket not yet locked */)) + if err != nil { + return err + } + + res, err = doHTTPRequest(req) + if err != nil { + return err + } + if res.StatusCode != 200 { + body, _ := ioutil.ReadAll(res.Body) + Err = fmt.Errorf("error reading view: %v / %s", + res.Status, body) + logging.Errorf(" Error in GetDDOC %v Retrying...", Err) + b.Refresh() + res.Body.Close() + continue + } + defer res.Body.Close() + break + } + + if Err != nil { + return Err + } + + d := json.NewDecoder(res.Body) + return d.Decode(into) +} + +// DeleteDDoc removes a design document. +func (b *Bucket) DeleteDDoc(docname string) error { + + var Err error + + maxRetries, err := b.getMaxRetries() + if err != nil { + return err + } + + lastNode := START_NODE_ID + + for retryCount := 0; retryCount < maxRetries; retryCount++ { + + Err = nil + ddocU, selectedNode, err := b.ddocURLNext(lastNode, docname) + if err != nil { + return err + } + + lastNode = selectedNode + logging.Infof(" Trying with selected node %d", selectedNode) + + req, err := http.NewRequest("DELETE", ddocU, nil) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + err = maybeAddAuth(req, b.authHandler(false /* bucket not already locked */)) + if err != nil { + return err + } + + res, err := doHTTPRequest(req) + if err != nil { + return err + } + if res.StatusCode != 200 { + body, _ := ioutil.ReadAll(res.Body) + Err = fmt.Errorf("error deleting view : %v / %s", res.Status, body) + logging.Errorf(" Error in DeleteDDOC %v. Retrying ... 
", Err) + b.Refresh() + res.Body.Close() + continue + } + + res.Body.Close() + break + } + return Err +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/observe.go b/vendor/github.com/couchbaselabs/go-couchbase/observe.go new file mode 100644 index 000000000000..6e746f5a16df --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/observe.go @@ -0,0 +1,300 @@ +package couchbase + +import ( + "fmt" + "github.com/couchbase/goutils/logging" + "sync" +) + +type PersistTo uint8 + +const ( + PersistNone = PersistTo(0x00) + PersistMaster = PersistTo(0x01) + PersistOne = PersistTo(0x02) + PersistTwo = PersistTo(0x03) + PersistThree = PersistTo(0x04) + PersistFour = PersistTo(0x05) +) + +type ObserveTo uint8 + +const ( + ObserveNone = ObserveTo(0x00) + ObserveReplicateOne = ObserveTo(0x01) + ObserveReplicateTwo = ObserveTo(0x02) + ObserveReplicateThree = ObserveTo(0x03) + ObserveReplicateFour = ObserveTo(0x04) +) + +type JobType uint8 + +const ( + OBSERVE = JobType(0x00) + PERSIST = JobType(0x01) +) + +type ObservePersistJob struct { + vb uint16 + vbuuid uint64 + hostname string + jobType JobType + failover uint8 + lastPersistedSeqNo uint64 + currentSeqNo uint64 + resultChan chan *ObservePersistJob + errorChan chan *OPErrResponse +} + +type OPErrResponse struct { + vb uint16 + vbuuid uint64 + err error + job *ObservePersistJob +} + +var ObservePersistPool = NewPool(1024) +var OPJobChan = make(chan *ObservePersistJob, 1024) +var OPJobDone = make(chan bool) + +var wg sync.WaitGroup + +func (b *Bucket) StartOPPollers(maxWorkers int) { + + for i := 0; i < maxWorkers; i++ { + go b.OPJobPoll() + wg.Add(1) + } + wg.Wait() +} + +func (b *Bucket) SetObserveAndPersist(nPersist PersistTo, nObserve ObserveTo) (err error) { + + numNodes := len(b.Nodes()) + if int(nPersist) > numNodes || int(nObserve) > numNodes { + return fmt.Errorf("Not enough healthy nodes in the cluster") + } + + if int(nPersist) > (b.Replicas+1) || int(nObserve) > b.Replicas { + return fmt.Errorf("Not enough replicas in the cluster") + } + + if EnableMutationToken == false { + return fmt.Errorf("Mutation Tokens not enabled ") + } + + b.ds = &DurablitySettings{Persist: PersistTo(nPersist), Observe: ObserveTo(nObserve)} + return +} + +func (b *Bucket) ObserveAndPersistPoll(vb uint16, vbuuid uint64, seqNo uint64) (err error, failover bool) { + b.RLock() + ds := b.ds + b.RUnlock() + + if ds == nil { + return + } + + nj := 0 // total number of jobs + resultChan := make(chan *ObservePersistJob, 10) + errChan := make(chan *OPErrResponse, 10) + + nodes := b.GetNodeList(vb) + if int(ds.Observe) > len(nodes) || int(ds.Persist) > len(nodes) { + return fmt.Errorf("Not enough healthy nodes in the cluster"), false + } + + logging.Infof("Node list %v", nodes) + + if ds.Observe >= ObserveReplicateOne { + // create a job for each host + for i := ObserveReplicateOne; i < ds.Observe+1; i++ { + opJob := ObservePersistPool.Get() + opJob.vb = vb + opJob.vbuuid = vbuuid + opJob.jobType = OBSERVE + opJob.hostname = nodes[i] + opJob.resultChan = resultChan + opJob.errorChan = errChan + + OPJobChan <- opJob + nj++ + + } + } + + if ds.Persist >= PersistMaster { + for i := PersistMaster; i < ds.Persist+1; i++ { + opJob := ObservePersistPool.Get() + opJob.vb = vb + opJob.vbuuid = vbuuid + opJob.jobType = PERSIST + opJob.hostname = nodes[i] + opJob.resultChan = resultChan + opJob.errorChan = errChan + + OPJobChan <- opJob + nj++ + + } + } + + ok := true + for ok { + select { + case res := <-resultChan: + jobDone := false + if res.failover == 0 { + // 
no failover + if res.jobType == PERSIST { + if res.lastPersistedSeqNo >= seqNo { + jobDone = true + } + + } else { + if res.currentSeqNo >= seqNo { + jobDone = true + } + } + + if jobDone == true { + nj-- + ObservePersistPool.Put(res) + } else { + // requeue this job + OPJobChan <- res + } + + } else { + // Not currently handling failover scenarios TODO + nj-- + ObservePersistPool.Put(res) + failover = true + } + + if nj == 0 { + // done with all the jobs + ok = false + close(resultChan) + close(errChan) + } + + case Err := <-errChan: + logging.Errorf("Error in Observe/Persist %v", Err.err) + err = fmt.Errorf("Error in Observe/Persist job %v", Err.err) + nj-- + ObservePersistPool.Put(Err.job) + if nj == 0 { + close(resultChan) + close(errChan) + ok = false + } + } + } + + return +} + +func (b *Bucket) OPJobPoll() { + + ok := true + for ok == true { + select { + case job := <-OPJobChan: + pool := b.getConnPoolByHost(job.hostname, false /* bucket not already locked */) + if pool == nil { + errRes := &OPErrResponse{vb: job.vb, vbuuid: job.vbuuid} + errRes.err = fmt.Errorf("Pool not found for host %v", job.hostname) + errRes.job = job + job.errorChan <- errRes + continue + } + conn, err := pool.Get() + if err != nil { + errRes := &OPErrResponse{vb: job.vb, vbuuid: job.vbuuid} + errRes.err = fmt.Errorf("Unable to get connection from pool %v", err) + errRes.job = job + job.errorChan <- errRes + continue + } + + res, err := conn.ObserveSeq(job.vb, job.vbuuid) + if err != nil { + errRes := &OPErrResponse{vb: job.vb, vbuuid: job.vbuuid} + errRes.err = fmt.Errorf("Command failed %v", err) + errRes.job = job + job.errorChan <- errRes + continue + + } + pool.Return(conn) + job.lastPersistedSeqNo = res.LastPersistedSeqNo + job.currentSeqNo = res.CurrentSeqNo + job.failover = res.Failover + + job.resultChan <- job + case <-OPJobDone: + logging.Infof("Observe Persist Poller exitting") + ok = false + } + } + wg.Done() +} + +func (b *Bucket) GetNodeList(vb uint16) []string { + + vbm := b.VBServerMap() + if len(vbm.VBucketMap) < int(vb) { + logging.Infof("vbmap smaller than vblist") + return nil + } + + nodes := make([]string, len(vbm.VBucketMap[vb])) + for i := 0; i < len(vbm.VBucketMap[vb]); i++ { + n := vbm.VBucketMap[vb][i] + if n < 0 { + continue + } + + node := b.getMasterNode(n) + if len(node) > 1 { + nodes[i] = node + } + continue + + } + return nodes +} + +//pool of ObservePersist Jobs +type OPpool struct { + pool chan *ObservePersistJob +} + +// NewPool creates a new pool of jobs +func NewPool(max int) *OPpool { + return &OPpool{ + pool: make(chan *ObservePersistJob, max), + } +} + +// Borrow a Client from the pool. +func (p *OPpool) Get() *ObservePersistJob { + var o *ObservePersistJob + select { + case o = <-p.pool: + default: + o = &ObservePersistJob{} + } + return o +} + +// Return returns a Client to the pool. +func (p *OPpool) Put(o *ObservePersistJob) { + select { + case p.pool <- o: + default: + // let it go, let it go... 
+ } +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/pools.go b/vendor/github.com/couchbaselabs/go-couchbase/pools.go new file mode 100644 index 000000000000..5f3ff8c49523 --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/pools.go @@ -0,0 +1,1282 @@ +package couchbase + +import ( + "bufio" + "bytes" + "crypto/tls" + "crypto/x509" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "io" + "io/ioutil" + "math/rand" + "net/http" + "net/url" + "runtime" + "sort" + "strings" + "sync" + "unsafe" + + "github.com/couchbase/goutils/logging" + + "github.com/couchbase/gomemcached" // package name is 'gomemcached' + "github.com/couchbase/gomemcached/client" // package name is 'memcached' +) + +// HTTPClient to use for REST and view operations. +var MaxIdleConnsPerHost = 256 +var HTTPTransport = &http.Transport{MaxIdleConnsPerHost: MaxIdleConnsPerHost} +var HTTPClient = &http.Client{Transport: HTTPTransport} + +// PoolSize is the size of each connection pool (per host). +var PoolSize = 64 + +// PoolOverflow is the number of overflow connections allowed in a +// pool. +var PoolOverflow = 16 + +// AsynchronousCloser turns on asynchronous closing for overflow connections +var AsynchronousCloser = false + +// TCP KeepAlive enabled/disabled +var TCPKeepalive = false + +// Enable MutationToken +var EnableMutationToken = false + +// Enable Data Type response +var EnableDataType = false + +// Enable Xattr +var EnableXattr = false + +// TCP keepalive interval in seconds. Default 30 minutes +var TCPKeepaliveInterval = 30 * 60 + +// Used to decide whether to skip verification of certificates when +// connecting to an ssl port. +var skipVerify = true +var certFile = "" +var keyFile = "" +var rootFile = "" + +func SetSkipVerify(skip bool) { + skipVerify = skip +} + +func SetCertFile(cert string) { + certFile = cert +} + +func SetKeyFile(cert string) { + keyFile = cert +} + +func SetRootFile(cert string) { + rootFile = cert +} + +// Allow applications to speciify the Poolsize and Overflow +func SetConnectionPoolParams(size, overflow int) { + + if size > 0 { + PoolSize = size + } + + if overflow > 0 { + PoolOverflow = overflow + } +} + +// Turn off overflow connections +func DisableOverflowConnections() { + PoolOverflow = 0 +} + +// Toggle asynchronous overflow closer +func EnableAsynchronousCloser(closer bool) { + AsynchronousCloser = closer +} + +// Allow TCP keepalive parameters to be set by the application +func SetTcpKeepalive(enabled bool, interval int) { + + TCPKeepalive = enabled + + if interval > 0 { + TCPKeepaliveInterval = interval + } +} + +// AuthHandler is a callback that gets the auth username and password +// for the given bucket. +type AuthHandler interface { + GetCredentials() (string, string, string) +} + +// AuthHandler is a callback that gets the auth username and password +// for the given bucket and sasl for memcached. +type AuthWithSaslHandler interface { + AuthHandler + GetSaslCredentials() (string, string) +} + +// MultiBucketAuthHandler is kind of AuthHandler that may perform +// different auth for different buckets. +type MultiBucketAuthHandler interface { + AuthHandler + ForBucket(bucket string) AuthHandler +} + +// HTTPAuthHandler is kind of AuthHandler that performs more general +// for outgoing http requests than is possible via simple +// GetCredentials() call (i.e. digest auth or different auth per +// different destinations). 
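+// A sketch of a handler that injects a pre-computed Authorization header for
+// REST calls while still answering GetCredentials for memcached (all field
+// values are illustrative):
+//
+//	type tokenAuth struct{ user, pass, bucket, header string }
+//
+//	func (t tokenAuth) GetCredentials() (string, string, string) { return t.user, t.pass, t.bucket }
+//	func (t tokenAuth) SetCredsForRequest(req *http.Request) error {
+//		req.Header.Set("Authorization", t.header)
+//		return nil
+//	}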
+type HTTPAuthHandler interface { + AuthHandler + SetCredsForRequest(req *http.Request) error +} + +// RestPool represents a single pool returned from the pools REST API. +type RestPool struct { + Name string `json:"name"` + StreamingURI string `json:"streamingUri"` + URI string `json:"uri"` +} + +// Pools represents the collection of pools as returned from the REST API. +type Pools struct { + ComponentsVersion map[string]string `json:"componentsVersion,omitempty"` + ImplementationVersion string `json:"implementationVersion"` + IsAdmin bool `json:"isAdminCreds"` + UUID string `json:"uuid"` + Pools []RestPool `json:"pools"` +} + +// A Node is a computer in a cluster running the couchbase software. +type Node struct { + ClusterCompatibility int `json:"clusterCompatibility"` + ClusterMembership string `json:"clusterMembership"` + CouchAPIBase string `json:"couchApiBase"` + Hostname string `json:"hostname"` + InterestingStats map[string]float64 `json:"interestingStats,omitempty"` + MCDMemoryAllocated float64 `json:"mcdMemoryAllocated"` + MCDMemoryReserved float64 `json:"mcdMemoryReserved"` + MemoryFree float64 `json:"memoryFree"` + MemoryTotal float64 `json:"memoryTotal"` + OS string `json:"os"` + Ports map[string]int `json:"ports"` + Services []string `json:"services"` + Status string `json:"status"` + Uptime int `json:"uptime,string"` + Version string `json:"version"` + ThisNode bool `json:"thisNode,omitempty"` +} + +// A Pool of nodes and buckets. +type Pool struct { + BucketMap map[string]Bucket + Nodes []Node + + BucketURL map[string]string `json:"buckets"` + + client Client +} + +// VBucketServerMap is the a mapping of vbuckets to nodes. +type VBucketServerMap struct { + HashAlgorithm string `json:"hashAlgorithm"` + NumReplicas int `json:"numReplicas"` + ServerList []string `json:"serverList"` + VBucketMap [][]int `json:"vBucketMap"` +} + +type DurablitySettings struct { + Persist PersistTo + Observe ObserveTo +} + +// Bucket is the primary entry point for most data operations. +// Bucket is a locked data structure. All access to its fields should be done using read or write locking, +// as appropriate. +// +// Some access methods require locking, but rely on the caller to do so. These are appropriate +// for calls from methods that have already locked the structure. Methods like this +// take a boolean parameter "bucketLocked". +type Bucket struct { + sync.RWMutex + AuthType string `json:"authType"` + Capabilities []string `json:"bucketCapabilities"` + CapabilitiesVersion string `json:"bucketCapabilitiesVer"` + Type string `json:"bucketType"` + Name string `json:"name"` + NodeLocator string `json:"nodeLocator"` + Quota map[string]float64 `json:"quota,omitempty"` + Replicas int `json:"replicaNumber"` + Password string `json:"saslPassword"` + URI string `json:"uri"` + StreamingURI string `json:"streamingUri"` + LocalRandomKeyURI string `json:"localRandomKeyUri,omitempty"` + UUID string `json:"uuid"` + ConflictResolutionType string `json:"conflictResolutionType,omitempty"` + DDocs struct { + URI string `json:"uri"` + } `json:"ddocs,omitempty"` + BasicStats map[string]interface{} `json:"basicStats,omitempty"` + Controllers map[string]interface{} `json:"controllers,omitempty"` + + // These are used for JSON IO, but isn't used for processing + // since it needs to be swapped out safely. 
+ VBSMJson VBucketServerMap `json:"vBucketServerMap"` + NodesJSON []Node `json:"nodes"` + + pool *Pool + connPools unsafe.Pointer // *[]*connectionPool + vBucketServerMap unsafe.Pointer // *VBucketServerMap + nodeList unsafe.Pointer // *[]Node + commonSufix string + ah AuthHandler // auth handler + ds *DurablitySettings // Durablity Settings for this bucket + Scopes Scopes +} + +// PoolServices is all the bucket-independent services in a pool +type PoolServices struct { + Rev int `json:"rev"` + NodesExt []NodeServices `json:"nodesExt"` +} + +// NodeServices is all the bucket-independent services running on +// a node (given by Hostname) +type NodeServices struct { + Services map[string]int `json:"services,omitempty"` + Hostname string `json:"hostname"` + ThisNode bool `json:"thisNode"` +} + +type BucketNotFoundError struct { + bucket string +} + +func (e *BucketNotFoundError) Error() string { + return fmt.Sprint("No bucket named " + e.bucket) +} + +type BucketAuth struct { + name string + saslPwd string + bucket string +} + +func newBucketAuth(name string, pass string, bucket string) *BucketAuth { + return &BucketAuth{name: name, saslPwd: pass, bucket: bucket} +} + +func (ba *BucketAuth) GetCredentials() (string, string, string) { + return ba.name, ba.saslPwd, ba.bucket +} + +// VBServerMap returns the current VBucketServerMap. +func (b *Bucket) VBServerMap() *VBucketServerMap { + b.RLock() + defer b.RUnlock() + ret := (*VBucketServerMap)(b.vBucketServerMap) + return ret +} + +func (b *Bucket) GetVBmap(addrs []string) (map[string][]uint16, error) { + vbmap := b.VBServerMap() + servers := vbmap.ServerList + if addrs == nil { + addrs = vbmap.ServerList + } + + m := make(map[string][]uint16) + for _, addr := range addrs { + m[addr] = make([]uint16, 0) + } + for vbno, idxs := range vbmap.VBucketMap { + if len(idxs) == 0 { + return nil, fmt.Errorf("vbmap: No KV node no for vb %d", vbno) + } else if idxs[0] < 0 || idxs[0] >= len(servers) { + return nil, fmt.Errorf("vbmap: Invalid KV node no %d for vb %d", idxs[0], vbno) + } + addr := servers[idxs[0]] + if _, ok := m[addr]; ok { + m[addr] = append(m[addr], uint16(vbno)) + } + } + return m, nil +} + +// true if node is not on the bucket VBmap +func (b *Bucket) checkVBmap(node string) bool { + vbmap := b.VBServerMap() + servers := vbmap.ServerList + + for _, idxs := range vbmap.VBucketMap { + if len(idxs) == 0 { + return true + } else if idxs[0] < 0 || idxs[0] >= len(servers) { + return true + } + if servers[idxs[0]] == node { + return false + } + } + return true +} + +func (b *Bucket) GetName() string { + b.RLock() + defer b.RUnlock() + ret := b.Name + return ret +} + +// Nodes returns teh current list of nodes servicing this bucket. 
+func (b *Bucket) Nodes() []Node { + b.RLock() + defer b.RUnlock() + ret := *(*[]Node)(b.nodeList) + return ret +} + +// return the list of healthy nodes +func (b *Bucket) HealthyNodes() []Node { + nodes := []Node{} + + for _, n := range b.Nodes() { + if n.Status == "healthy" && n.CouchAPIBase != "" { + nodes = append(nodes, n) + } + if n.Status != "healthy" { // log non-healthy node + logging.Infof("Non-healthy node; node details:") + logging.Infof("Hostname=%v, Status=%v, CouchAPIBase=%v, ThisNode=%v", n.Hostname, n.Status, n.CouchAPIBase, n.ThisNode) + } + } + + return nodes +} + +func (b *Bucket) getConnPools(bucketLocked bool) []*connectionPool { + if !bucketLocked { + b.RLock() + defer b.RUnlock() + } + if b.connPools != nil { + return *(*[]*connectionPool)(b.connPools) + } else { + return nil + } +} + +func (b *Bucket) replaceConnPools(with []*connectionPool) { + b.Lock() + defer b.Unlock() + + old := b.connPools + b.connPools = unsafe.Pointer(&with) + if old != nil { + for _, pool := range *(*[]*connectionPool)(old) { + if pool != nil { + pool.Close() + } + } + } + return +} + +func (b *Bucket) getConnPool(i int) *connectionPool { + + if i < 0 { + return nil + } + + p := b.getConnPools(false /* not already locked */) + if len(p) > i { + return p[i] + } + + return nil +} + +func (b *Bucket) getConnPoolByHost(host string, bucketLocked bool) *connectionPool { + pools := b.getConnPools(bucketLocked) + for _, p := range pools { + if p != nil && p.host == host { + return p + } + } + + return nil +} + +// Given a vbucket number, returns a memcached connection to it. +// The connection must be returned to its pool after use. +func (b *Bucket) getConnectionToVBucket(vb uint32) (*memcached.Client, *connectionPool, error) { + for { + vbm := b.VBServerMap() + if len(vbm.VBucketMap) < int(vb) { + return nil, nil, fmt.Errorf("go-couchbase: vbmap smaller than vbucket list: %v vs. %v", + vb, vbm.VBucketMap) + } + masterId := vbm.VBucketMap[vb][0] + if masterId < 0 { + return nil, nil, fmt.Errorf("go-couchbase: No master for vbucket %d", vb) + } + pool := b.getConnPool(masterId) + conn, err := pool.Get() + if err != errClosedPool { + return conn, pool, err + } + // If conn pool was closed, because another goroutine refreshed the vbucket map, retry... + } +} + +// To get random documents, we need to cover all the nodes, so select +// a connection at random. + +func (b *Bucket) getRandomConnection() (*memcached.Client, *connectionPool, error) { + for { + var currentPool = 0 + pools := b.getConnPools(false /* not already locked */) + if len(pools) == 0 { + return nil, nil, fmt.Errorf("No connection pool found") + } else if len(pools) > 1 { // choose a random connection + currentPool = rand.Intn(len(pools)) + } // if only one pool, currentPool defaults to 0, i.e., the only pool + + // get the pool + pool := pools[currentPool] + conn, err := pool.Get() + if err != errClosedPool { + return conn, pool, err + } + + // If conn pool was closed, because another goroutine refreshed the vbucket map, retry... + } +} + +// +// Get a random document from a bucket. Since the bucket may be distributed +// across nodes, we must first select a random connection, and then use the +// Client.GetRandomDoc() call to get a random document from that node. 
+// + +func (b *Bucket) GetRandomDoc() (*gomemcached.MCResponse, error) { + // get a connection from the pool + conn, pool, err := b.getRandomConnection() + + if err != nil { + return nil, err + } + + // get a randomm document from the connection + doc, err := conn.GetRandomDoc() + // need to return the connection to the pool + pool.Return(conn) + return doc, err +} + +func (b *Bucket) getMasterNode(i int) string { + p := b.getConnPools(false /* not already locked */) + if len(p) > i { + return p[i].host + } + return "" +} + +func (b *Bucket) authHandler(bucketLocked bool) (ah AuthHandler) { + if !bucketLocked { + b.RLock() + defer b.RUnlock() + } + pool := b.pool + name := b.Name + + if pool != nil { + ah = pool.client.ah + } + if mbah, ok := ah.(MultiBucketAuthHandler); ok { + return mbah.ForBucket(name) + } + if ah == nil { + ah = &basicAuth{name, ""} + } + return +} + +// NodeAddresses gets the (sorted) list of memcached node addresses +// (hostname:port). +func (b *Bucket) NodeAddresses() []string { + vsm := b.VBServerMap() + rv := make([]string, len(vsm.ServerList)) + copy(rv, vsm.ServerList) + sort.Strings(rv) + return rv +} + +// CommonAddressSuffix finds the longest common suffix of all +// host:port strings in the node list. +func (b *Bucket) CommonAddressSuffix() string { + input := []string{} + for _, n := range b.Nodes() { + input = append(input, n.Hostname) + } + return FindCommonSuffix(input) +} + +// A Client is the starting point for all services across all buckets +// in a Couchbase cluster. +type Client struct { + BaseURL *url.URL + ah AuthHandler + Info Pools +} + +func maybeAddAuth(req *http.Request, ah AuthHandler) error { + if hah, ok := ah.(HTTPAuthHandler); ok { + return hah.SetCredsForRequest(req) + } + if ah != nil { + user, pass, _ := ah.GetCredentials() + req.Header.Set("Authorization", "Basic "+ + base64.StdEncoding.EncodeToString([]byte(user+":"+pass))) + } + return nil +} + +// arbitary number, may need to be tuned #FIXME +const HTTP_MAX_RETRY = 5 + +// Someday golang network packages will implement standard +// error codes. 
Until then #sigh +func isHttpConnError(err error) bool { + + estr := err.Error() + return strings.Contains(estr, "broken pipe") || + strings.Contains(estr, "broken connection") || + strings.Contains(estr, "connection reset") +} + +var client *http.Client + +func ClientConfigForX509(certFile, keyFile, rootFile string) (*tls.Config, error) { + cfg := &tls.Config{} + + if certFile != "" && keyFile != "" { + tlsCert, err := tls.LoadX509KeyPair(certFile, keyFile) + if err != nil { + return nil, err + } + cfg.Certificates = []tls.Certificate{tlsCert} + } else { + //error need to pass both certfile and keyfile + return nil, fmt.Errorf("N1QL: Need to pass both certfile and keyfile") + } + + var caCert []byte + var err1 error + + caCertPool := x509.NewCertPool() + if rootFile != "" { + // Read that value in + caCert, err1 = ioutil.ReadFile(rootFile) + if err1 != nil { + return nil, fmt.Errorf(" Error in reading cacert file, err: %v", err1) + } + caCertPool.AppendCertsFromPEM(caCert) + } + + cfg.RootCAs = caCertPool + return cfg, nil +} + +func doHTTPRequest(req *http.Request) (*http.Response, error) { + + var err error + var res *http.Response + + tr := &http.Transport{} + + // we need a client that ignores certificate errors, since we self-sign + // our certs + if client == nil && req.URL.Scheme == "https" { + if skipVerify { + tr = &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + } + } else { + // Handle cases with cert + + cfg, err := ClientConfigForX509(certFile, keyFile, rootFile) + if err != nil { + return nil, err + } + + tr = &http.Transport{ + TLSClientConfig: cfg, + } + } + + client = &http.Client{Transport: tr} + + } else if client == nil { + client = HTTPClient + } + + for i := 0; i < HTTP_MAX_RETRY; i++ { + res, err = client.Do(req) + if err != nil && isHttpConnError(err) { + continue + } + break + } + + if err != nil { + return nil, err + } + + return res, err +} + +func doPutAPI(baseURL *url.URL, path string, params map[string]interface{}, authHandler AuthHandler, out interface{}) error { + return doOutputAPI("PUT", baseURL, path, params, authHandler, out) +} + +func doPostAPI(baseURL *url.URL, path string, params map[string]interface{}, authHandler AuthHandler, out interface{}) error { + return doOutputAPI("POST", baseURL, path, params, authHandler, out) +} + +func doOutputAPI( + httpVerb string, + baseURL *url.URL, + path string, + params map[string]interface{}, + authHandler AuthHandler, + out interface{}) error { + + var requestUrl string + + if q := strings.Index(path, "?"); q > 0 { + requestUrl = baseURL.Scheme + "://" + baseURL.Host + path[:q] + "?" 
+ path[q+1:] + } else { + requestUrl = baseURL.Scheme + "://" + baseURL.Host + path + } + + postData := url.Values{} + for k, v := range params { + postData.Set(k, fmt.Sprintf("%v", v)) + } + + req, err := http.NewRequest(httpVerb, requestUrl, bytes.NewBufferString(postData.Encode())) + if err != nil { + return err + } + + req.Header.Add("Content-Type", "application/x-www-form-urlencoded") + + err = maybeAddAuth(req, authHandler) + if err != nil { + return err + } + + res, err := doHTTPRequest(req) + if err != nil { + return err + } + + defer res.Body.Close() + if res.StatusCode != 200 { + bod, _ := ioutil.ReadAll(io.LimitReader(res.Body, 512)) + return fmt.Errorf("HTTP error %v getting %q: %s", + res.Status, requestUrl, bod) + } + + d := json.NewDecoder(res.Body) + if err = d.Decode(&out); err != nil { + return err + } + return nil +} + +func queryRestAPI( + baseURL *url.URL, + path string, + authHandler AuthHandler, + out interface{}) error { + + var requestUrl string + + if q := strings.Index(path, "?"); q > 0 { + requestUrl = baseURL.Scheme + "://" + baseURL.Host + path[:q] + "?" + path[q+1:] + } else { + requestUrl = baseURL.Scheme + "://" + baseURL.Host + path + } + + req, err := http.NewRequest("GET", requestUrl, nil) + if err != nil { + return err + } + + err = maybeAddAuth(req, authHandler) + if err != nil { + return err + } + + res, err := doHTTPRequest(req) + if err != nil { + return err + } + + defer res.Body.Close() + if res.StatusCode != 200 { + bod, _ := ioutil.ReadAll(io.LimitReader(res.Body, 512)) + return fmt.Errorf("HTTP error %v getting %q: %s", + res.Status, requestUrl, bod) + } + + d := json.NewDecoder(res.Body) + if err = d.Decode(&out); err != nil { + return err + } + return nil +} + +func (c *Client) ProcessStream(path string, callb func(interface{}) error, data interface{}) error { + return c.processStream(c.BaseURL, path, c.ah, callb, data) +} + +// Based on code in http://src.couchbase.org/source/xref/trunk/goproj/src/github.com/couchbase/indexing/secondary/dcp/pools.go#309 +func (c *Client) processStream(baseURL *url.URL, path string, authHandler AuthHandler, callb func(interface{}) error, data interface{}) error { + var requestUrl string + + if q := strings.Index(path, "?"); q > 0 { + requestUrl = baseURL.Scheme + "://" + baseURL.Host + path[:q] + "?" 
+ path[q+1:] + } else { + requestUrl = baseURL.Scheme + "://" + baseURL.Host + path + } + + req, err := http.NewRequest("GET", requestUrl, nil) + if err != nil { + return err + } + + err = maybeAddAuth(req, authHandler) + if err != nil { + return err + } + + res, err := doHTTPRequest(req) + if err != nil { + return err + } + + defer res.Body.Close() + if res.StatusCode != 200 { + bod, _ := ioutil.ReadAll(io.LimitReader(res.Body, 512)) + return fmt.Errorf("HTTP error %v getting %q: %s", + res.Status, requestUrl, bod) + } + + reader := bufio.NewReader(res.Body) + for { + bs, err := reader.ReadBytes('\n') + if err != nil { + return err + } + if len(bs) == 1 && bs[0] == '\n' { + continue + } + + err = json.Unmarshal(bs, data) + if err != nil { + return err + } + err = callb(data) + if err != nil { + return err + } + } + return nil + +} + +func (c *Client) parseURLResponse(path string, out interface{}) error { + return queryRestAPI(c.BaseURL, path, c.ah, out) +} + +func (c *Client) parsePostURLResponse(path string, params map[string]interface{}, out interface{}) error { + return doPostAPI(c.BaseURL, path, params, c.ah, out) +} + +func (c *Client) parsePutURLResponse(path string, params map[string]interface{}, out interface{}) error { + return doPutAPI(c.BaseURL, path, params, c.ah, out) +} + +func (b *Bucket) parseURLResponse(path string, out interface{}) error { + nodes := b.Nodes() + if len(nodes) == 0 { + return errors.New("no couch rest URLs") + } + + // Pick a random node to start querying. + startNode := rand.Intn(len(nodes)) + maxRetries := len(nodes) + for i := 0; i < maxRetries; i++ { + node := nodes[(startNode+i)%len(nodes)] // Wrap around the nodes list. + // Skip non-healthy nodes. + if node.Status != "healthy" || node.CouchAPIBase == "" { + continue + } + url := &url.URL{ + Host: node.Hostname, + Scheme: "http", + } + + // Lock here to avoid having pool closed under us. + b.RLock() + err := queryRestAPI(url, path, b.pool.client.ah, out) + b.RUnlock() + if err == nil { + return err + } + } + return errors.New("All nodes failed to respond or no healthy nodes for bucket found") +} + +func (b *Bucket) parseAPIResponse(path string, out interface{}) error { + nodes := b.Nodes() + if len(nodes) == 0 { + return errors.New("no couch rest URLs") + } + + var err error + var u *url.URL + + // Pick a random node to start querying. + startNode := rand.Intn(len(nodes)) + maxRetries := len(nodes) + for i := 0; i < maxRetries; i++ { + node := nodes[(startNode+i)%len(nodes)] // Wrap around the nodes list. + // Skip non-healthy nodes. + if node.Status != "healthy" || node.CouchAPIBase == "" { + continue + } + + u, err = ParseURL(node.CouchAPIBase) + // Lock here so pool does not get closed under us. + b.RLock() + if err != nil { + b.RUnlock() + return fmt.Errorf("config error: Bucket %q node #%d CouchAPIBase=%q: %v", + b.Name, i, node.CouchAPIBase, err) + } else if b.pool != nil { + u.User = b.pool.client.BaseURL.User + } + u.Path = path + + // generate the path so that the strings are properly escaped + // MB-13770 + requestPath := strings.Split(u.String(), u.Host)[1] + + err = queryRestAPI(u, requestPath, b.pool.client.ah, out) + b.RUnlock() + if err == nil { + return err + } + } + + var errStr string + if err != nil { + errStr = "Error " + err.Error() + } + + return errors.New("All nodes failed to respond or returned error or no healthy nodes for bucket found." 
+ errStr) +} + +type basicAuth struct { + u, p string +} + +func (b basicAuth) GetCredentials() (string, string, string) { + return b.u, b.p, b.u +} + +func basicAuthFromURL(us string) (ah AuthHandler) { + u, err := ParseURL(us) + if err != nil { + return + } + if user := u.User; user != nil { + pw, _ := user.Password() + ah = basicAuth{user.Username(), pw} + } + return +} + +// ConnectWithAuth connects to a couchbase cluster with the given +// authentication handler. +func ConnectWithAuth(baseU string, ah AuthHandler) (c Client, err error) { + c.BaseURL, err = ParseURL(baseU) + if err != nil { + return + } + c.ah = ah + + return c, c.parseURLResponse("/pools", &c.Info) +} + +// ConnectWithAuthCreds connects to a couchbase cluster with the give +// authorization creds returned by cb_auth +func ConnectWithAuthCreds(baseU, username, password string) (c Client, err error) { + c.BaseURL, err = ParseURL(baseU) + if err != nil { + return + } + + c.ah = newBucketAuth(username, password, "") + return c, c.parseURLResponse("/pools", &c.Info) + +} + +// Connect to a couchbase cluster. An authentication handler will be +// created from the userinfo in the URL if provided. +func Connect(baseU string) (Client, error) { + return ConnectWithAuth(baseU, basicAuthFromURL(baseU)) +} + +type BucketInfo struct { + Name string // name of bucket + Password string // SASL password of bucket +} + +//Get SASL buckets +func GetBucketList(baseU string) (bInfo []BucketInfo, err error) { + + c := &Client{} + c.BaseURL, err = ParseURL(baseU) + if err != nil { + return + } + c.ah = basicAuthFromURL(baseU) + + var buckets []Bucket + err = c.parseURLResponse("/pools/default/buckets", &buckets) + if err != nil { + return + } + bInfo = make([]BucketInfo, 0) + for _, bucket := range buckets { + bucketInfo := BucketInfo{Name: bucket.Name, Password: bucket.Password} + bInfo = append(bInfo, bucketInfo) + } + return bInfo, err +} + +//Set viewUpdateDaemonOptions +func SetViewUpdateParams(baseU string, params map[string]interface{}) (viewOpts map[string]interface{}, err error) { + + c := &Client{} + c.BaseURL, err = ParseURL(baseU) + if err != nil { + return + } + c.ah = basicAuthFromURL(baseU) + + if len(params) < 1 { + return nil, fmt.Errorf("No params to set") + } + + err = c.parsePostURLResponse("/settings/viewUpdateDaemon", params, &viewOpts) + if err != nil { + return + } + return viewOpts, err +} + +// This API lets the caller know, if the list of nodes a bucket is +// connected to has gone through an edit (a rebalance operation) +// since the last update to the bucket, in which case a Refresh is +// advised. +func (b *Bucket) NodeListChanged() bool { + b.RLock() + pool := b.pool + uri := b.URI + b.RUnlock() + + tmpb := &Bucket{} + err := pool.client.parseURLResponse(uri, tmpb) + if err != nil { + return true + } + + bNodes := *(*[]Node)(b.nodeList) + if len(bNodes) != len(tmpb.NodesJSON) { + return true + } + + bucketHostnames := map[string]bool{} + for _, node := range bNodes { + bucketHostnames[node.Hostname] = true + } + + for _, node := range tmpb.NodesJSON { + if _, found := bucketHostnames[node.Hostname]; !found { + return true + } + } + + return false +} + +// Sample data for scopes and collections as returned from the +// /pooles/default/$BUCKET_NAME/collections API. +// {"myScope2":{"myCollectionC":{}},"myScope1":{"myCollectionB":{},"myCollectionA":{}},"_default":{"_default":{}}} + +// A Scopes holds the set of scopes in a bucket. +// The map key is the name of the scope. 
+type Scopes map[string]Collections + +// A Collections holds the set of collections in a scope. +// The map key is the name of the collection. +type Collections map[string]Collection + +// A Collection holds the information for a collection. +// It is currently returned empty. +type Collection struct{} + +func getScopesAndCollections(pool *Pool, bucketName string) (Scopes, error) { + scopes := make(Scopes) + // This URL is a bit of a hack. The "default" is the name of the pool, and should + // be a parameter. But the name does not appear to be available anywhere, + // and in any case we never use a pool other than "default". + err := pool.client.parseURLResponse(fmt.Sprintf("/pools/default/buckets/%s/collections", bucketName), &scopes) + if err != nil { + return nil, err + } + return scopes, nil +} + +func (b *Bucket) Refresh() error { + b.RLock() + pool := b.pool + uri := b.URI + name := b.Name + b.RUnlock() + + tmpb := &Bucket{} + err := pool.client.parseURLResponse(uri, tmpb) + if err != nil { + return err + } + + scopes, err := getScopesAndCollections(pool, name) + if err != nil { + return err + } + + pools := b.getConnPools(false /* bucket not already locked */) + + // We need this lock to ensure that bucket refreshes happening because + // of NMVb errors received during bulkGet do not end up over-writing + // pool.inUse. + b.Lock() + + for _, pool := range pools { + if pool != nil { + pool.inUse = false + } + } + + newcps := make([]*connectionPool, len(tmpb.VBSMJson.ServerList)) + for i := range newcps { + + pool := b.getConnPoolByHost(tmpb.VBSMJson.ServerList[i], true /* bucket already locked */) + if pool != nil && pool.inUse == false { + // if the hostname and index is unchanged then reuse this pool + newcps[i] = pool + pool.inUse = true + continue + } + + if b.ah != nil { + newcps[i] = newConnectionPool( + tmpb.VBSMJson.ServerList[i], + b.ah, AsynchronousCloser, PoolSize, PoolOverflow) + + } else { + newcps[i] = newConnectionPool( + tmpb.VBSMJson.ServerList[i], + b.authHandler(true /* bucket already locked */), + AsynchronousCloser, PoolSize, PoolOverflow) + } + } + b.replaceConnPools2(newcps, true /* bucket already locked */) + tmpb.ah = b.ah + b.vBucketServerMap = unsafe.Pointer(&tmpb.VBSMJson) + b.nodeList = unsafe.Pointer(&tmpb.NodesJSON) + b.Scopes = scopes + + b.Unlock() + return nil +} + +func (p *Pool) refresh() (err error) { + p.BucketMap = make(map[string]Bucket) + + buckets := []Bucket{} + err = p.client.parseURLResponse(p.BucketURL["uri"], &buckets) + if err != nil { + return err + } + for _, b := range buckets { + b.pool = p + b.nodeList = unsafe.Pointer(&b.NodesJSON) + b.replaceConnPools(make([]*connectionPool, len(b.VBSMJson.ServerList))) + + p.BucketMap[b.Name] = b + } + return nil +} + +// GetPool gets a pool from within the couchbase cluster (usually +// "default"). +func (c *Client) GetPool(name string) (p Pool, err error) { + var poolURI string + for _, p := range c.Info.Pools { + if p.Name == name { + poolURI = p.URI + } + } + if poolURI == "" { + return p, errors.New("No pool named " + name) + } + + err = c.parseURLResponse(poolURI, &p) + + p.client = *c + + err = p.refresh() + return +} + +// GetPoolServices returns all the bucket-independent services in a pool. 
+// (See "Exposing services outside of bucket context" in http://goo.gl/uuXRkV) +func (c *Client) GetPoolServices(name string) (ps PoolServices, err error) { + var poolName string + for _, p := range c.Info.Pools { + if p.Name == name { + poolName = p.Name + } + } + if poolName == "" { + return ps, errors.New("No pool named " + name) + } + + poolURI := "/pools/" + poolName + "/nodeServices" + err = c.parseURLResponse(poolURI, &ps) + + return +} + +// Close marks this bucket as no longer needed, closing connections it +// may have open. +func (b *Bucket) Close() { + b.Lock() + defer b.Unlock() + if b.connPools != nil { + for _, c := range b.getConnPools(true /* already locked */) { + if c != nil { + c.Close() + } + } + b.connPools = nil + } +} + +func bucketFinalizer(b *Bucket) { + if b.connPools != nil { + logging.Warnf("Finalizing a bucket with active connections.") + } +} + +// GetBucket gets a bucket from within this pool. +func (p *Pool) GetBucket(name string) (*Bucket, error) { + rv, ok := p.BucketMap[name] + if !ok { + return nil, &BucketNotFoundError{bucket: name} + } + runtime.SetFinalizer(&rv, bucketFinalizer) + err := rv.Refresh() + if err != nil { + return nil, err + } + return &rv, nil +} + +// GetBucket gets a bucket from within this pool. +func (p *Pool) GetBucketWithAuth(bucket, username, password string) (*Bucket, error) { + rv, ok := p.BucketMap[bucket] + if !ok { + return nil, &BucketNotFoundError{bucket: bucket} + } + runtime.SetFinalizer(&rv, bucketFinalizer) + rv.ah = newBucketAuth(username, password, bucket) + err := rv.Refresh() + if err != nil { + return nil, err + } + return &rv, nil +} + +// GetPool gets the pool to which this bucket belongs. +func (b *Bucket) GetPool() *Pool { + b.RLock() + defer b.RUnlock() + ret := b.pool + return ret +} + +// GetClient gets the client from which we got this pool. +func (p *Pool) GetClient() *Client { + return &p.client +} + +// GetBucket is a convenience function for getting a named bucket from +// a URL +func GetBucket(endpoint, poolname, bucketname string) (*Bucket, error) { + var err error + client, err := Connect(endpoint) + if err != nil { + return nil, err + } + + pool, err := client.GetPool(poolname) + if err != nil { + return nil, err + } + + return pool.GetBucket(bucketname) +} + +// ConnectWithAuthAndGetBucket is a convenience function for +// getting a named bucket from a given URL and an auth callback +func ConnectWithAuthAndGetBucket(endpoint, poolname, bucketname string, + ah AuthHandler) (*Bucket, error) { + client, err := ConnectWithAuth(endpoint, ah) + if err != nil { + return nil, err + } + + pool, err := client.GetPool(poolname) + if err != nil { + return nil, err + } + + return pool.GetBucket(bucketname) +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/streaming.go b/vendor/github.com/couchbaselabs/go-couchbase/streaming.go new file mode 100644 index 000000000000..646763537160 --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/streaming.go @@ -0,0 +1,209 @@ +package couchbase + +import ( + "encoding/json" + "fmt" + "github.com/couchbase/goutils/logging" + "io" + "io/ioutil" + "math/rand" + "net" + "net/http" + "time" + "unsafe" +) + +// Bucket auto-updater gets the latest version of the bucket config from +// the server. If the configuration has changed then updated the local +// bucket information. 
If the bucket has been deleted then notify anyone +// who is holding a reference to this bucket + +const MAX_RETRY_COUNT = 5 +const DISCONNECT_PERIOD = 120 * time.Second + +type NotifyFn func(bucket string, err error) + +// Use TCP keepalive to detect half close sockets +var updaterTransport http.RoundTripper = &http.Transport{ + Proxy: http.ProxyFromEnvironment, + Dial: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + }).Dial, +} + +var updaterHTTPClient = &http.Client{Transport: updaterTransport} + +func doHTTPRequestForUpdate(req *http.Request) (*http.Response, error) { + + var err error + var res *http.Response + + for i := 0; i < HTTP_MAX_RETRY; i++ { + res, err = updaterHTTPClient.Do(req) + if err != nil && isHttpConnError(err) { + continue + } + break + } + + if err != nil { + return nil, err + } + + return res, err +} + +func (b *Bucket) RunBucketUpdater(notify NotifyFn) { + go func() { + err := b.UpdateBucket() + if err != nil { + if notify != nil { + notify(b.GetName(), err) + } + logging.Errorf(" Bucket Updater exited with err %v", err) + } + }() +} + +func (b *Bucket) replaceConnPools2(with []*connectionPool, bucketLocked bool) { + if !bucketLocked { + b.Lock() + defer b.Unlock() + } + old := b.connPools + b.connPools = unsafe.Pointer(&with) + if old != nil { + for _, pool := range *(*[]*connectionPool)(old) { + if pool != nil && pool.inUse == false { + pool.Close() + } + } + } + return +} + +func (b *Bucket) UpdateBucket() error { + + var failures int + var returnErr error + + for { + + if failures == MAX_RETRY_COUNT { + logging.Errorf(" Maximum failures reached. Exiting loop...") + return fmt.Errorf("Max failures reached. Last Error %v", returnErr) + } + + nodes := b.Nodes() + if len(nodes) < 1 { + return fmt.Errorf("No healthy nodes found") + } + + startNode := rand.Intn(len(nodes)) + node := nodes[(startNode)%len(nodes)] + + streamUrl := fmt.Sprintf("http://%s/pools/default/bucketsStreaming/%s", node.Hostname, b.GetName()) + logging.Infof(" Trying with %s", streamUrl) + req, err := http.NewRequest("GET", streamUrl, nil) + if err != nil { + return err + } + + b.RLock() + pool := b.pool + bucketName := b.Name + b.RUnlock() + scopes, err := getScopesAndCollections(pool, bucketName) + if err != nil { + return err + } + + // Lock here to avoid having pool closed under us. + b.RLock() + err = maybeAddAuth(req, b.pool.client.ah) + b.RUnlock() + if err != nil { + return err + } + + res, err := doHTTPRequestForUpdate(req) + if err != nil { + return err + } + + if res.StatusCode != 200 { + bod, _ := ioutil.ReadAll(io.LimitReader(res.Body, 512)) + logging.Errorf("Failed to connect to host, unexpected status code: %v. Body %s", res.StatusCode, bod) + res.Body.Close() + returnErr = fmt.Errorf("Failed to connect to host. 
Status %v Body %s", res.StatusCode, bod) + failures++ + continue + } + + dec := json.NewDecoder(res.Body) + + tmpb := &Bucket{} + for { + + err := dec.Decode(&tmpb) + if err != nil { + returnErr = err + res.Body.Close() + break + } + + // if we got here, reset failure count + failures = 0 + b.Lock() + + // mark all the old connection pools for deletion + pools := b.getConnPools(true /* already locked */) + for _, pool := range pools { + if pool != nil { + pool.inUse = false + } + } + + newcps := make([]*connectionPool, len(tmpb.VBSMJson.ServerList)) + for i := range newcps { + // get the old connection pool and check if it is still valid + pool := b.getConnPoolByHost(tmpb.VBSMJson.ServerList[i], true /* bucket already locked */) + if pool != nil && pool.inUse == false { + // if the hostname and index is unchanged then reuse this pool + newcps[i] = pool + pool.inUse = true + continue + } + // else create a new pool + if b.ah != nil { + newcps[i] = newConnectionPool( + tmpb.VBSMJson.ServerList[i], + b.ah, false, PoolSize, PoolOverflow) + + } else { + newcps[i] = newConnectionPool( + tmpb.VBSMJson.ServerList[i], + b.authHandler(true /* bucket already locked */), + false, PoolSize, PoolOverflow) + } + } + + b.replaceConnPools2(newcps, true /* bucket already locked */) + + tmpb.ah = b.ah + b.vBucketServerMap = unsafe.Pointer(&tmpb.VBSMJson) + b.nodeList = unsafe.Pointer(&tmpb.NodesJSON) + b.Scopes = scopes + b.Unlock() + + logging.Infof("Got new configuration for bucket %s", b.GetName()) + + } + // we are here because of an error + failures++ + continue + + } + return nil +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/tap.go b/vendor/github.com/couchbaselabs/go-couchbase/tap.go new file mode 100644 index 000000000000..86edd30554ff --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/tap.go @@ -0,0 +1,143 @@ +package couchbase + +import ( + "github.com/couchbase/gomemcached/client" + "github.com/couchbase/goutils/logging" + "sync" + "time" +) + +const initialRetryInterval = 1 * time.Second +const maximumRetryInterval = 30 * time.Second + +// A TapFeed streams mutation events from a bucket. +// +// Events from the bucket can be read from the channel 'C'. Remember +// to call Close() on it when you're done, unless its channel has +// closed itself already. 
+type TapFeed struct { + C <-chan memcached.TapEvent + + bucket *Bucket + args *memcached.TapArguments + nodeFeeds []*memcached.TapFeed // The TAP feeds of the individual nodes + output chan memcached.TapEvent // Same as C but writeably-typed + wg sync.WaitGroup + quit chan bool +} + +// StartTapFeed creates and starts a new Tap feed +func (b *Bucket) StartTapFeed(args *memcached.TapArguments) (*TapFeed, error) { + if args == nil { + defaultArgs := memcached.DefaultTapArguments() + args = &defaultArgs + } + + feed := &TapFeed{ + bucket: b, + args: args, + output: make(chan memcached.TapEvent, 10), + quit: make(chan bool), + } + + go feed.run() + + feed.C = feed.output + return feed, nil +} + +// Goroutine that runs the feed +func (feed *TapFeed) run() { + retryInterval := initialRetryInterval + bucketOK := true + for { + // Connect to the TAP feed of each server node: + if bucketOK { + killSwitch, err := feed.connectToNodes() + if err == nil { + // Run until one of the sub-feeds fails: + select { + case <-killSwitch: + case <-feed.quit: + return + } + feed.closeNodeFeeds() + retryInterval = initialRetryInterval + } + } + + // On error, try to refresh the bucket in case the list of nodes changed: + logging.Infof("go-couchbase: TAP connection lost; reconnecting to bucket %q in %v", + feed.bucket.Name, retryInterval) + err := feed.bucket.Refresh() + bucketOK = err == nil + + select { + case <-time.After(retryInterval): + case <-feed.quit: + return + } + if retryInterval *= 2; retryInterval > maximumRetryInterval { + retryInterval = maximumRetryInterval + } + } +} + +func (feed *TapFeed) connectToNodes() (killSwitch chan bool, err error) { + killSwitch = make(chan bool) + for _, serverConn := range feed.bucket.getConnPools(false /* not already locked */) { + var singleFeed *memcached.TapFeed + singleFeed, err = serverConn.StartTapFeed(feed.args) + if err != nil { + logging.Errorf("go-couchbase: Error connecting to tap feed of %s: %v", serverConn.host, err) + feed.closeNodeFeeds() + return + } + feed.nodeFeeds = append(feed.nodeFeeds, singleFeed) + go feed.forwardTapEvents(singleFeed, killSwitch, serverConn.host) + feed.wg.Add(1) + } + return +} + +// Goroutine that forwards Tap events from a single node's feed to the aggregate feed. +func (feed *TapFeed) forwardTapEvents(singleFeed *memcached.TapFeed, killSwitch chan bool, host string) { + defer feed.wg.Done() + for { + select { + case event, ok := <-singleFeed.C: + if !ok { + if singleFeed.Error != nil { + logging.Errorf("go-couchbase: Tap feed from %s failed: %v", host, singleFeed.Error) + } + killSwitch <- true + return + } + feed.output <- event + case <-feed.quit: + return + } + } +} + +func (feed *TapFeed) closeNodeFeeds() { + for _, f := range feed.nodeFeeds { + f.Close() + } + feed.nodeFeeds = nil +} + +// Close a Tap feed. +func (feed *TapFeed) Close() error { + select { + case <-feed.quit: + return nil + default: + } + + feed.closeNodeFeeds() + close(feed.quit) + feed.wg.Wait() + close(feed.output) + return nil +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/upr.go b/vendor/github.com/couchbaselabs/go-couchbase/upr.go new file mode 100644 index 000000000000..bf1b209b7e4d --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/upr.go @@ -0,0 +1,398 @@ +package couchbase + +import ( + "log" + "sync" + "time" + + "fmt" + "github.com/couchbase/gomemcached" + "github.com/couchbase/gomemcached/client" + "github.com/couchbase/goutils/logging" +) + +// A UprFeed streams mutation events from a bucket. 
+// +// Events from the bucket can be read from the channel 'C'. Remember +// to call Close() on it when you're done, unless its channel has +// closed itself already. +type UprFeed struct { + C <-chan *memcached.UprEvent + + bucket *Bucket + nodeFeeds map[string]*FeedInfo // The UPR feeds of the individual nodes + output chan *memcached.UprEvent // Same as C but writeably-typed + outputClosed bool + quit chan bool + name string // name of this UPR feed + sequence uint32 // sequence number for this feed + connected bool + killSwitch chan bool + closing bool + wg sync.WaitGroup + dcp_buffer_size uint32 + data_chan_size int +} + +// UprFeed from a single connection +type FeedInfo struct { + uprFeed *memcached.UprFeed // UPR feed handle + host string // hostname + connected bool // connected + quit chan bool // quit channel +} + +type FailoverLog map[uint16]memcached.FailoverLog + +// GetFailoverLogs, get the failover logs for a set of vbucket ids +func (b *Bucket) GetFailoverLogs(vBuckets []uint16) (FailoverLog, error) { + + // map vbids to their corresponding hosts + vbHostList := make(map[string][]uint16) + vbm := b.VBServerMap() + if len(vbm.VBucketMap) < len(vBuckets) { + return nil, fmt.Errorf("vbmap smaller than vbucket list: %v vs. %v", + vbm.VBucketMap, vBuckets) + } + + for _, vb := range vBuckets { + masterID := vbm.VBucketMap[vb][0] + master := b.getMasterNode(masterID) + if master == "" { + return nil, fmt.Errorf("No master found for vb %d", vb) + } + + vbList := vbHostList[master] + if vbList == nil { + vbList = make([]uint16, 0) + } + vbList = append(vbList, vb) + vbHostList[master] = vbList + } + + failoverLogMap := make(FailoverLog) + for _, serverConn := range b.getConnPools(false /* not already locked */) { + + vbList := vbHostList[serverConn.host] + if vbList == nil { + continue + } + + mc, err := serverConn.Get() + if err != nil { + logging.Infof("No Free connections for vblist %v", vbList) + return nil, fmt.Errorf("No Free connections for host %s", + serverConn.host) + + } + // close the connection so that it doesn't get reused for upr data + // connection + defer mc.Close() + failoverlogs, err := mc.UprGetFailoverLog(vbList) + if err != nil { + return nil, fmt.Errorf("Error getting failover log %s host %s", + err.Error(), serverConn.host) + + } + + for vb, log := range failoverlogs { + failoverLogMap[vb] = *log + } + } + + return failoverLogMap, nil +} + +func (b *Bucket) StartUprFeed(name string, sequence uint32) (*UprFeed, error) { + return b.StartUprFeedWithConfig(name, sequence, 10, DEFAULT_WINDOW_SIZE) +} + +// StartUprFeed creates and starts a new Upr feed +// No data will be sent on the channel unless vbuckets streams are requested +func (b *Bucket) StartUprFeedWithConfig(name string, sequence uint32, data_chan_size int, dcp_buffer_size uint32) (*UprFeed, error) { + + feed := &UprFeed{ + bucket: b, + output: make(chan *memcached.UprEvent, data_chan_size), + quit: make(chan bool), + nodeFeeds: make(map[string]*FeedInfo, 0), + name: name, + sequence: sequence, + killSwitch: make(chan bool), + dcp_buffer_size: dcp_buffer_size, + data_chan_size: data_chan_size, + } + + err := feed.connectToNodes() + if err != nil { + return nil, fmt.Errorf("Cannot connect to bucket %s", err.Error()) + } + feed.connected = true + go feed.run() + + feed.C = feed.output + return feed, nil +} + +// UprRequestStream starts a stream for a vb on a feed +func (feed *UprFeed) UprRequestStream(vb uint16, opaque uint16, flags uint32, + vuuid, startSequence, endSequence, snapStart, snapEnd 
uint64) error { + + defer func() { + if r := recover(); r != nil { + log.Panicf("Panic in UprRequestStream. Feed %v Bucket %v", feed, feed.bucket) + } + }() + + vbm := feed.bucket.VBServerMap() + if len(vbm.VBucketMap) < int(vb) { + return fmt.Errorf("vbmap smaller than vbucket list: %v vs. %v", + vb, vbm.VBucketMap) + } + + if int(vb) >= len(vbm.VBucketMap) { + return fmt.Errorf("Invalid vbucket id %d", vb) + } + + masterID := vbm.VBucketMap[vb][0] + master := feed.bucket.getMasterNode(masterID) + if master == "" { + return fmt.Errorf("Master node not found for vbucket %d", vb) + } + singleFeed := feed.nodeFeeds[master] + if singleFeed == nil { + return fmt.Errorf("UprFeed for this host not found") + } + + if err := singleFeed.uprFeed.UprRequestStream(vb, opaque, flags, + vuuid, startSequence, endSequence, snapStart, snapEnd); err != nil { + return err + } + + return nil +} + +// UprCloseStream ends a vbucket stream. +func (feed *UprFeed) UprCloseStream(vb, opaqueMSB uint16) error { + + defer func() { + if r := recover(); r != nil { + log.Panicf("Panic in UprCloseStream. Feed %v Bucket %v ", feed, feed.bucket) + } + }() + + vbm := feed.bucket.VBServerMap() + if len(vbm.VBucketMap) < int(vb) { + return fmt.Errorf("vbmap smaller than vbucket list: %v vs. %v", + vb, vbm.VBucketMap) + } + + if int(vb) >= len(vbm.VBucketMap) { + return fmt.Errorf("Invalid vbucket id %d", vb) + } + + masterID := vbm.VBucketMap[vb][0] + master := feed.bucket.getMasterNode(masterID) + if master == "" { + return fmt.Errorf("Master node not found for vbucket %d", vb) + } + singleFeed := feed.nodeFeeds[master] + if singleFeed == nil { + return fmt.Errorf("UprFeed for this host not found") + } + + if err := singleFeed.uprFeed.CloseStream(vb, opaqueMSB); err != nil { + return err + } + return nil +} + +// Goroutine that runs the feed +func (feed *UprFeed) run() { + retryInterval := initialRetryInterval + bucketOK := true + for { + // Connect to the UPR feed of each server node: + if bucketOK { + // Run until one of the sub-feeds fails: + select { + case <-feed.killSwitch: + case <-feed.quit: + return + } + //feed.closeNodeFeeds() + retryInterval = initialRetryInterval + } + + if feed.closing == true { + // we have been asked to shut down + return + } + + // On error, try to refresh the bucket in case the list of nodes changed: + logging.Infof("go-couchbase: UPR connection lost; reconnecting to bucket %q in %v", + feed.bucket.Name, retryInterval) + + if err := feed.bucket.Refresh(); err != nil { + // if we fail to refresh the bucket, exit the feed + // MB-14917 + logging.Infof("Unable to refresh bucket %s ", err.Error()) + close(feed.output) + feed.outputClosed = true + feed.closeNodeFeeds() + return + } + + // this will only connect to nodes that are not connected or changed + // user will have to reconnect the stream + err := feed.connectToNodes() + if err != nil { + logging.Infof("Unable to connect to nodes..exit ") + close(feed.output) + feed.outputClosed = true + feed.closeNodeFeeds() + return + } + bucketOK = err == nil + + select { + case <-time.After(retryInterval): + case <-feed.quit: + return + } + if retryInterval *= 2; retryInterval > maximumRetryInterval { + retryInterval = maximumRetryInterval + } + } +} + +func (feed *UprFeed) connectToNodes() (err error) { + nodeCount := 0 + for _, serverConn := range feed.bucket.getConnPools(false /* not already locked */) { + + // this maybe a reconnection, so check if the connection to the node + // already exists. 
Connect only if the node is not found in the list + // or connected == false + nodeFeed := feed.nodeFeeds[serverConn.host] + + if nodeFeed != nil && nodeFeed.connected == true { + continue + } + + var singleFeed *memcached.UprFeed + var name string + if feed.name == "" { + name = "DefaultUprClient" + } else { + name = feed.name + } + singleFeed, err = serverConn.StartUprFeed(name, feed.sequence, feed.dcp_buffer_size, feed.data_chan_size) + if err != nil { + logging.Errorf("go-couchbase: Error connecting to upr feed of %s: %v", serverConn.host, err) + feed.closeNodeFeeds() + return + } + // add the node to the connection map + feedInfo := &FeedInfo{ + uprFeed: singleFeed, + connected: true, + host: serverConn.host, + quit: make(chan bool), + } + feed.nodeFeeds[serverConn.host] = feedInfo + go feed.forwardUprEvents(feedInfo, feed.killSwitch, serverConn.host) + feed.wg.Add(1) + nodeCount++ + } + if nodeCount == 0 { + return fmt.Errorf("No connection to bucket") + } + + return nil +} + +// Goroutine that forwards Upr events from a single node's feed to the aggregate feed. +func (feed *UprFeed) forwardUprEvents(nodeFeed *FeedInfo, killSwitch chan bool, host string) { + singleFeed := nodeFeed.uprFeed + + defer func() { + feed.wg.Done() + if r := recover(); r != nil { + //if feed is not closing, re-throw the panic + if feed.outputClosed != true && feed.closing != true { + panic(r) + } else { + logging.Errorf("Panic is recovered. Since feed is closed, exit gracefully") + + } + } + }() + + for { + select { + case <-nodeFeed.quit: + nodeFeed.connected = false + return + + case event, ok := <-singleFeed.C: + if !ok { + if singleFeed.Error != nil { + logging.Errorf("go-couchbase: Upr feed from %s failed: %v", host, singleFeed.Error) + } + killSwitch <- true + return + } + if feed.outputClosed == true { + // someone closed the node feed + logging.Infof("Node need closed, returning from forwardUprEvent") + return + } + feed.output <- event + if event.Status == gomemcached.NOT_MY_VBUCKET { + logging.Infof(" Got a not my vbucket error !! ") + if err := feed.bucket.Refresh(); err != nil { + logging.Errorf("Unable to refresh bucket %s ", err.Error()) + feed.closeNodeFeeds() + return + } + // this will only connect to nodes that are not connected or changed + // user will have to reconnect the stream + if err := feed.connectToNodes(); err != nil { + logging.Errorf("Unable to connect to nodes %s", err.Error()) + return + } + + } + } + } +} + +func (feed *UprFeed) closeNodeFeeds() { + for _, f := range feed.nodeFeeds { + logging.Infof(" Sending close to forwardUprEvent ") + close(f.quit) + f.uprFeed.Close() + } + feed.nodeFeeds = nil +} + +// Close a Upr feed. 
+func (feed *UprFeed) Close() error { + select { + case <-feed.quit: + return nil + default: + } + + feed.closing = true + feed.closeNodeFeeds() + close(feed.quit) + + feed.wg.Wait() + if feed.outputClosed == false { + feed.outputClosed = true + close(feed.output) + } + + return nil +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/users.go b/vendor/github.com/couchbaselabs/go-couchbase/users.go new file mode 100644 index 000000000000..47d486152250 --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/users.go @@ -0,0 +1,119 @@ +package couchbase + +import ( + "bytes" + "fmt" +) + +type User struct { + Name string + Id string + Domain string + Roles []Role +} + +type Role struct { + Role string + BucketName string `json:"bucket_name"` +} + +// Sample: +// {"role":"admin","name":"Admin","desc":"Can manage ALL cluster features including security.","ce":true} +// {"role":"query_select","bucket_name":"*","name":"Query Select","desc":"Can execute SELECT statement on bucket to retrieve data"} +type RoleDescription struct { + Role string + Name string + Desc string + Ce bool + BucketName string `json:"bucket_name"` +} + +// Return user-role data, as parsed JSON. +// Sample: +// [{"id":"ivanivanov","name":"Ivan Ivanov","roles":[{"role":"cluster_admin"},{"bucket_name":"default","role":"bucket_admin"}]}, +// {"id":"petrpetrov","name":"Petr Petrov","roles":[{"role":"replication_admin"}]}] +func (c *Client) GetUserRoles() ([]interface{}, error) { + ret := make([]interface{}, 0, 1) + err := c.parseURLResponse("/settings/rbac/users", &ret) + if err != nil { + return nil, err + } + + // Get the configured administrator. + // Expected result: {"port":8091,"username":"Administrator"} + adminInfo := make(map[string]interface{}, 2) + err = c.parseURLResponse("/settings/web", &adminInfo) + if err != nil { + return nil, err + } + + // Create a special entry for the configured administrator. + adminResult := map[string]interface{}{ + "name": adminInfo["username"], + "id": adminInfo["username"], + "domain": "ns_server", + "roles": []interface{}{ + map[string]interface{}{ + "role": "admin", + }, + }, + } + + // Add the configured administrator to the list of results. + ret = append(ret, adminResult) + + return ret, nil +} + +func (c *Client) GetUserInfoAll() ([]User, error) { + ret := make([]User, 0, 16) + err := c.parseURLResponse("/settings/rbac/users", &ret) + if err != nil { + return nil, err + } + return ret, nil +} + +func rolesToParamFormat(roles []Role) string { + var buffer bytes.Buffer + for i, role := range roles { + if i > 0 { + buffer.WriteString(",") + } + buffer.WriteString(role.Role) + if role.BucketName != "" { + buffer.WriteString("[") + buffer.WriteString(role.BucketName) + buffer.WriteString("]") + } + } + return buffer.String() +} + +func (c *Client) PutUserInfo(u *User) error { + params := map[string]interface{}{ + "name": u.Name, + "roles": rolesToParamFormat(u.Roles), + } + var target string + switch u.Domain { + case "external": + target = "/settings/rbac/users/" + u.Id + case "local": + target = "/settings/rbac/users/local/" + u.Id + default: + return fmt.Errorf("Unknown user type: %s", u.Domain) + } + var ret string // PUT returns an empty string. We ignore it. 
+ err := c.parsePutURLResponse(target, params, &ret) + return err +} + +func (c *Client) GetRolesAll() ([]RoleDescription, error) { + ret := make([]RoleDescription, 0, 32) + err := c.parseURLResponse("/settings/rbac/roles", &ret) + if err != nil { + return nil, err + } + return ret, nil +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/util.go b/vendor/github.com/couchbaselabs/go-couchbase/util.go new file mode 100644 index 000000000000..4d286a32713c --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/util.go @@ -0,0 +1,49 @@ +package couchbase + +import ( + "fmt" + "net/url" + "strings" +) + +// CleanupHost returns the hostname with the given suffix removed. +func CleanupHost(h, commonSuffix string) string { + if strings.HasSuffix(h, commonSuffix) { + return h[:len(h)-len(commonSuffix)] + } + return h +} + +// FindCommonSuffix returns the longest common suffix from the given +// strings. +func FindCommonSuffix(input []string) string { + rv := "" + if len(input) < 2 { + return "" + } + from := input + for i := len(input[0]); i > 0; i-- { + common := true + suffix := input[0][i:] + for _, s := range from { + if !strings.HasSuffix(s, suffix) { + common = false + break + } + } + if common { + rv = suffix + } + } + return rv +} + +// ParseURL is a wrapper around url.Parse with some sanity-checking +func ParseURL(urlStr string) (result *url.URL, err error) { + result, err = url.Parse(urlStr) + if result != nil && result.Scheme == "" { + result = nil + err = fmt.Errorf("invalid URL <%s>", urlStr) + } + return +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/vbmap.go b/vendor/github.com/couchbaselabs/go-couchbase/vbmap.go new file mode 100644 index 000000000000..b96a18ed5752 --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/vbmap.go @@ -0,0 +1,77 @@ +package couchbase + +var crc32tab = []uint32{ + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, + 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, + 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, + 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, + 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, + 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, + 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, + 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, + 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, + 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 
0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, + 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, + 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, + 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, + 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, + 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, + 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, + 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, + 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d} + +// VBHash finds the vbucket for the given key. +func (b *Bucket) VBHash(key string) uint32 { + crc := uint32(0xffffffff) + for x := 0; x < len(key); x++ { + crc = (crc >> 8) ^ crc32tab[(uint64(crc)^uint64(key[x]))&0xff] + } + vbm := b.VBServerMap() + return ((^crc) >> 16) & 0x7fff & (uint32(len(vbm.VBucketMap)) - 1) +} diff --git a/vendor/github.com/couchbaselabs/go-couchbase/views.go b/vendor/github.com/couchbaselabs/go-couchbase/views.go new file mode 100644 index 000000000000..2f68642f5a6d --- /dev/null +++ b/vendor/github.com/couchbaselabs/go-couchbase/views.go @@ -0,0 +1,231 @@ +package couchbase + +import ( + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "math/rand" + "net/http" + "net/url" + "time" +) + +// ViewRow represents a single result from a view. +// +// Doc is present only if include_docs was set on the request. +type ViewRow struct { + ID string + Key interface{} + Value interface{} + Doc *interface{} +} + +// A ViewError is a node-specific error indicating a partial failure +// within a view result. +type ViewError struct { + From string + Reason string +} + +func (ve ViewError) Error() string { + return "Node: " + ve.From + ", reason: " + ve.Reason +} + +// ViewResult holds the entire result set from a view request, +// including the rows and the errors. 
+type ViewResult struct { + TotalRows int `json:"total_rows"` + Rows []ViewRow + Errors []ViewError +} + +func (b *Bucket) randomBaseURL() (*url.URL, error) { + nodes := b.HealthyNodes() + if len(nodes) == 0 { + return nil, errors.New("no available couch rest URLs") + } + nodeNo := rand.Intn(len(nodes)) + node := nodes[nodeNo] + + b.RLock() + name := b.Name + pool := b.pool + b.RUnlock() + + u, err := ParseURL(node.CouchAPIBase) + if err != nil { + return nil, fmt.Errorf("config error: Bucket %q node #%d CouchAPIBase=%q: %v", + name, nodeNo, node.CouchAPIBase, err) + } else if pool != nil { + u.User = pool.client.BaseURL.User + } + return u, err +} + +const START_NODE_ID = -1 + +func (b *Bucket) randomNextURL(lastNode int) (*url.URL, int, error) { + nodes := b.HealthyNodes() + if len(nodes) == 0 { + return nil, -1, errors.New("no available couch rest URLs") + } + + var nodeNo int + if lastNode == START_NODE_ID || lastNode >= len(nodes) { + // randomly select a node if the value of lastNode is invalid + nodeNo = rand.Intn(len(nodes)) + } else { + // wrap around the node list + nodeNo = (lastNode + 1) % len(nodes) + } + + b.RLock() + name := b.Name + pool := b.pool + b.RUnlock() + + node := nodes[nodeNo] + u, err := ParseURL(node.CouchAPIBase) + if err != nil { + return nil, -1, fmt.Errorf("config error: Bucket %q node #%d CouchAPIBase=%q: %v", + name, nodeNo, node.CouchAPIBase, err) + } else if pool != nil { + u.User = pool.client.BaseURL.User + } + return u, nodeNo, err +} + +// DocID is the document ID type for the startkey_docid parameter in +// views. +type DocID string + +func qParam(k, v string) string { + format := `"%s"` + switch k { + case "startkey_docid", "endkey_docid", "stale": + format = "%s" + } + return fmt.Sprintf(format, v) +} + +// ViewURL constructs a URL for a view with the given ddoc, view name, +// and parameters. +func (b *Bucket) ViewURL(ddoc, name string, + params map[string]interface{}) (string, error) { + u, err := b.randomBaseURL() + if err != nil { + return "", err + } + + values := url.Values{} + for k, v := range params { + switch t := v.(type) { + case DocID: + values[k] = []string{string(t)} + case string: + values[k] = []string{qParam(k, t)} + case int: + values[k] = []string{fmt.Sprintf(`%d`, t)} + case bool: + values[k] = []string{fmt.Sprintf(`%v`, t)} + default: + b, err := json.Marshal(v) + if err != nil { + return "", fmt.Errorf("unsupported value-type %T in Query, "+ + "json encoder said %v", t, err) + } + values[k] = []string{fmt.Sprintf(`%v`, string(b))} + } + } + + if ddoc == "" && name == "_all_docs" { + u.Path = fmt.Sprintf("/%s/_all_docs", b.GetName()) + } else { + u.Path = fmt.Sprintf("/%s/_design/%s/_view/%s", b.GetName(), ddoc, name) + } + u.RawQuery = values.Encode() + + return u.String(), nil +} + +// ViewCallback is called for each view invocation. +var ViewCallback func(ddoc, name string, start time.Time, err error) + +// ViewCustom performs a view request that can map row values to a +// custom type. +// +// See the source to View for an example usage. 
+func (b *Bucket) ViewCustom(ddoc, name string, params map[string]interface{}, + vres interface{}) (err error) { + if SlowServerCallWarningThreshold > 0 { + defer slowLog(time.Now(), "call to ViewCustom(%q, %q)", ddoc, name) + } + + if ViewCallback != nil { + defer func(t time.Time) { ViewCallback(ddoc, name, t, err) }(time.Now()) + } + + u, err := b.ViewURL(ddoc, name, params) + if err != nil { + return err + } + + req, err := http.NewRequest("GET", u, nil) + if err != nil { + return err + } + + ah := b.authHandler(false /* bucket not yet locked */) + maybeAddAuth(req, ah) + + res, err := doHTTPRequest(req) + if err != nil { + return fmt.Errorf("error starting view req at %v: %v", u, err) + } + defer res.Body.Close() + + if res.StatusCode != 200 { + bod := make([]byte, 512) + l, _ := res.Body.Read(bod) + return fmt.Errorf("error executing view req at %v: %v - %s", + u, res.Status, bod[:l]) + } + + body, err := ioutil.ReadAll(res.Body) + if err := json.Unmarshal(body, vres); err != nil { + return nil + } + + return nil +} + +// View executes a view. +// +// The ddoc parameter is just the bare name of your design doc without +// the "_design/" prefix. +// +// Parameters are string keys with values that correspond to couchbase +// view parameters. Primitive should work fairly naturally (booleans, +// ints, strings, etc...) and other values will attempt to be JSON +// marshaled (useful for array indexing on on view keys, for example). +// +// Example: +// +// res, err := couchbase.View("myddoc", "myview", map[string]interface{}{ +// "group_level": 2, +// "startkey_docid": []interface{}{"thing"}, +// "endkey_docid": []interface{}{"thing", map[string]string{}}, +// "stale": false, +// }) +func (b *Bucket) View(ddoc, name string, params map[string]interface{}) (ViewResult, error) { + vres := ViewResult{} + + if err := b.ViewCustom(ddoc, name, params, &vres); err != nil { + //error in accessing views. Retry once after a bucket refresh + b.Refresh() + return vres, b.ViewCustom(ddoc, name, params, &vres) + } else { + return vres, nil + } +} diff --git a/vendor/github.com/go-macaron/session/couchbase/couchbase.go b/vendor/github.com/go-macaron/session/couchbase/couchbase.go new file mode 100644 index 000000000000..8001fd15f1e6 --- /dev/null +++ b/vendor/github.com/go-macaron/session/couchbase/couchbase.go @@ -0,0 +1,228 @@ +// Copyright 2013 Beego Authors +// Copyright 2014 The Macaron Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"): you may +// not use this file except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +package session + +import ( + "strings" + "sync" + + "github.com/couchbaselabs/go-couchbase" + + "github.com/go-macaron/session" +) + +// CouchbaseSessionStore represents a couchbase session store implementation. +type CouchbaseSessionStore struct { + b *couchbase.Bucket + sid string + lock sync.RWMutex + data map[interface{}]interface{} + maxlifetime int64 +} + +// Set sets value to given key in session. 
+func (s *CouchbaseSessionStore) Set(key, val interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data[key] = val + return nil +} + +// Get gets value by given key in session. +func (s *CouchbaseSessionStore) Get(key interface{}) interface{} { + s.lock.RLock() + defer s.lock.RUnlock() + + return s.data[key] +} + +// Delete delete a key from session. +func (s *CouchbaseSessionStore) Delete(key interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + delete(s.data, key) + return nil +} + +// ID returns current session ID. +func (s *CouchbaseSessionStore) ID() string { + return s.sid +} + +// Release releases resource and save data to provider. +func (s *CouchbaseSessionStore) Release() error { + defer s.b.Close() + + // Skip encoding if the data is empty + if len(s.data) == 0 { + return nil + } + + data, err := session.EncodeGob(s.data) + if err != nil { + return err + } + + return s.b.Set(s.sid, int(s.maxlifetime), data) +} + +// Flush deletes all session data. +func (s *CouchbaseSessionStore) Flush() error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data = make(map[interface{}]interface{}) + return nil +} + +// CouchbaseProvider represents a couchbase session provider implementation. +type CouchbaseProvider struct { + maxlifetime int64 + connStr string + pool string + bucket string + b *couchbase.Bucket +} + +func (cp *CouchbaseProvider) getBucket() *couchbase.Bucket { + c, err := couchbase.Connect(cp.connStr) + if err != nil { + return nil + } + + pool, err := c.GetPool(cp.pool) + if err != nil { + return nil + } + + bucket, err := pool.GetBucket(cp.bucket) + if err != nil { + return nil + } + + return bucket +} + +// Init initializes memory session provider. +// connStr is couchbase server REST/JSON URL +// e.g. http://host:port/, Pool, Bucket +func (p *CouchbaseProvider) Init(maxlifetime int64, connStr string) error { + p.maxlifetime = maxlifetime + configs := strings.Split(connStr, ",") + if len(configs) > 0 { + p.connStr = configs[0] + } + if len(configs) > 1 { + p.pool = configs[1] + } + if len(configs) > 2 { + p.bucket = configs[2] + } + + return nil +} + +// Read returns raw session store by session ID. +func (p *CouchbaseProvider) Read(sid string) (session.RawStore, error) { + p.b = p.getBucket() + + var doc []byte + + err := p.b.Get(sid, &doc) + var kv map[interface{}]interface{} + if doc == nil { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob(doc) + if err != nil { + return nil, err + } + } + + cs := &CouchbaseSessionStore{b: p.b, sid: sid, data: kv, maxlifetime: p.maxlifetime} + return cs, nil +} + +// Exist returns true if session with given ID exists. +func (p *CouchbaseProvider) Exist(sid string) bool { + p.b = p.getBucket() + defer p.b.Close() + + var doc []byte + + if err := p.b.Get(sid, &doc); err != nil || doc == nil { + return false + } else { + return true + } +} + +// Destory deletes a session by session ID. +func (p *CouchbaseProvider) Destory(sid string) error { + p.b = p.getBucket() + defer p.b.Close() + + p.b.Delete(sid) + return nil +} + +// Regenerate regenerates a session store from old session ID to new one. 
+func (p *CouchbaseProvider) Regenerate(oldsid, sid string) (session.RawStore, error) { + p.b = p.getBucket() + + var doc []byte + if err := p.b.Get(oldsid, &doc); err != nil || doc == nil { + p.b.Set(sid, int(p.maxlifetime), "") + } else { + err := p.b.Delete(oldsid) + if err != nil { + return nil, err + } + _, _ = p.b.Add(sid, int(p.maxlifetime), doc) + } + + err := p.b.Get(sid, &doc) + if err != nil { + return nil, err + } + var kv map[interface{}]interface{} + if doc == nil { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob(doc) + if err != nil { + return nil, err + } + } + + cs := &CouchbaseSessionStore{b: p.b, sid: sid, data: kv, maxlifetime: p.maxlifetime} + return cs, nil +} + +// Count counts and returns number of sessions. +func (p *CouchbaseProvider) Count() int { + // FIXME + return 0 +} + +// GC calls GC to clean expired sessions. +func (p *CouchbaseProvider) GC() {} + +func init() { + session.Register("couchbase", &CouchbaseProvider{}) +} diff --git a/vendor/github.com/go-macaron/session/file.go b/vendor/github.com/go-macaron/session/file.go index 9bbc7aed20b6..64b47f2b0039 100644 --- a/vendor/github.com/go-macaron/session/file.go +++ b/vendor/github.com/go-macaron/session/file.go @@ -81,6 +81,11 @@ func (s *FileStore) Release() error { s.p.lock.Lock() defer s.p.lock.Unlock() + // Skip encoding if the data is empty + if len(s.data) == 0 { + return nil + } + data, err := EncodeGob(s.data) if err != nil { return err diff --git a/vendor/github.com/go-macaron/session/flash.go b/vendor/github.com/go-macaron/session/flash.go new file mode 100644 index 000000000000..99aae71e224e --- /dev/null +++ b/vendor/github.com/go-macaron/session/flash.go @@ -0,0 +1,61 @@ +// Copyright 2018 The Macaron Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"): you may +// not use this file except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +package session + +import ( + "net/url" + + "gopkg.in/macaron.v1" +) + +type Flash struct { + ctx *macaron.Context + url.Values + ErrorMsg, WarningMsg, InfoMsg, SuccessMsg string +} + +func (f *Flash) set(name, msg string, current ...bool) { + isShow := false + if (len(current) == 0 && macaron.FlashNow) || + (len(current) > 0 && current[0]) { + isShow = true + } + + if isShow { + f.ctx.Data["Flash"] = f + } else { + f.Set(name, msg) + } +} + +func (f *Flash) Error(msg string, current ...bool) { + f.ErrorMsg = msg + f.set("error", msg, current...) +} + +func (f *Flash) Warning(msg string, current ...bool) { + f.WarningMsg = msg + f.set("warning", msg, current...) +} + +func (f *Flash) Info(msg string, current ...bool) { + f.InfoMsg = msg + f.set("info", msg, current...) +} + +func (f *Flash) Success(msg string, current ...bool) { + f.SuccessMsg = msg + f.set("success", msg, current...) 
+} diff --git a/vendor/github.com/go-macaron/session/ledis/ledis.go b/vendor/github.com/go-macaron/session/ledis/ledis.go new file mode 100644 index 000000000000..bb5ce95244ab --- /dev/null +++ b/vendor/github.com/go-macaron/session/ledis/ledis.go @@ -0,0 +1,227 @@ +// Copyright 2013 Beego Authors +// Copyright 2014 The Macaron Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"): you may +// not use this file except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +package session + +import ( + "fmt" + "strings" + "sync" + + "github.com/Unknwon/com" + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/ledis" + "gopkg.in/ini.v1" + + "github.com/go-macaron/session" +) + +// LedisStore represents a ledis session store implementation. +type LedisStore struct { + c *ledis.DB + sid string + expire int64 + lock sync.RWMutex + data map[interface{}]interface{} +} + +// NewLedisStore creates and returns a ledis session store. +func NewLedisStore(c *ledis.DB, sid string, expire int64, kv map[interface{}]interface{}) *LedisStore { + return &LedisStore{ + c: c, + expire: expire, + sid: sid, + data: kv, + } +} + +// Set sets value to given key in session. +func (s *LedisStore) Set(key, val interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data[key] = val + return nil +} + +// Get gets value by given key in session. +func (s *LedisStore) Get(key interface{}) interface{} { + s.lock.RLock() + defer s.lock.RUnlock() + + return s.data[key] +} + +// Delete delete a key from session. +func (s *LedisStore) Delete(key interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + delete(s.data, key) + return nil +} + +// ID returns current session ID. +func (s *LedisStore) ID() string { + return s.sid +} + +// Release releases resource and save data to provider. +func (s *LedisStore) Release() error { + // Skip encoding if the data is empty + if len(s.data) == 0 { + return nil + } + + data, err := session.EncodeGob(s.data) + if err != nil { + return err + } + + if err = s.c.Set([]byte(s.sid), data); err != nil { + return err + } + _, err = s.c.Expire([]byte(s.sid), s.expire) + return err +} + +// Flush deletes all session data. +func (s *LedisStore) Flush() error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data = make(map[interface{}]interface{}) + return nil +} + +// LedisProvider represents a ledis session provider implementation. +type LedisProvider struct { + c *ledis.DB + expire int64 +} + +// Init initializes ledis session provider. 
+// configs: data_dir=./app.db,db=0 +func (p *LedisProvider) Init(expire int64, configs string) error { + p.expire = expire + + cfg, err := ini.Load([]byte(strings.Replace(configs, ",", "\n", -1))) + if err != nil { + return err + } + + db := 0 + opt := new(config.Config) + for k, v := range cfg.Section("").KeysHash() { + switch k { + case "data_dir": + opt.DataDir = v + case "db": + db = com.StrTo(v).MustInt() + default: + return fmt.Errorf("session/ledis: unsupported option '%s'", k) + } + } + + l, err := ledis.Open(opt) + if err != nil { + return fmt.Errorf("session/ledis: error opening db: %v", err) + } + p.c, err = l.Select(db) + return err +} + +// Read returns raw session store by session ID. +func (p *LedisProvider) Read(sid string) (session.RawStore, error) { + if !p.Exist(sid) { + if err := p.c.Set([]byte(sid), []byte("")); err != nil { + return nil, err + } + } + + var kv map[interface{}]interface{} + kvs, err := p.c.Get([]byte(sid)) + if err != nil { + return nil, err + } + if len(kvs) == 0 { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob(kvs) + if err != nil { + return nil, err + } + } + + return NewLedisStore(p.c, sid, p.expire, kv), nil +} + +// Exist returns true if session with given ID exists. +func (p *LedisProvider) Exist(sid string) bool { + count, err := p.c.Exists([]byte(sid)) + return err == nil && count > 0 +} + +// Destory deletes a session by session ID. +func (p *LedisProvider) Destory(sid string) error { + _, err := p.c.Del([]byte(sid)) + return err +} + +// Regenerate regenerates a session store from old session ID to new one. +func (p *LedisProvider) Regenerate(oldsid, sid string) (_ session.RawStore, err error) { + if p.Exist(sid) { + return nil, fmt.Errorf("new sid '%s' already exists", sid) + } + + kvs := make([]byte, 0) + if p.Exist(oldsid) { + if kvs, err = p.c.Get([]byte(oldsid)); err != nil { + return nil, err + } else if _, err = p.c.Del([]byte(oldsid)); err != nil { + return nil, err + } + } + if err = p.c.SetEX([]byte(sid), p.expire, kvs); err != nil { + return nil, err + } + + var kv map[interface{}]interface{} + if len(kvs) == 0 { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob([]byte(kvs)) + if err != nil { + return nil, err + } + } + + return NewLedisStore(p.c, sid, p.expire, kv), nil +} + +// Count counts and returns number of sessions. +func (p *LedisProvider) Count() int { + // FIXME: how come this library does not have DbSize() method? + return -1 +} + +// GC calls GC to clean expired sessions. +func (p *LedisProvider) GC() { + // FIXME: wtf??? +} + +func init() { + session.Register("ledis", &LedisProvider{}) +} diff --git a/vendor/github.com/go-macaron/session/memcache/memcache.go b/vendor/github.com/go-macaron/session/memcache/memcache.go new file mode 100644 index 000000000000..496939398b0c --- /dev/null +++ b/vendor/github.com/go-macaron/session/memcache/memcache.go @@ -0,0 +1,204 @@ +// Copyright 2013 Beego Authors +// Copyright 2014 The Macaron Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"): you may +// not use this file except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +// License for the specific language governing permissions and limitations +// under the License. + +package session + +import ( + "fmt" + "strings" + "sync" + + "github.com/bradfitz/gomemcache/memcache" + + "github.com/go-macaron/session" +) + +// MemcacheStore represents a memcache session store implementation. +type MemcacheStore struct { + c *memcache.Client + sid string + expire int32 + lock sync.RWMutex + data map[interface{}]interface{} +} + +// NewMemcacheStore creates and returns a memcache session store. +func NewMemcacheStore(c *memcache.Client, sid string, expire int32, kv map[interface{}]interface{}) *MemcacheStore { + return &MemcacheStore{ + c: c, + sid: sid, + expire: expire, + data: kv, + } +} + +func NewItem(sid string, data []byte, expire int32) *memcache.Item { + return &memcache.Item{ + Key: sid, + Value: data, + Expiration: expire, + } +} + +// Set sets value to given key in session. +func (s *MemcacheStore) Set(key, val interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data[key] = val + return nil +} + +// Get gets value by given key in session. +func (s *MemcacheStore) Get(key interface{}) interface{} { + s.lock.RLock() + defer s.lock.RUnlock() + + return s.data[key] +} + +// Delete delete a key from session. +func (s *MemcacheStore) Delete(key interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + delete(s.data, key) + return nil +} + +// ID returns current session ID. +func (s *MemcacheStore) ID() string { + return s.sid +} + +// Release releases resource and save data to provider. +func (s *MemcacheStore) Release() error { + // Skip encoding if the data is empty + if len(s.data) == 0 { + return nil + } + + data, err := session.EncodeGob(s.data) + if err != nil { + return err + } + + return s.c.Set(NewItem(s.sid, data, s.expire)) +} + +// Flush deletes all session data. +func (s *MemcacheStore) Flush() error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data = make(map[interface{}]interface{}) + return nil +} + +// MemcacheProvider represents a memcache session provider implementation. +type MemcacheProvider struct { + c *memcache.Client + expire int32 +} + +// Init initializes memcache session provider. +// connStrs: 127.0.0.1:9090;127.0.0.1:9091 +func (p *MemcacheProvider) Init(expire int64, connStrs string) error { + p.expire = int32(expire) + p.c = memcache.New(strings.Split(connStrs, ";")...) + return nil +} + +// Read returns raw session store by session ID. +func (p *MemcacheProvider) Read(sid string) (session.RawStore, error) { + if !p.Exist(sid) { + if err := p.c.Set(NewItem(sid, []byte(""), p.expire)); err != nil { + return nil, err + } + } + + var kv map[interface{}]interface{} + item, err := p.c.Get(sid) + if err != nil { + return nil, err + } + if len(item.Value) == 0 { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob(item.Value) + if err != nil { + return nil, err + } + } + + return NewMemcacheStore(p.c, sid, p.expire, kv), nil +} + +// Exist returns true if session with given ID exists. +func (p *MemcacheProvider) Exist(sid string) bool { + _, err := p.c.Get(sid) + return err == nil +} + +// Destory deletes a session by session ID. +func (p *MemcacheProvider) Destory(sid string) error { + return p.c.Delete(sid) +} + +// Regenerate regenerates a session store from old session ID to new one. 
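
Each provider file added in this patch ends with an init() that calls session.Register under a provider name, and its Init method receives the provider-specific config string documented in the comments above: an ini-style "data_dir=./app.db,db=0" for ledis, and a semicolon-separated server list such as "127.0.0.1:9090;127.0.0.1:9091" for memcache. A minimal wiring sketch follows; it is an editorial illustration rather than part of the patch, and the Provider and ProviderConfig field names are taken from the library's Options struct as an assumption, since this hunk does not show them.

    package main

    import (
        "github.com/go-macaron/session"
        // The blank import runs the provider's init(), which calls
        // session.Register("memcache", &MemcacheProvider{}) at the end of memcache.go.
        _ "github.com/go-macaron/session/memcache"
        "gopkg.in/macaron.v1"
    )

    func main() {
        m := macaron.Classic()
        m.Use(session.Sessioner(session.Options{
            // Assumed field names: the provider name must match the string
            // passed to session.Register, and the config string is handed
            // verbatim to that provider's Init.
            Provider:       "memcache",
            ProviderConfig: "127.0.0.1:9090;127.0.0.1:9091",
        }))
        m.Run()
    }

The same pattern applies to the ledis, mysql, nodb, and postgres providers in this patch; only the registered name and the shape of the config string change.
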
+func (p *MemcacheProvider) Regenerate(oldsid, sid string) (_ session.RawStore, err error) { + if p.Exist(sid) { + return nil, fmt.Errorf("new sid '%s' already exists", sid) + } + + item := NewItem(sid, []byte(""), p.expire) + if p.Exist(oldsid) { + item, err = p.c.Get(oldsid) + if err != nil { + return nil, err + } else if err = p.c.Delete(oldsid); err != nil { + return nil, err + } + item.Key = sid + } + if err = p.c.Set(item); err != nil { + return nil, err + } + + var kv map[interface{}]interface{} + if len(item.Value) == 0 { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob(item.Value) + if err != nil { + return nil, err + } + } + + return NewMemcacheStore(p.c, sid, p.expire, kv), nil +} + +// Count counts and returns number of sessions. +func (p *MemcacheProvider) Count() int { + // FIXME: how come this library does not have Stats method? + return -1 +} + +// GC calls GC to clean expired sessions. +func (p *MemcacheProvider) GC() {} + +func init() { + session.Register("memcache", &MemcacheProvider{}) +} diff --git a/vendor/github.com/go-macaron/session/mysql/mysql.go b/vendor/github.com/go-macaron/session/mysql/mysql.go new file mode 100644 index 000000000000..7bde37445e46 --- /dev/null +++ b/vendor/github.com/go-macaron/session/mysql/mysql.go @@ -0,0 +1,200 @@ +// Copyright 2013 Beego Authors +// Copyright 2014 The Macaron Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"): you may +// not use this file except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +package session + +import ( + "database/sql" + "fmt" + "log" + "sync" + "time" + + _ "github.com/go-sql-driver/mysql" + + "github.com/go-macaron/session" +) + +// MysqlStore represents a mysql session store implementation. +type MysqlStore struct { + c *sql.DB + sid string + lock sync.RWMutex + data map[interface{}]interface{} +} + +// NewMysqlStore creates and returns a mysql session store. +func NewMysqlStore(c *sql.DB, sid string, kv map[interface{}]interface{}) *MysqlStore { + return &MysqlStore{ + c: c, + sid: sid, + data: kv, + } +} + +// Set sets value to given key in session. +func (s *MysqlStore) Set(key, val interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data[key] = val + return nil +} + +// Get gets value by given key in session. +func (s *MysqlStore) Get(key interface{}) interface{} { + s.lock.RLock() + defer s.lock.RUnlock() + + return s.data[key] +} + +// Delete delete a key from session. +func (s *MysqlStore) Delete(key interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + delete(s.data, key) + return nil +} + +// ID returns current session ID. +func (s *MysqlStore) ID() string { + return s.sid +} + +// Release releases resource and save data to provider. +func (s *MysqlStore) Release() error { + // Skip encoding if the data is empty + if len(s.data) == 0 { + return nil + } + + data, err := session.EncodeGob(s.data) + if err != nil { + return err + } + + _, err = s.c.Exec("UPDATE session SET data=?, expiry=? WHERE `key`=?", + data, time.Now().Unix(), s.sid) + return err +} + +// Flush deletes all session data. 
+func (s *MysqlStore) Flush() error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data = make(map[interface{}]interface{}) + return nil +} + +// MysqlProvider represents a mysql session provider implementation. +type MysqlProvider struct { + c *sql.DB + expire int64 +} + +// Init initializes mysql session provider. +// connStr: username:password@protocol(address)/dbname?param=value +func (p *MysqlProvider) Init(expire int64, connStr string) (err error) { + p.expire = expire + + p.c, err = sql.Open("mysql", connStr) + if err != nil { + return err + } + return p.c.Ping() +} + +// Read returns raw session store by session ID. +func (p *MysqlProvider) Read(sid string) (session.RawStore, error) { + var data []byte + err := p.c.QueryRow("SELECT data FROM session WHERE `key`=?", sid).Scan(&data) + if err == sql.ErrNoRows { + _, err = p.c.Exec("INSERT INTO session(`key`,data,expiry) VALUES(?,?,?)", + sid, "", time.Now().Unix()) + } + if err != nil { + return nil, err + } + + var kv map[interface{}]interface{} + if len(data) == 0 { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob(data) + if err != nil { + return nil, err + } + } + + return NewMysqlStore(p.c, sid, kv), nil +} + +// Exist returns true if session with given ID exists. +func (p *MysqlProvider) Exist(sid string) bool { + var data []byte + err := p.c.QueryRow("SELECT data FROM session WHERE `key`=?", sid).Scan(&data) + if err != nil && err != sql.ErrNoRows { + panic("session/mysql: error checking existence: " + err.Error()) + } + return err != sql.ErrNoRows +} + +// Destory deletes a session by session ID. +func (p *MysqlProvider) Destory(sid string) error { + _, err := p.c.Exec("DELETE FROM session WHERE `key`=?", sid) + return err +} + +// Regenerate regenerates a session store from old session ID to new one. +func (p *MysqlProvider) Regenerate(oldsid, sid string) (_ session.RawStore, err error) { + if p.Exist(sid) { + return nil, fmt.Errorf("new sid '%s' already exists", sid) + } + + if !p.Exist(oldsid) { + if _, err = p.c.Exec("INSERT INTO session(`key`,data,expiry) VALUES(?,?,?)", + oldsid, "", time.Now().Unix()); err != nil { + return nil, err + } + } + + if _, err = p.c.Exec("UPDATE session SET `key`=? WHERE `key`=?", sid, oldsid); err != nil { + return nil, err + } + + return p.Read(sid) +} + +// Count counts and returns number of sessions. +func (p *MysqlProvider) Count() (total int) { + if err := p.c.QueryRow("SELECT COUNT(*) AS NUM FROM session").Scan(&total); err != nil { + panic("session/mysql: error counting records: " + err.Error()) + } + return total +} + +// GC calls GC to clean expired sessions. +func (p *MysqlProvider) GC() { + if _, err := p.c.Exec("DELETE FROM session WHERE expiry + ? <= UNIX_TIMESTAMP(NOW())", p.expire); err != nil { + log.Printf("session/mysql: error garbage collecting: %v", err) + } +} + +func init() { + session.Register("mysql", &MysqlProvider{}) +} diff --git a/vendor/github.com/go-macaron/session/nodb/nodb.go b/vendor/github.com/go-macaron/session/nodb/nodb.go new file mode 100644 index 000000000000..8b5a7116933d --- /dev/null +++ b/vendor/github.com/go-macaron/session/nodb/nodb.go @@ -0,0 +1,208 @@ +// Copyright 2015 The Macaron Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"): you may +// not use this file except in compliance with the License. 
You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +package session + +import ( + "fmt" + "sync" + + "github.com/lunny/nodb" + "github.com/lunny/nodb/config" + + "github.com/go-macaron/session" +) + +// NodbStore represents a nodb session store implementation. +type NodbStore struct { + c *nodb.DB + sid string + expire int64 + lock sync.RWMutex + data map[interface{}]interface{} +} + +// NewNodbStore creates and returns a ledis session store. +func NewNodbStore(c *nodb.DB, sid string, expire int64, kv map[interface{}]interface{}) *NodbStore { + return &NodbStore{ + c: c, + expire: expire, + sid: sid, + data: kv, + } +} + +// Set sets value to given key in session. +func (s *NodbStore) Set(key, val interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data[key] = val + return nil +} + +// Get gets value by given key in session. +func (s *NodbStore) Get(key interface{}) interface{} { + s.lock.RLock() + defer s.lock.RUnlock() + + return s.data[key] +} + +// Delete delete a key from session. +func (s *NodbStore) Delete(key interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + delete(s.data, key) + return nil +} + +// ID returns current session ID. +func (s *NodbStore) ID() string { + return s.sid +} + +// Release releases resource and save data to provider. +func (s *NodbStore) Release() error { + // Skip encoding if the data is empty + if len(s.data) == 0 { + return nil + } + + data, err := session.EncodeGob(s.data) + if err != nil { + return err + } + + if err = s.c.Set([]byte(s.sid), data); err != nil { + return err + } + _, err = s.c.Expire([]byte(s.sid), s.expire) + return err +} + +// Flush deletes all session data. +func (s *NodbStore) Flush() error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data = make(map[interface{}]interface{}) + return nil +} + +// NodbProvider represents a ledis session provider implementation. +type NodbProvider struct { + c *nodb.DB + expire int64 +} + +// Init initializes nodb session provider. +func (p *NodbProvider) Init(expire int64, configs string) error { + p.expire = expire + + cfg := new(config.Config) + cfg.DataDir = configs + dbs, err := nodb.Open(cfg) + if err != nil { + return fmt.Errorf("session/nodb: error opening db: %v", err) + } + + p.c, err = dbs.Select(0) + return err +} + +// Read returns raw session store by session ID. +func (p *NodbProvider) Read(sid string) (session.RawStore, error) { + if !p.Exist(sid) { + if err := p.c.Set([]byte(sid), []byte("")); err != nil { + return nil, err + } + } + + var kv map[interface{}]interface{} + kvs, err := p.c.Get([]byte(sid)) + if err != nil { + return nil, err + } + if len(kvs) == 0 { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob(kvs) + if err != nil { + return nil, err + } + } + + return NewNodbStore(p.c, sid, p.expire, kv), nil +} + +// Exist returns true if session with given ID exists. +func (p *NodbProvider) Exist(sid string) bool { + count, err := p.c.Exists([]byte(sid)) + return err == nil && count > 0 +} + +// Destory deletes a session by session ID. 
+func (p *NodbProvider) Destory(sid string) error { + _, err := p.c.Del([]byte(sid)) + return err +} + +// Regenerate regenerates a session store from old session ID to new one. +func (p *NodbProvider) Regenerate(oldsid, sid string) (_ session.RawStore, err error) { + if p.Exist(sid) { + return nil, fmt.Errorf("new sid '%s' already exists", sid) + } + + kvs := make([]byte, 0) + if p.Exist(oldsid) { + if kvs, err = p.c.Get([]byte(oldsid)); err != nil { + return nil, err + } else if _, err = p.c.Del([]byte(oldsid)); err != nil { + return nil, err + } + } + + if err = p.c.Set([]byte(sid), kvs); err != nil { + return nil, err + } else if _, err = p.c.Expire([]byte(sid), p.expire); err != nil { + return nil, err + } + + var kv map[interface{}]interface{} + if len(kvs) == 0 { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob([]byte(kvs)) + if err != nil { + return nil, err + } + } + + return NewNodbStore(p.c, sid, p.expire, kv), nil +} + +// Count counts and returns number of sessions. +func (p *NodbProvider) Count() int { + // FIXME: how come this library does not have DbSize() method? + return -1 +} + +// GC calls GC to clean expired sessions. +func (p *NodbProvider) GC() {} + +func init() { + session.Register("nodb", &NodbProvider{}) +} diff --git a/vendor/github.com/go-macaron/session/postgres/postgres.go b/vendor/github.com/go-macaron/session/postgres/postgres.go new file mode 100644 index 000000000000..f1e034b501fa --- /dev/null +++ b/vendor/github.com/go-macaron/session/postgres/postgres.go @@ -0,0 +1,201 @@ +// Copyright 2013 Beego Authors +// Copyright 2014 The Macaron Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"): you may +// not use this file except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +package session + +import ( + "database/sql" + "fmt" + "log" + "sync" + "time" + + _ "github.com/lib/pq" + + "github.com/go-macaron/session" +) + +// PostgresStore represents a postgres session store implementation. +type PostgresStore struct { + c *sql.DB + sid string + lock sync.RWMutex + data map[interface{}]interface{} +} + +// NewPostgresStore creates and returns a postgres session store. +func NewPostgresStore(c *sql.DB, sid string, kv map[interface{}]interface{}) *PostgresStore { + return &PostgresStore{ + c: c, + sid: sid, + data: kv, + } +} + +// Set sets value to given key in session. +func (s *PostgresStore) Set(key, value interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data[key] = value + return nil +} + +// Get gets value by given key in session. +func (s *PostgresStore) Get(key interface{}) interface{} { + s.lock.RLock() + defer s.lock.RUnlock() + + return s.data[key] +} + +// Delete delete a key from session. +func (s *PostgresStore) Delete(key interface{}) error { + s.lock.Lock() + defer s.lock.Unlock() + + delete(s.data, key) + return nil +} + +// ID returns current session ID. +func (s *PostgresStore) ID() string { + return s.sid +} + +// save postgres session values to database. +// must call this method to save values to database. 
+func (s *PostgresStore) Release() error { + // Skip encoding if the data is empty + if len(s.data) == 0 { + return nil + } + + data, err := session.EncodeGob(s.data) + if err != nil { + return err + } + + _, err = s.c.Exec("UPDATE session SET data=$1, expiry=$2 WHERE key=$3", + data, time.Now().Unix(), s.sid) + return err +} + +// Flush deletes all session data. +func (s *PostgresStore) Flush() error { + s.lock.Lock() + defer s.lock.Unlock() + + s.data = make(map[interface{}]interface{}) + return nil +} + +// PostgresProvider represents a postgres session provider implementation. +type PostgresProvider struct { + c *sql.DB + maxlifetime int64 +} + +// Init initializes postgres session provider. +// connStr: user=a password=b host=localhost port=5432 dbname=c sslmode=disable +func (p *PostgresProvider) Init(maxlifetime int64, connStr string) (err error) { + p.maxlifetime = maxlifetime + + p.c, err = sql.Open("postgres", connStr) + if err != nil { + return err + } + return p.c.Ping() +} + +// Read returns raw session store by session ID. +func (p *PostgresProvider) Read(sid string) (session.RawStore, error) { + var data []byte + err := p.c.QueryRow("SELECT data FROM session WHERE key=$1", sid).Scan(&data) + if err == sql.ErrNoRows { + _, err = p.c.Exec("INSERT INTO session(key,data,expiry) VALUES($1,$2,$3)", + sid, "", time.Now().Unix()) + } + if err != nil { + return nil, err + } + + var kv map[interface{}]interface{} + if len(data) == 0 { + kv = make(map[interface{}]interface{}) + } else { + kv, err = session.DecodeGob(data) + if err != nil { + return nil, err + } + } + + return NewPostgresStore(p.c, sid, kv), nil +} + +// Exist returns true if session with given ID exists. +func (p *PostgresProvider) Exist(sid string) bool { + var data []byte + err := p.c.QueryRow("SELECT data FROM session WHERE key=$1", sid).Scan(&data) + if err != nil && err != sql.ErrNoRows { + panic("session/postgres: error checking existence: " + err.Error()) + } + return err != sql.ErrNoRows +} + +// Destory deletes a session by session ID. +func (p *PostgresProvider) Destory(sid string) error { + _, err := p.c.Exec("DELETE FROM session WHERE key=$1", sid) + return err +} + +// Regenerate regenerates a session store from old session ID to new one. +func (p *PostgresProvider) Regenerate(oldsid, sid string) (_ session.RawStore, err error) { + if p.Exist(sid) { + return nil, fmt.Errorf("new sid '%s' already exists", sid) + } + + if !p.Exist(oldsid) { + if _, err = p.c.Exec("INSERT INTO session(key,data,expiry) VALUES($1,$2,$3)", + oldsid, "", time.Now().Unix()); err != nil { + return nil, err + } + } + + if _, err = p.c.Exec("UPDATE session SET key=$1 WHERE key=$2", sid, oldsid); err != nil { + return nil, err + } + + return p.Read(sid) +} + +// Count counts and returns number of sessions. +func (p *PostgresProvider) Count() (total int) { + if err := p.c.QueryRow("SELECT COUNT(*) AS NUM FROM session").Scan(&total); err != nil { + panic("session/postgres: error counting records: " + err.Error()) + } + return total +} + +// GC calls GC to clean expired sessions. 
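
Unlike the key-value providers, the mysql and postgres stores above do not create their own storage: every query targets a pre-existing session table with key, data, and expiry columns, where key holds the session ID, data the gob-encoded map, and expiry a unix timestamp. The patch ships no DDL, so the sketch below is only an assumption-labelled illustration for MySQL (column types and sizes are guesses consistent with the queries; Postgres would be analogous, with $n placeholders and no backticks).

    package main

    import (
        "database/sql"

        _ "github.com/go-sql-driver/mysql"
    )

    func main() {
        // The DSN is a placeholder; the schema is inferred from the
        // SELECT/INSERT/UPDATE/DELETE statements in mysql.go, not taken
        // from the patch itself.
        db, err := sql.Open("mysql", "user:password@tcp(127.0.0.1:3306)/app")
        if err != nil {
            panic(err)
        }
        defer db.Close()

        _, err = db.Exec("CREATE TABLE IF NOT EXISTS session (" +
            "`key` CHAR(16) NOT NULL PRIMARY KEY, " + // session ID (default ID_LENGTH is 16)
            "data BLOB, " + // gob-encoded session map
            "expiry INT UNSIGNED NOT NULL)") // unix timestamp consulted by GC
        if err != nil {
            panic(err)
        }
    }
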
+func (p *PostgresProvider) GC() { + if _, err := p.c.Exec("DELETE FROM session WHERE EXTRACT(EPOCH FROM NOW()) - expiry > $1", p.maxlifetime); err != nil { + log.Printf("session/postgres: error garbage collecting: %v", err) + } +} + +func init() { + session.Register("postgres", &PostgresProvider{}) +} diff --git a/vendor/github.com/go-macaron/session/redis/redis.go b/vendor/github.com/go-macaron/session/redis/redis.go index ca1cf88de61a..2d7fe984055d 100644 --- a/vendor/github.com/go-macaron/session/redis/redis.go +++ b/vendor/github.com/go-macaron/session/redis/redis.go @@ -81,6 +81,11 @@ func (s *RedisStore) ID() string { // Release releases resource and save data to provider. func (s *RedisStore) Release() error { + // Skip encoding if the data is empty + if len(s.data) == 0 { + return nil + } + data, err := session.EncodeGob(s.data) if err != nil { return err @@ -153,7 +158,7 @@ func (p *RedisProvider) Init(maxlifetime int64, configs string) (err error) { func (p *RedisProvider) Read(sid string) (session.RawStore, error) { psid := p.prefix + sid if !p.Exist(sid) { - if err := p.c.Set(psid, "").Err(); err != nil { + if err := p.c.SetEx(psid, p.duration, "").Err(); err != nil { return nil, err } } diff --git a/vendor/github.com/go-macaron/session/session.go b/vendor/github.com/go-macaron/session/session.go index d9bbae20321e..97fa56ede60f 100644 --- a/vendor/github.com/go-macaron/session/session.go +++ b/vendor/github.com/go-macaron/session/session.go @@ -22,13 +22,12 @@ import ( "fmt" "net/http" "net/url" - "strings" "time" "gopkg.in/macaron.v1" ) -const _VERSION = "0.4.0" +const _VERSION = "0.6.0" func Version() string { return _VERSION @@ -96,6 +95,8 @@ type Options struct { IDLength int // Configuration section name. Default is "session". Section string + // Ignore release for websocket. Default is false. + IgnoreReleaseForWebSocket bool } func prepareOptions(options []Options) Options { @@ -138,6 +139,9 @@ func prepareOptions(options []Options) Options { if opt.IDLength == 0 { opt.IDLength = sec.Key("ID_LENGTH").MustInt(16) } + if !opt.IgnoreReleaseForWebSocket { + opt.IgnoreReleaseForWebSocket = sec.Key("IGNORE_RELEASE_FOR_WEBSOCKET").MustBool() + } return opt } @@ -187,6 +191,10 @@ func Sessioner(options ...Options) macaron.Handler { ctx.Next() + if manager.opt.IgnoreReleaseForWebSocket && ctx.Req.Header.Get("Upgrade") == "websocket" { + return + } + if err = sess.Release(); err != nil { panic("session(release): " + err.Error()) } @@ -252,12 +260,30 @@ func (m *Manager) sessionID() string { return hex.EncodeToString(generateRandomKey(m.opt.IDLength / 2)) } +// validSessionID tests whether a provided session ID is a valid session ID. +func (m *Manager) validSessionID(sid string) (bool, error) { + if len(sid) != m.opt.IDLength { + return false, errors.New("invalid 'sid': " + sid) + } + + for i := range sid { + switch { + case '0' <= sid[i] && sid[i] <= '9': + case 'a' <= sid[i] && sid[i] <= 'f': + default: + return false, errors.New("invalid 'sid': " + sid) + } + } + return true, nil +} + // Start starts a session by generating new one // or retrieve existence one by reading session ID from HTTP request if it's valid. 
func (m *Manager) Start(ctx *macaron.Context) (RawStore, error) { sid := ctx.GetCookie(m.opt.CookieName) - if len(sid) > 0 && m.provider.Exist(sid) { - return m.Read(sid) + valid, _ := m.validSessionID(sid) + if len(sid) > 0 && valid && m.provider.Exist(sid) { + return m.provider.Read(sid) } sid = m.sessionID() @@ -284,10 +310,9 @@ func (m *Manager) Start(ctx *macaron.Context) (RawStore, error) { // Read returns raw session store by session ID. func (m *Manager) Read(sid string) (RawStore, error) { - // No slashes or dots "./" should ever occur in the sid and to prevent session file forgery bug. - // See https://github.com/gogs/gogs/issues/5469 - if strings.ContainsAny(sid, "./") { - return nil, errors.New("invalid 'sid': " + sid) + // Ensure we're trying to read a valid session ID + if _, err := m.validSessionID(sid); err != nil { + return nil, err } return m.provider.Read(sid) @@ -300,6 +325,10 @@ func (m *Manager) Destory(ctx *macaron.Context) error { return nil } + if _, err := m.validSessionID(sid); err != nil { + return err + } + if err := m.provider.Destory(sid); err != nil { return err } @@ -318,11 +347,15 @@ func (m *Manager) Destory(ctx *macaron.Context) error { func (m *Manager) RegenerateId(ctx *macaron.Context) (sess RawStore, err error) { sid := m.sessionID() oldsid := ctx.GetCookie(m.opt.CookieName) + _, err = m.validSessionID(oldsid) + if err != nil { + return nil, err + } sess, err = m.provider.Regenerate(oldsid, sid) if err != nil { return nil, err } - ck := &http.Cookie{ + cookie := &http.Cookie{ Name: m.opt.CookieName, Value: sid, Path: m.opt.CookiePath, @@ -331,10 +364,10 @@ func (m *Manager) RegenerateId(ctx *macaron.Context) (sess RawStore, err error) Domain: m.opt.Domain, } if m.opt.CookieLifeTime >= 0 { - ck.MaxAge = m.opt.CookieLifeTime + cookie.MaxAge = m.opt.CookieLifeTime } - http.SetCookie(ctx.Resp, ck) - ctx.Req.AddCookie(ck) + http.SetCookie(ctx.Resp, cookie) + ctx.Req.AddCookie(cookie) return sess, nil } @@ -358,50 +391,3 @@ func (m *Manager) startGC() { func (m *Manager) SetSecure(secure bool) { m.opt.Secure = secure } - -// ___________.____ _____ _________ ___ ___ -// \_ _____/| | / _ \ / _____// | \ -// | __) | | / /_\ \ \_____ \/ ~ \ -// | \ | |___/ | \/ \ Y / -// \___ / |_______ \____|__ /_______ /\___|_ / -// \/ \/ \/ \/ \/ - -type Flash struct { - ctx *macaron.Context - url.Values - ErrorMsg, WarningMsg, InfoMsg, SuccessMsg string -} - -func (f *Flash) set(name, msg string, current ...bool) { - isShow := false - if (len(current) == 0 && macaron.FlashNow) || - (len(current) > 0 && current[0]) { - isShow = true - } - - if isShow { - f.ctx.Data["Flash"] = f - } else { - f.Set(name, msg) - } -} - -func (f *Flash) Error(msg string, current ...bool) { - f.ErrorMsg = msg - f.set("error", msg, current...) -} - -func (f *Flash) Warning(msg string, current ...bool) { - f.WarningMsg = msg - f.set("warning", msg, current...) -} - -func (f *Flash) Info(msg string, current ...bool) { - f.InfoMsg = msg - f.set("info", msg, current...) -} - -func (f *Flash) Success(msg string, current ...bool) { - f.SuccessMsg = msg - f.set("success", msg, current...) 
-} diff --git a/vendor/github.com/go-macaron/session/utils.go b/vendor/github.com/go-macaron/session/utils.go index 07a1283df94f..90ca38064bee 100644 --- a/vendor/github.com/go-macaron/session/utils.go +++ b/vendor/github.com/go-macaron/session/utils.go @@ -50,11 +50,14 @@ func DecodeGob(encoded []byte) (out map[interface{}]interface{}, err error) { return out, err } +// NOTE: A local copy in case of underlying package change +var alphanum = []byte("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") + // generateRandomKey creates a random key with the given strength. func generateRandomKey(strength int) []byte { k := make([]byte, strength) if n, err := io.ReadFull(rand.Reader, k); n != strength || err != nil { - return com.RandomCreateBytes(strength) + return com.RandomCreateBytes(strength, alphanum...) } return k } diff --git a/vendor/github.com/lunny/log/LICENSE b/vendor/github.com/lunny/log/LICENSE new file mode 100644 index 000000000000..c9338f829339 --- /dev/null +++ b/vendor/github.com/lunny/log/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2014 - 2016 lunny +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the {organization} nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
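
The session.go hunk above replaces the old strings.ContainsAny(sid, "./") guard (added for gogs#5469) with validSessionID, so Start, Read, Destory, and RegenerateId now all refuse IDs that are not exactly ID_LENGTH lowercase-hex characters before touching the provider. A standalone restatement of that check, runnable on its own, shows the effect; the sample IDs below are invented, and 16 is simply the library's default ID_LENGTH.

    package main

    import (
        "errors"
        "fmt"
    )

    // isValidSessionID mirrors Manager.validSessionID above: the ID must be
    // exactly idLength characters long and consist only of 0-9 and a-f,
    // matching what sessionID() produces via hex.EncodeToString.
    func isValidSessionID(sid string, idLength int) (bool, error) {
        if len(sid) != idLength {
            return false, errors.New("invalid 'sid': " + sid)
        }
        for i := range sid {
            switch {
            case '0' <= sid[i] && sid[i] <= '9':
            case 'a' <= sid[i] && sid[i] <= 'f':
            default:
                return false, errors.New("invalid 'sid': " + sid)
            }
        }
        return true, nil
    }

    func main() {
        fmt.Println(isValidSessionID("3a7bd3e2360a3d29", 16)) // true <nil>
        fmt.Println(isValidSessionID("../../etc/passwd", 16)) // rejected: not hex
        fmt.Println(isValidSessionID("abc", 16))              // rejected: wrong length
    }

The IGNORE_RELEASE_FOR_WEBSOCKET option read in prepareOptions and the redis SetEx change above are the other behavioural changes carried by the bump from 0.4.0 to 0.6.0.
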
diff --git a/vendor/github.com/lunny/log/dbwriter.go b/vendor/github.com/lunny/log/dbwriter.go new file mode 100644 index 000000000000..e8ff00bd8964 --- /dev/null +++ b/vendor/github.com/lunny/log/dbwriter.go @@ -0,0 +1,36 @@ +package log + +import ( + "database/sql" + "time" +) + +type DBWriter struct { + db *sql.DB + stmt *sql.Stmt + content chan []byte +} + +func NewDBWriter(db *sql.DB) (*DBWriter, error) { + _, err := db.Exec("CREATE TABLE IF NOT EXISTS log (id int, content text, created datetime)") + if err != nil { + return nil, err + } + stmt, err := db.Prepare("INSERT INTO log (content, created) values (?, ?)") + if err != nil { + return nil, err + } + return &DBWriter{db, stmt, make(chan []byte, 1000)}, nil +} + +func (w *DBWriter) Write(p []byte) (n int, err error) { + _, err = w.stmt.Exec(string(p), time.Now()) + if err == nil { + n = len(p) + } + return +} + +func (w *DBWriter) Close() { + w.stmt.Close() +} diff --git a/vendor/github.com/lunny/log/filewriter.go b/vendor/github.com/lunny/log/filewriter.go new file mode 100644 index 000000000000..f0bb4d1df1b2 --- /dev/null +++ b/vendor/github.com/lunny/log/filewriter.go @@ -0,0 +1,112 @@ +package log + +import ( + "io" + "os" + "path/filepath" + "sync" + "time" +) + +var _ io.Writer = &Files{} + +type ByType int + +const ( + ByDay ByType = iota + ByHour + ByMonth +) + +var ( + formats = map[ByType]string{ + ByDay: "2006-01-02", + ByHour: "2006-01-02-15", + ByMonth: "2006-01", + } +) + +func SetFileFormat(t ByType, format string) { + formats[t] = format +} + +func (b ByType) Format() string { + return formats[b] +} + +type Files struct { + FileOptions + f *os.File + lastFormat string + lock sync.Mutex +} + +type FileOptions struct { + Dir string + ByType ByType + Loc *time.Location +} + +func prepareFileOption(opts []FileOptions) FileOptions { + var opt FileOptions + if len(opts) > 0 { + opt = opts[0] + } + if opt.Dir == "" { + opt.Dir = "./" + } + err := os.MkdirAll(opt.Dir, os.ModePerm) + if err != nil { + panic(err.Error()) + } + + if opt.Loc == nil { + opt.Loc = time.Local + } + return opt +} + +func NewFileWriter(opts ...FileOptions) *Files { + opt := prepareFileOption(opts) + return &Files{ + FileOptions: opt, + } +} + +func (f *Files) getFile() (*os.File, error) { + var err error + t := time.Now().In(f.Loc) + if f.f == nil { + f.lastFormat = t.Format(f.ByType.Format()) + f.f, err = os.OpenFile(filepath.Join(f.Dir, f.lastFormat+".log"), + os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) + return f.f, err + } + if f.lastFormat != t.Format(f.ByType.Format()) { + f.f.Close() + f.lastFormat = t.Format(f.ByType.Format()) + f.f, err = os.OpenFile(filepath.Join(f.Dir, f.lastFormat+".log"), + os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) + return f.f, err + } + return f.f, nil +} + +func (f *Files) Write(bs []byte) (int, error) { + f.lock.Lock() + defer f.lock.Unlock() + + w, err := f.getFile() + if err != nil { + return 0, err + } + return w.Write(bs) +} + +func (f *Files) Close() { + if f.f != nil { + f.f.Close() + f.f = nil + } + f.lastFormat = "" +} diff --git a/vendor/github.com/lunny/log/logext.go b/vendor/github.com/lunny/log/logext.go new file mode 100644 index 000000000000..215c45f30935 --- /dev/null +++ b/vendor/github.com/lunny/log/logext.go @@ -0,0 +1,595 @@ +package log + +import ( + "bytes" + "fmt" + "io" + "os" + "runtime" + "strings" + "sync" + "time" +) + +// These flags define which text to prefix to each log entry generated by the Logger. +const ( + // Bits or'ed together to control what's printed. 
There is no control over the + // order they appear (the order listed here) or the format they present (as + // described in the comments). A colon appears after these items: + // 2009/0123 01:23:23.123123 /a/b/c/d.go:23: message + Ldate = 1 << iota // the date: 2009/0123 + Ltime // the time: 01:23:23 + Lmicroseconds // microsecond resolution: 01:23:23.123123. assumes Ltime. + Llongfile // full file name and line number: /a/b/c/d.go:23 + Lshortfile // final file name element and line number: d.go:23. overrides Llongfile + Lmodule // module name + Llevel // level: 0(Debug), 1(Info), 2(Warn), 3(Error), 4(Panic), 5(Fatal) + Llongcolor // color will start [info] end of line + Lshortcolor // color only include [info] + LstdFlags = Ldate | Ltime // initial values for the standard logger + //Ldefault = Llevel | LstdFlags | Lshortfile | Llongcolor +) // [prefix][time][level][module][shortfile|longfile] + +func Ldefault() int { + if runtime.GOOS == "windows" { + return Llevel | LstdFlags | Lshortfile + } + return Llevel | LstdFlags | Lshortfile | Llongcolor +} + +func Version() string { + return "0.2.0.1121" +} + +const ( + Lall = iota +) +const ( + Ldebug = iota + Linfo + Lwarn + Lerror + Lpanic + Lfatal + Lnone +) + +const ( + ForeBlack = iota + 30 //30 + ForeRed //31 + ForeGreen //32 + ForeYellow //33 + ForeBlue //34 + ForePurple //35 + ForeCyan //36 + ForeWhite //37 +) + +const ( + BackBlack = iota + 40 //40 + BackRed //41 + BackGreen //42 + BackYellow //43 + BackBlue //44 + BackPurple //45 + BackCyan //46 + BackWhite //47 +) + +var levels = []string{ + "[Debug]", + "[Info]", + "[Warn]", + "[Error]", + "[Panic]", + "[Fatal]", +} + +// MUST called before all logs +func SetLevels(lvs []string) { + levels = lvs +} + +var colors = []int{ + ForeCyan, + ForeGreen, + ForeYellow, + ForeRed, + ForePurple, + ForeBlue, +} + +// MUST called before all logs +func SetColors(cls []int) { + colors = cls +} + +// A Logger represents an active logging object that generates lines of +// output to an io.Writer. Each logging operation makes a single call to +// the Writer's Write method. A Logger can be used simultaneously from +// multiple goroutines; it guarantees to serialize access to the Writer. +type Logger struct { + mu sync.Mutex // ensures atomic writes; protects the following fields + prefix string // prefix to write at beginning of each line + flag int // properties + Level int + out io.Writer // destination for output + buf bytes.Buffer // for accumulating text to write + levelStats [6]int64 + loc *time.Location +} + +// New creates a new Logger. The out variable sets the +// destination to which log data will be written. +// The prefix appears at the beginning of each generated log line. +// The flag argument defines the logging properties. +func New(out io.Writer, prefix string, flag int) *Logger { + l := &Logger{out: out, prefix: prefix, Level: 1, flag: flag, loc: time.Local} + if out != os.Stdout { + l.flag = RmColorFlags(l.flag) + } + return l +} + +var Std = New(os.Stderr, "", Ldefault()) + +// Cheap integer to fixed-width decimal ASCII. Give a negative width to avoid zero-padding. +// Knows the buffer has capacity. +func itoa(buf *bytes.Buffer, i int, wid int) { + var u uint = uint(i) + if u == 0 && wid <= 1 { + buf.WriteByte('0') + return + } + + // Assemble decimal in reverse order. + var b [32]byte + bp := len(b) + for ; u > 0 || wid > 0; u /= 10 { + bp-- + wid-- + b[bp] = byte(u%10) + '0' + } + + // avoid slicing b to avoid an allocation. 
+ for bp < len(b) { + buf.WriteByte(b[bp]) + bp++ + } +} + +func moduleOf(file string) string { + pos := strings.LastIndex(file, "/") + if pos != -1 { + pos1 := strings.LastIndex(file[:pos], "/src/") + if pos1 != -1 { + return file[pos1+5 : pos] + } + } + return "UNKNOWN" +} + +func (l *Logger) formatHeader(buf *bytes.Buffer, t time.Time, + file string, line int, lvl int, reqId string) { + if l.prefix != "" { + buf.WriteString(l.prefix) + } + if l.flag&(Ldate|Ltime|Lmicroseconds) != 0 { + if l.flag&Ldate != 0 { + year, month, day := t.Date() + itoa(buf, year, 4) + buf.WriteByte('/') + itoa(buf, int(month), 2) + buf.WriteByte('/') + itoa(buf, day, 2) + buf.WriteByte(' ') + } + if l.flag&(Ltime|Lmicroseconds) != 0 { + hour, min, sec := t.Clock() + itoa(buf, hour, 2) + buf.WriteByte(':') + itoa(buf, min, 2) + buf.WriteByte(':') + itoa(buf, sec, 2) + if l.flag&Lmicroseconds != 0 { + buf.WriteByte('.') + itoa(buf, t.Nanosecond()/1e3, 6) + } + buf.WriteByte(' ') + } + } + if reqId != "" { + buf.WriteByte('[') + buf.WriteString(reqId) + buf.WriteByte(']') + buf.WriteByte(' ') + } + + if l.flag&(Lshortcolor|Llongcolor) != 0 { + buf.WriteString(fmt.Sprintf("\033[1;%dm", colors[lvl])) + } + if l.flag&Llevel != 0 { + buf.WriteString(levels[lvl]) + buf.WriteByte(' ') + } + if l.flag&Lshortcolor != 0 { + buf.WriteString("\033[0m") + } + + if l.flag&Lmodule != 0 { + buf.WriteByte('[') + buf.WriteString(moduleOf(file)) + buf.WriteByte(']') + buf.WriteByte(' ') + } + if l.flag&(Lshortfile|Llongfile) != 0 { + if l.flag&Lshortfile != 0 { + short := file + for i := len(file) - 1; i > 0; i-- { + if file[i] == '/' { + short = file[i+1:] + break + } + } + file = short + } + buf.WriteString(file) + buf.WriteByte(':') + itoa(buf, line, -1) + buf.WriteByte(' ') + } +} + +// Output writes the output for a logging event. The string s contains +// the text to print after the prefix specified by the flags of the +// Logger. A newline is appended if the last character of s is not +// already a newline. Calldepth is used to recover the PC and is +// provided for generality, although at the moment on all pre-defined +// paths it will be 2. +func (l *Logger) Output(reqId string, lvl int, calldepth int, s string) error { + if lvl < l.Level { + return nil + } + now := time.Now().In(l.loc) // get this early. + var file string + var line int + l.mu.Lock() + defer l.mu.Unlock() + if l.flag&(Lshortfile|Llongfile|Lmodule) != 0 { + // release lock while getting caller info - it's expensive. + l.mu.Unlock() + var ok bool + _, file, line, ok = runtime.Caller(calldepth) + if !ok { + file = "???" + line = 0 + } + l.mu.Lock() + } + l.levelStats[lvl]++ + l.buf.Reset() + l.formatHeader(&l.buf, now, file, line, lvl, reqId) + l.buf.WriteString(s) + if l.flag&Llongcolor != 0 { + l.buf.WriteString("\033[0m") + } + if len(s) > 0 && s[len(s)-1] != '\n' { + l.buf.WriteByte('\n') + } + _, err := l.out.Write(l.buf.Bytes()) + return err +} + +// ----------------------------------------- + +// Printf calls l.Output to print to the logger. +// Arguments are handled in the manner of fmt.Printf. +func (l *Logger) Printf(format string, v ...interface{}) { + l.Output("", Linfo, 2, fmt.Sprintf(format, v...)) +} + +// Print calls l.Output to print to the logger. +// Arguments are handled in the manner of fmt.Print. +func (l *Logger) Print(v ...interface{}) { + l.Output("", Linfo, 2, fmt.Sprint(v...)) +} + +// Println calls l.Output to print to the logger. +// Arguments are handled in the manner of fmt.Println. 
+func (l *Logger) Println(v ...interface{}) { + l.Output("", Linfo, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- + +func (l *Logger) Debugf(format string, v ...interface{}) { + l.Output("", Ldebug, 2, fmt.Sprintf(format, v...)) +} + +func (l *Logger) Debug(v ...interface{}) { + l.Output("", Ldebug, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- +func (l *Logger) Infof(format string, v ...interface{}) { + l.Output("", Linfo, 2, fmt.Sprintf(format, v...)) +} + +func (l *Logger) Info(v ...interface{}) { + l.Output("", Linfo, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- +func (l *Logger) Warnf(format string, v ...interface{}) { + l.Output("", Lwarn, 2, fmt.Sprintf(format, v...)) +} + +func (l *Logger) Warn(v ...interface{}) { + l.Output("", Lwarn, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- + +func (l *Logger) Errorf(format string, v ...interface{}) { + l.Output("", Lerror, 2, fmt.Sprintf(format, v...)) +} + +func (l *Logger) Error(v ...interface{}) { + l.Output("", Lerror, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- + +func (l *Logger) Fatal(v ...interface{}) { + l.Output("", Lfatal, 2, fmt.Sprintln(v...)) + os.Exit(1) +} + +// Fatalf is equivalent to l.Printf() followed by a call to os.Exit(1). +func (l *Logger) Fatalf(format string, v ...interface{}) { + l.Output("", Lfatal, 2, fmt.Sprintf(format, v...)) + os.Exit(1) +} + +// ----------------------------------------- +// Panic is equivalent to l.Print() followed by a call to panic(). +func (l *Logger) Panic(v ...interface{}) { + s := fmt.Sprintln(v...) + l.Output("", Lpanic, 2, s) + panic(s) +} + +// Panicf is equivalent to l.Printf() followed by a call to panic(). +func (l *Logger) Panicf(format string, v ...interface{}) { + s := fmt.Sprintf(format, v...) + l.Output("", Lpanic, 2, s) + panic(s) +} + +// ----------------------------------------- +func (l *Logger) Stack(v ...interface{}) { + s := fmt.Sprint(v...) + s += "\n" + buf := make([]byte, 1024*1024) + n := runtime.Stack(buf, true) + s += string(buf[:n]) + s += "\n" + l.Output("", Lerror, 2, s) +} + +// ----------------------------------------- +func (l *Logger) Stat() (stats []int64) { + l.mu.Lock() + v := l.levelStats + l.mu.Unlock() + return v[:] +} + +// Flags returns the output flags for the logger. +func (l *Logger) Flags() int { + l.mu.Lock() + defer l.mu.Unlock() + return l.flag +} + +func RmColorFlags(flag int) int { + // for un std out, it should not show color since almost them don't support + if flag&Llongcolor != 0 { + flag = flag ^ Llongcolor + } + if flag&Lshortcolor != 0 { + flag = flag ^ Lshortcolor + } + return flag +} + +func (l *Logger) Location() *time.Location { + return l.loc +} + +func (l *Logger) SetLocation(loc *time.Location) { + l.loc = loc +} + +// SetFlags sets the output flags for the logger. +func (l *Logger) SetFlags(flag int) { + l.mu.Lock() + defer l.mu.Unlock() + if l.out != os.Stdout { + flag = RmColorFlags(flag) + } + l.flag = flag +} + +// Prefix returns the output prefix for the logger. +func (l *Logger) Prefix() string { + l.mu.Lock() + defer l.mu.Unlock() + return l.prefix +} + +// SetPrefix sets the output prefix for the logger. +func (l *Logger) SetPrefix(prefix string) { + l.mu.Lock() + defer l.mu.Unlock() + l.prefix = prefix +} + +// SetOutputLevel sets the output level for the logger. 
+func (l *Logger) SetOutputLevel(lvl int) { + l.mu.Lock() + defer l.mu.Unlock() + l.Level = lvl +} + +func (l *Logger) OutputLevel() int { + return l.Level +} + +func (l *Logger) SetOutput(w io.Writer) { + l.mu.Lock() + defer l.mu.Unlock() + l.out = w + if w != os.Stdout { + l.flag = RmColorFlags(l.flag) + } +} + +// SetOutput sets the output destination for the standard logger. +func SetOutput(w io.Writer) { + Std.SetOutput(w) +} + +func SetLocation(loc *time.Location) { + Std.SetLocation(loc) +} + +func Location() *time.Location { + return Std.Location() +} + +// Flags returns the output flags for the standard logger. +func Flags() int { + return Std.Flags() +} + +// SetFlags sets the output flags for the standard logger. +func SetFlags(flag int) { + Std.SetFlags(flag) +} + +// Prefix returns the output prefix for the standard logger. +func Prefix() string { + return Std.Prefix() +} + +// SetPrefix sets the output prefix for the standard logger. +func SetPrefix(prefix string) { + Std.SetPrefix(prefix) +} + +func SetOutputLevel(lvl int) { + Std.SetOutputLevel(lvl) +} + +func OutputLevel() int { + return Std.OutputLevel() +} + +// ----------------------------------------- + +// Print calls Output to print to the standard logger. +// Arguments are handled in the manner of fmt.Print. +func Print(v ...interface{}) { + Std.Output("", Linfo, 2, fmt.Sprintln(v...)) +} + +// Printf calls Output to print to the standard logger. +// Arguments are handled in the manner of fmt.Printf. +func Printf(format string, v ...interface{}) { + Std.Output("", Linfo, 2, fmt.Sprintf(format, v...)) +} + +// Println calls Output to print to the standard logger. +// Arguments are handled in the manner of fmt.Println. +func Println(v ...interface{}) { + Std.Output("", Linfo, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- + +func Debugf(format string, v ...interface{}) { + Std.Output("", Ldebug, 2, fmt.Sprintf(format, v...)) +} + +func Debug(v ...interface{}) { + Std.Output("", Ldebug, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- + +func Infof(format string, v ...interface{}) { + Std.Output("", Linfo, 2, fmt.Sprintf(format, v...)) +} + +func Info(v ...interface{}) { + Std.Output("", Linfo, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- + +func Warnf(format string, v ...interface{}) { + Std.Output("", Lwarn, 2, fmt.Sprintf(format, v...)) +} + +func Warn(v ...interface{}) { + Std.Output("", Lwarn, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- + +func Errorf(format string, v ...interface{}) { + Std.Output("", Lerror, 2, fmt.Sprintf(format, v...)) +} + +func Error(v ...interface{}) { + Std.Output("", Lerror, 2, fmt.Sprintln(v...)) +} + +// ----------------------------------------- + +// Fatal is equivalent to Print() followed by a call to os.Exit(1). +func Fatal(v ...interface{}) { + Std.Output("", Lfatal, 2, fmt.Sprintln(v...)) +} + +// Fatalf is equivalent to Printf() followed by a call to os.Exit(1). +func Fatalf(format string, v ...interface{}) { + Std.Output("", Lfatal, 2, fmt.Sprintf(format, v...)) +} + +// ----------------------------------------- + +// Panic is equivalent to Print() followed by a call to panic(). +func Panic(v ...interface{}) { + Std.Output("", Lpanic, 2, fmt.Sprintln(v...)) +} + +// Panicf is equivalent to Printf() followed by a call to panic(). 
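
lunny/log is vendored here because lunny/nodb's binlog code, added below, imports it for its own logging. A small usage sketch, editorial only (the ./logs directory, prefix, and messages are invented), combines the daily file writer from filewriter.go with the levelled logger from logext.go:

    package main

    import "github.com/lunny/log"

    func main() {
        // One log file per day under ./logs, as implemented by Files above.
        w := log.NewFileWriter(log.FileOptions{Dir: "./logs", ByType: log.ByDay})
        defer w.Close()

        l := log.New(w, "[nodb] ", log.Ldefault()) // color flags are stripped for non-stdout writers
        l.SetOutputLevel(log.Linfo)

        l.Infof("opened binlog index %d", 1)
        l.Warn("written to ./logs/<date>.log")
        l.Debug("dropped: Ldebug is below the Linfo output level")
    }
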
+func Panicf(format string, v ...interface{}) { + Std.Output("", Lpanic, 2, fmt.Sprintf(format, v...)) +} + +// ----------------------------------------- + +func Stack(v ...interface{}) { + s := fmt.Sprint(v...) + s += "\n" + buf := make([]byte, 1024*1024) + n := runtime.Stack(buf, true) + s += string(buf[:n]) + s += "\n" + Std.Output("", Lerror, 2, s) +} + +// ----------------------------------------- diff --git a/vendor/github.com/lunny/nodb/LICENSE b/vendor/github.com/lunny/nodb/LICENSE new file mode 100644 index 000000000000..7ece9fdf5a64 --- /dev/null +++ b/vendor/github.com/lunny/nodb/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 siddontang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/vendor/github.com/lunny/nodb/batch.go b/vendor/github.com/lunny/nodb/batch.go new file mode 100644 index 000000000000..e69d96a122e9 --- /dev/null +++ b/vendor/github.com/lunny/nodb/batch.go @@ -0,0 +1,106 @@ +package nodb + +import ( + "sync" + + "github.com/lunny/nodb/store" +) + +type batch struct { + l *Nodb + + store.WriteBatch + + sync.Locker + + logs [][]byte + + tx *Tx +} + +func (b *batch) Commit() error { + b.l.commitLock.Lock() + defer b.l.commitLock.Unlock() + + err := b.WriteBatch.Commit() + + if b.l.binlog != nil { + if err == nil { + if b.tx == nil { + b.l.binlog.Log(b.logs...) + } else { + b.tx.logs = append(b.tx.logs, b.logs...) 
+ } + } + b.logs = [][]byte{} + } + + return err +} + +func (b *batch) Lock() { + b.Locker.Lock() +} + +func (b *batch) Unlock() { + if b.l.binlog != nil { + b.logs = [][]byte{} + } + b.WriteBatch.Rollback() + b.Locker.Unlock() +} + +func (b *batch) Put(key []byte, value []byte) { + if b.l.binlog != nil { + buf := encodeBinLogPut(key, value) + b.logs = append(b.logs, buf) + } + b.WriteBatch.Put(key, value) +} + +func (b *batch) Delete(key []byte) { + if b.l.binlog != nil { + buf := encodeBinLogDelete(key) + b.logs = append(b.logs, buf) + } + b.WriteBatch.Delete(key) +} + +type dbBatchLocker struct { + l *sync.Mutex + wrLock *sync.RWMutex +} + +func (l *dbBatchLocker) Lock() { + l.wrLock.RLock() + l.l.Lock() +} + +func (l *dbBatchLocker) Unlock() { + l.l.Unlock() + l.wrLock.RUnlock() +} + +type txBatchLocker struct { +} + +func (l *txBatchLocker) Lock() {} +func (l *txBatchLocker) Unlock() {} + +type multiBatchLocker struct { +} + +func (l *multiBatchLocker) Lock() {} +func (l *multiBatchLocker) Unlock() {} + +func (l *Nodb) newBatch(wb store.WriteBatch, locker sync.Locker, tx *Tx) *batch { + b := new(batch) + b.l = l + b.WriteBatch = wb + + b.tx = tx + b.Locker = locker + + b.logs = [][]byte{} + return b +} diff --git a/vendor/github.com/lunny/nodb/binlog.go b/vendor/github.com/lunny/nodb/binlog.go new file mode 100644 index 000000000000..4c094d946302 --- /dev/null +++ b/vendor/github.com/lunny/nodb/binlog.go @@ -0,0 +1,391 @@ +package nodb + +import ( + "bufio" + "encoding/binary" + "fmt" + "io" + "io/ioutil" + "os" + "path" + "strconv" + "strings" + "sync" + "time" + + "github.com/lunny/log" + "github.com/lunny/nodb/config" +) + +type BinLogHead struct { + CreateTime uint32 + BatchId uint32 + PayloadLen uint32 +} + +func (h *BinLogHead) Len() int { + return 12 +} + +func (h *BinLogHead) Write(w io.Writer) error { + if err := binary.Write(w, binary.BigEndian, h.CreateTime); err != nil { + return err + } + + if err := binary.Write(w, binary.BigEndian, h.BatchId); err != nil { + return err + } + + if err := binary.Write(w, binary.BigEndian, h.PayloadLen); err != nil { + return err + } + + return nil +} + +func (h *BinLogHead) handleReadError(err error) error { + if err == io.EOF { + return io.ErrUnexpectedEOF + } else { + return err + } +} + +func (h *BinLogHead) Read(r io.Reader) error { + var err error + if err = binary.Read(r, binary.BigEndian, &h.CreateTime); err != nil { + return err + } + + if err = binary.Read(r, binary.BigEndian, &h.BatchId); err != nil { + return h.handleReadError(err) + } + + if err = binary.Read(r, binary.BigEndian, &h.PayloadLen); err != nil { + return h.handleReadError(err) + } + + return nil +} + +func (h *BinLogHead) InSameBatch(ho *BinLogHead) bool { + if h.CreateTime == ho.CreateTime && h.BatchId == ho.BatchId { + return true + } else { + return false + } +} + +/* +index file format: +ledis-bin.00001 +ledis-bin.00002 +ledis-bin.00003 + +log file format + +Log: Head|PayloadData + +Head: createTime|batchId|payloadData + +*/ + +type BinLog struct { + sync.Mutex + + path string + + cfg *config.BinLogConfig + + logFile *os.File + + logWb *bufio.Writer + + indexName string + logNames []string + lastLogIndex int64 + + batchId uint32 + + ch chan struct{} +} + +func NewBinLog(cfg *config.Config) (*BinLog, error) { + l := new(BinLog) + + l.cfg = &cfg.BinLog + l.cfg.Adjust() + + l.path = path.Join(cfg.DataDir, "binlog") + + if err := os.MkdirAll(l.path, os.ModePerm); err != nil { + return nil, err + } + + l.logNames = make([]string, 0, 16) + + l.ch = make(chan 
struct{}) + + if err := l.loadIndex(); err != nil { + return nil, err + } + + return l, nil +} + +func (l *BinLog) flushIndex() error { + data := strings.Join(l.logNames, "\n") + + bakName := fmt.Sprintf("%s.bak", l.indexName) + f, err := os.OpenFile(bakName, os.O_WRONLY|os.O_CREATE, 0666) + if err != nil { + log.Error("create binlog bak index error %s", err.Error()) + return err + } + + if _, err := f.WriteString(data); err != nil { + log.Error("write binlog index error %s", err.Error()) + f.Close() + return err + } + + f.Close() + + if err := os.Rename(bakName, l.indexName); err != nil { + log.Error("rename binlog bak index error %s", err.Error()) + return err + } + + return nil +} + +func (l *BinLog) loadIndex() error { + l.indexName = path.Join(l.path, fmt.Sprintf("ledis-bin.index")) + if _, err := os.Stat(l.indexName); os.IsNotExist(err) { + //no index file, nothing to do + } else { + indexData, err := ioutil.ReadFile(l.indexName) + if err != nil { + return err + } + + lines := strings.Split(string(indexData), "\n") + for _, line := range lines { + line = strings.Trim(line, "\r\n ") + if len(line) == 0 { + continue + } + + if _, err := os.Stat(path.Join(l.path, line)); err != nil { + log.Error("load index line %s error %s", line, err.Error()) + return err + } else { + l.logNames = append(l.logNames, line) + } + } + } + if l.cfg.MaxFileNum > 0 && len(l.logNames) > l.cfg.MaxFileNum { + //remove oldest logfile + if err := l.Purge(len(l.logNames) - l.cfg.MaxFileNum); err != nil { + return err + } + } + + var err error + if len(l.logNames) == 0 { + l.lastLogIndex = 1 + } else { + lastName := l.logNames[len(l.logNames)-1] + + if l.lastLogIndex, err = strconv.ParseInt(path.Ext(lastName)[1:], 10, 64); err != nil { + log.Error("invalid logfile name %s", err.Error()) + return err + } + + //like mysql, if server restart, a new binlog will create + l.lastLogIndex++ + } + + return nil +} + +func (l *BinLog) getLogFile() string { + return l.FormatLogFileName(l.lastLogIndex) +} + +func (l *BinLog) openNewLogFile() error { + var err error + lastName := l.getLogFile() + + logPath := path.Join(l.path, lastName) + if l.logFile, err = os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY, 0666); err != nil { + log.Error("open new logfile error %s", err.Error()) + return err + } + + if l.cfg.MaxFileNum > 0 && len(l.logNames) == l.cfg.MaxFileNum { + l.purge(1) + } + + l.logNames = append(l.logNames, lastName) + + if l.logWb == nil { + l.logWb = bufio.NewWriterSize(l.logFile, 1024) + } else { + l.logWb.Reset(l.logFile) + } + + if err = l.flushIndex(); err != nil { + return err + } + + return nil +} + +func (l *BinLog) checkLogFileSize() bool { + if l.logFile == nil { + return false + } + + st, _ := l.logFile.Stat() + if st.Size() >= int64(l.cfg.MaxFileSize) { + l.closeLog() + return true + } + + return false +} + +func (l *BinLog) closeLog() { + l.lastLogIndex++ + + l.logFile.Close() + l.logFile = nil +} + +func (l *BinLog) purge(n int) { + for i := 0; i < n; i++ { + logPath := path.Join(l.path, l.logNames[i]) + os.Remove(logPath) + } + + copy(l.logNames[0:], l.logNames[n:]) + l.logNames = l.logNames[0 : len(l.logNames)-n] +} + +func (l *BinLog) Close() { + if l.logFile != nil { + l.logFile.Close() + l.logFile = nil + } +} + +func (l *BinLog) LogNames() []string { + return l.logNames +} + +func (l *BinLog) LogFileName() string { + return l.getLogFile() +} + +func (l *BinLog) LogFilePos() int64 { + if l.logFile == nil { + return 0 + } else { + st, _ := l.logFile.Stat() + return st.Size() + } +} + +func (l *BinLog) 
LogFileIndex() int64 { + return l.lastLogIndex +} + +func (l *BinLog) FormatLogFileName(index int64) string { + return fmt.Sprintf("ledis-bin.%07d", index) +} + +func (l *BinLog) FormatLogFilePath(index int64) string { + return path.Join(l.path, l.FormatLogFileName(index)) +} + +func (l *BinLog) LogPath() string { + return l.path +} + +func (l *BinLog) Purge(n int) error { + l.Lock() + defer l.Unlock() + + if len(l.logNames) == 0 { + return nil + } + + if n >= len(l.logNames) { + n = len(l.logNames) + //can not purge current log file + if l.logNames[n-1] == l.getLogFile() { + n = n - 1 + } + } + + l.purge(n) + + return l.flushIndex() +} + +func (l *BinLog) PurgeAll() error { + l.Lock() + defer l.Unlock() + + l.closeLog() + return l.openNewLogFile() +} + +func (l *BinLog) Log(args ...[]byte) error { + l.Lock() + defer l.Unlock() + + var err error + + if l.logFile == nil { + if err = l.openNewLogFile(); err != nil { + return err + } + } + + head := &BinLogHead{} + + head.CreateTime = uint32(time.Now().Unix()) + head.BatchId = l.batchId + + l.batchId++ + + for _, data := range args { + head.PayloadLen = uint32(len(data)) + + if err := head.Write(l.logWb); err != nil { + return err + } + + if _, err := l.logWb.Write(data); err != nil { + return err + } + } + + if err = l.logWb.Flush(); err != nil { + log.Error("write log error %s", err.Error()) + return err + } + + l.checkLogFileSize() + + close(l.ch) + l.ch = make(chan struct{}) + + return nil +} + +func (l *BinLog) Wait() <-chan struct{} { + return l.ch +} diff --git a/vendor/github.com/lunny/nodb/binlog_util.go b/vendor/github.com/lunny/nodb/binlog_util.go new file mode 100644 index 000000000000..22124dda07d8 --- /dev/null +++ b/vendor/github.com/lunny/nodb/binlog_util.go @@ -0,0 +1,215 @@ +package nodb + +import ( + "encoding/binary" + "errors" + "fmt" + "strconv" +) + +var ( + errBinLogDeleteType = errors.New("invalid bin log delete type") + errBinLogPutType = errors.New("invalid bin log put type") + errBinLogCommandType = errors.New("invalid bin log command type") +) + +func encodeBinLogDelete(key []byte) []byte { + buf := make([]byte, 1+len(key)) + buf[0] = BinLogTypeDeletion + copy(buf[1:], key) + return buf +} + +func decodeBinLogDelete(sz []byte) ([]byte, error) { + if len(sz) < 1 || sz[0] != BinLogTypeDeletion { + return nil, errBinLogDeleteType + } + + return sz[1:], nil +} + +func encodeBinLogPut(key []byte, value []byte) []byte { + buf := make([]byte, 3+len(key)+len(value)) + buf[0] = BinLogTypePut + pos := 1 + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + copy(buf[pos:], key) + pos += len(key) + copy(buf[pos:], value) + + return buf +} + +func decodeBinLogPut(sz []byte) ([]byte, []byte, error) { + if len(sz) < 3 || sz[0] != BinLogTypePut { + return nil, nil, errBinLogPutType + } + + keyLen := int(binary.BigEndian.Uint16(sz[1:])) + if 3+keyLen > len(sz) { + return nil, nil, errBinLogPutType + } + + return sz[3 : 3+keyLen], sz[3+keyLen:], nil +} + +func FormatBinLogEvent(event []byte) (string, error) { + logType := uint8(event[0]) + + var err error + var k []byte + var v []byte + + var buf []byte = make([]byte, 0, 1024) + + switch logType { + case BinLogTypePut: + k, v, err = decodeBinLogPut(event) + buf = append(buf, "PUT "...) + case BinLogTypeDeletion: + k, err = decodeBinLogDelete(event) + buf = append(buf, "DELETE "...) 
+ default: + err = errInvalidBinLogEvent + } + + if err != nil { + return "", err + } + + if buf, err = formatDataKey(buf, k); err != nil { + return "", err + } + + if v != nil && len(v) != 0 { + buf = append(buf, fmt.Sprintf(" %q", v)...) + } + + return String(buf), nil +} + +func formatDataKey(buf []byte, k []byte) ([]byte, error) { + if len(k) < 2 { + return nil, errInvalidBinLogEvent + } + + buf = append(buf, fmt.Sprintf("DB:%2d ", k[0])...) + buf = append(buf, fmt.Sprintf("%s ", TypeName[k[1]])...) + + db := new(DB) + db.index = k[0] + + //to do format at respective place + + switch k[1] { + case KVType: + if key, err := db.decodeKVKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + } + case HashType: + if key, field, err := db.hDecodeHashKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, String(field)) + } + case HSizeType: + if key, err := db.hDecodeSizeKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + } + case ListType: + if key, seq, err := db.lDecodeListKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, int64(seq), 10) + } + case LMetaType: + if key, err := db.lDecodeMetaKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + } + case ZSetType: + if key, m, err := db.zDecodeSetKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, String(m)) + } + case ZSizeType: + if key, err := db.zDecodeSizeKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + } + case ZScoreType: + if key, m, score, err := db.zDecodeScoreKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, String(m)) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, score, 10) + } + case BitType: + if key, seq, err := db.bDecodeBinKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + buf = append(buf, ' ') + buf = strconv.AppendUint(buf, uint64(seq), 10) + } + case BitMetaType: + if key, err := db.bDecodeMetaKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + } + case SetType: + if key, member, err := db.sDecodeSetKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, String(member)) + } + case SSizeType: + if key, err := db.sDecodeSizeKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, String(key)) + } + case ExpTimeType: + if tp, key, t, err := db.expDecodeTimeKey(k); err != nil { + return nil, err + } else { + buf = append(buf, TypeName[tp]...) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, String(key)) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, t, 10) + } + case ExpMetaType: + if tp, key, err := db.expDecodeMetaKey(k); err != nil { + return nil, err + } else { + buf = append(buf, TypeName[tp]...) 
+ buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, String(key)) + } + default: + return nil, errInvalidBinLogEvent + } + + return buf, nil +} diff --git a/vendor/github.com/lunny/nodb/config/config.go b/vendor/github.com/lunny/nodb/config/config.go new file mode 100644 index 000000000000..3b44d3043f04 --- /dev/null +++ b/vendor/github.com/lunny/nodb/config/config.go @@ -0,0 +1,135 @@ +package config + +import ( + "io/ioutil" + + "github.com/BurntSushi/toml" +) + +type Size int + +const ( + DefaultAddr string = "127.0.0.1:6380" + DefaultHttpAddr string = "127.0.0.1:11181" + + DefaultDBName string = "goleveldb" + + DefaultDataDir string = "./data" +) + +const ( + MaxBinLogFileSize int = 1024 * 1024 * 1024 + MaxBinLogFileNum int = 10000 + + DefaultBinLogFileSize int = MaxBinLogFileSize + DefaultBinLogFileNum int = 10 +) + +type LevelDBConfig struct { + Compression bool `toml:"compression"` + BlockSize int `toml:"block_size"` + WriteBufferSize int `toml:"write_buffer_size"` + CacheSize int `toml:"cache_size"` + MaxOpenFiles int `toml:"max_open_files"` +} + +type LMDBConfig struct { + MapSize int `toml:"map_size"` + NoSync bool `toml:"nosync"` +} + +type BinLogConfig struct { + MaxFileSize int `toml:"max_file_size"` + MaxFileNum int `toml:"max_file_num"` +} + +type Config struct { + DataDir string `toml:"data_dir"` + + DBName string `toml:"db_name"` + + LevelDB LevelDBConfig `toml:"leveldb"` + + LMDB LMDBConfig `toml:"lmdb"` + + BinLog BinLogConfig `toml:"binlog"` + + SlaveOf string `toml:"slaveof"` + + AccessLog string `toml:"access_log"` +} + +func NewConfigWithFile(fileName string) (*Config, error) { + data, err := ioutil.ReadFile(fileName) + if err != nil { + return nil, err + } + + return NewConfigWithData(data) +} + +func NewConfigWithData(data []byte) (*Config, error) { + cfg := NewConfigDefault() + + _, err := toml.Decode(string(data), cfg) + if err != nil { + return nil, err + } + + return cfg, nil +} + +func NewConfigDefault() *Config { + cfg := new(Config) + + cfg.DataDir = DefaultDataDir + + cfg.DBName = DefaultDBName + + // disable binlog + cfg.BinLog.MaxFileNum = 0 + cfg.BinLog.MaxFileSize = 0 + + // disable replication + cfg.SlaveOf = "" + + // disable access log + cfg.AccessLog = "" + + cfg.LMDB.MapSize = 20 * 1024 * 1024 + cfg.LMDB.NoSync = true + + return cfg +} + +func (cfg *LevelDBConfig) Adjust() { + if cfg.CacheSize <= 0 { + cfg.CacheSize = 4 * 1024 * 1024 + } + + if cfg.BlockSize <= 0 { + cfg.BlockSize = 4 * 1024 + } + + if cfg.WriteBufferSize <= 0 { + cfg.WriteBufferSize = 4 * 1024 * 1024 + } + + if cfg.MaxOpenFiles < 1024 { + cfg.MaxOpenFiles = 1024 + } +} + +func (cfg *BinLogConfig) Adjust() { + if cfg.MaxFileSize <= 0 { + cfg.MaxFileSize = DefaultBinLogFileSize + } else if cfg.MaxFileSize > MaxBinLogFileSize { + cfg.MaxFileSize = MaxBinLogFileSize + } + + if cfg.MaxFileNum <= 0 { + cfg.MaxFileNum = DefaultBinLogFileNum + } else if cfg.MaxFileNum > MaxBinLogFileNum { + cfg.MaxFileNum = MaxBinLogFileNum + } +} diff --git a/vendor/github.com/lunny/nodb/const.go b/vendor/github.com/lunny/nodb/const.go new file mode 100644 index 000000000000..446dae634ede --- /dev/null +++ b/vendor/github.com/lunny/nodb/const.go @@ -0,0 +1,98 @@ +package nodb + +import ( + "errors" +) + +const ( + NoneType byte = 0 + KVType byte = 1 + HashType byte = 2 + HSizeType byte = 3 + ListType byte = 4 + LMetaType byte = 5 + ZSetType byte = 6 + ZSizeType byte = 7 + ZScoreType byte = 8 + BitType byte = 9 + BitMetaType byte = 10 + SetType byte = 11 + SSizeType byte = 12 + + maxDataType byte = 
100 + + ExpTimeType byte = 101 + ExpMetaType byte = 102 +) + +var ( + TypeName = map[byte]string{ + KVType: "kv", + HashType: "hash", + HSizeType: "hsize", + ListType: "list", + LMetaType: "lmeta", + ZSetType: "zset", + ZSizeType: "zsize", + ZScoreType: "zscore", + BitType: "bit", + BitMetaType: "bitmeta", + SetType: "set", + SSizeType: "ssize", + ExpTimeType: "exptime", + ExpMetaType: "expmeta", + } +) + +const ( + defaultScanCount int = 10 +) + +var ( + errKeySize = errors.New("invalid key size") + errValueSize = errors.New("invalid value size") + errHashFieldSize = errors.New("invalid hash field size") + errSetMemberSize = errors.New("invalid set member size") + errZSetMemberSize = errors.New("invalid zset member size") + errExpireValue = errors.New("invalid expire value") +) + +const ( + //we don't support too many databases + MaxDBNumber uint8 = 16 + + //max key size + MaxKeySize int = 1024 + + //max hash field size + MaxHashFieldSize int = 1024 + + //max zset member size + MaxZSetMemberSize int = 1024 + + //max set member size + MaxSetMemberSize int = 1024 + + //max value size + MaxValueSize int = 10 * 1024 * 1024 +) + +var ( + ErrScoreMiss = errors.New("zset score miss") +) + +const ( + BinLogTypeDeletion uint8 = 0x0 + BinLogTypePut uint8 = 0x1 + BinLogTypeCommand uint8 = 0x2 +) + +const ( + DBAutoCommit uint8 = 0x0 + DBInTransaction uint8 = 0x1 + DBInMulti uint8 = 0x2 +) + +var ( + Version = "0.1" +) diff --git a/vendor/github.com/lunny/nodb/doc.go b/vendor/github.com/lunny/nodb/doc.go new file mode 100644 index 000000000000..2f7df33ffd03 --- /dev/null +++ b/vendor/github.com/lunny/nodb/doc.go @@ -0,0 +1,61 @@ +// package nodb is a high performance embedded NoSQL. +// +// nodb supports various data structure like kv, list, hash and zset like redis. +// +// Other features include binlog replication, data with a limited time-to-live. +// +// Usage +// +// First create a nodb instance before use: +// +// l := nodb.Open(cfg) +// +// cfg is a Config instance which contains configuration for nodb use, +// like DataDir (root directory for nodb working to store data). +// +// After you create a nodb instance, you can select a DB to store you data: +// +// db, _ := l.Select(0) +// +// DB must be selected by a index, nodb supports only 16 databases, so the index range is [0-15]. +// +// KV +// +// KV is the most basic nodb type like any other key-value database. +// +// err := db.Set(key, value) +// value, err := db.Get(key) +// +// List +// +// List is simply lists of values, sorted by insertion order. +// You can push or pop value on the list head (left) or tail (right). +// +// err := db.LPush(key, value1) +// err := db.RPush(key, value2) +// value1, err := db.LPop(key) +// value2, err := db.RPop(key) +// +// Hash +// +// Hash is a map between fields and values. +// +// n, err := db.HSet(key, field1, value1) +// n, err := db.HSet(key, field2, value2) +// value1, err := db.HGet(key, field1) +// value2, err := db.HGet(key, field2) +// +// ZSet +// +// ZSet is a sorted collections of values. +// Every member of zset is associated with score, a int64 value which used to sort, from smallest to greatest score. +// Members are unique, but score may be same. +// +// n, err := db.ZAdd(key, ScorePair{score1, member1}, ScorePair{score2, member2}) +// ay, err := db.ZRangeByScore(key, minScore, maxScore, 0, -1) +// +// Binlog +// +// nodb supports binlog, so you can sync binlog to another server for replication. If you want to open binlog support, set UseBinLog to true in config. 
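+//
+// A fuller end-to-end sketch (illustrative only; the data directory below is hypothetical):
+//
+//	cfg := config.NewConfigDefault()
+//	cfg.DataDir = "/tmp/nodb-example"
+//
+//	l, err := nodb.Open(cfg)
+//	if err != nil {
+//		// handle error
+//	}
+//	defer l.Close()
+//
+//	db, _ := l.Select(0)
+//	_ = db.Set([]byte("foo"), []byte("bar"))
+//	v, _ := db.Get([]byte("foo")) // v == []byte("bar")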
+// +package nodb diff --git a/vendor/github.com/lunny/nodb/dump.go b/vendor/github.com/lunny/nodb/dump.go new file mode 100644 index 000000000000..3c9722e00db6 --- /dev/null +++ b/vendor/github.com/lunny/nodb/dump.go @@ -0,0 +1,200 @@ +package nodb + +import ( + "bufio" + "bytes" + "encoding/binary" + "io" + "os" + + "github.com/siddontang/go-snappy/snappy" +) + +//dump format +// fileIndex(bigendian int64)|filePos(bigendian int64) +// |keylen(bigendian int32)|key|valuelen(bigendian int32)|value...... +// +//key and value are both compressed for fast transfer dump on network using snappy + +type BinLogAnchor struct { + LogFileIndex int64 + LogPos int64 +} + +func (m *BinLogAnchor) WriteTo(w io.Writer) error { + if err := binary.Write(w, binary.BigEndian, m.LogFileIndex); err != nil { + return err + } + + if err := binary.Write(w, binary.BigEndian, m.LogPos); err != nil { + return err + } + return nil +} + +func (m *BinLogAnchor) ReadFrom(r io.Reader) error { + err := binary.Read(r, binary.BigEndian, &m.LogFileIndex) + if err != nil { + return err + } + + err = binary.Read(r, binary.BigEndian, &m.LogPos) + if err != nil { + return err + } + + return nil +} + +func (l *Nodb) DumpFile(path string) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + + return l.Dump(f) +} + +func (l *Nodb) Dump(w io.Writer) error { + m := new(BinLogAnchor) + + var err error + + l.wLock.Lock() + defer l.wLock.Unlock() + + if l.binlog != nil { + m.LogFileIndex = l.binlog.LogFileIndex() + m.LogPos = l.binlog.LogFilePos() + } + + wb := bufio.NewWriterSize(w, 4096) + if err = m.WriteTo(wb); err != nil { + return err + } + + it := l.ldb.NewIterator() + it.SeekToFirst() + + compressBuf := make([]byte, 4096) + + var key []byte + var value []byte + for ; it.Valid(); it.Next() { + key = it.RawKey() + value = it.RawValue() + + if key, err = snappy.Encode(compressBuf, key); err != nil { + return err + } + + if err = binary.Write(wb, binary.BigEndian, uint16(len(key))); err != nil { + return err + } + + if _, err = wb.Write(key); err != nil { + return err + } + + if value, err = snappy.Encode(compressBuf, value); err != nil { + return err + } + + if err = binary.Write(wb, binary.BigEndian, uint32(len(value))); err != nil { + return err + } + + if _, err = wb.Write(value); err != nil { + return err + } + } + + if err = wb.Flush(); err != nil { + return err + } + + compressBuf = nil + + return nil +} + +func (l *Nodb) LoadDumpFile(path string) (*BinLogAnchor, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + return l.LoadDump(f) +} + +func (l *Nodb) LoadDump(r io.Reader) (*BinLogAnchor, error) { + l.wLock.Lock() + defer l.wLock.Unlock() + + info := new(BinLogAnchor) + + rb := bufio.NewReaderSize(r, 4096) + + err := info.ReadFrom(rb) + if err != nil { + return nil, err + } + + var keyLen uint16 + var valueLen uint32 + + var keyBuf bytes.Buffer + var valueBuf bytes.Buffer + + deKeyBuf := make([]byte, 4096) + deValueBuf := make([]byte, 4096) + + var key, value []byte + + for { + if err = binary.Read(rb, binary.BigEndian, &keyLen); err != nil && err != io.EOF { + return nil, err + } else if err == io.EOF { + break + } + + if _, err = io.CopyN(&keyBuf, rb, int64(keyLen)); err != nil { + return nil, err + } + + if key, err = snappy.Decode(deKeyBuf, keyBuf.Bytes()); err != nil { + return nil, err + } + + if err = binary.Read(rb, binary.BigEndian, &valueLen); err != nil { + return nil, err + } + + if _, err = io.CopyN(&valueBuf, rb, 
int64(valueLen)); err != nil { + return nil, err + } + + if value, err = snappy.Decode(deValueBuf, valueBuf.Bytes()); err != nil { + return nil, err + } + + if err = l.ldb.Put(key, value); err != nil { + return nil, err + } + + keyBuf.Reset() + valueBuf.Reset() + } + + deKeyBuf = nil + deValueBuf = nil + + //if binlog enable, we will delete all binlogs and open a new one for handling simply + if l.binlog != nil { + l.binlog.PurgeAll() + } + + return info, nil +} diff --git a/vendor/github.com/lunny/nodb/info.go b/vendor/github.com/lunny/nodb/info.go new file mode 100644 index 000000000000..3fd37e3d4454 --- /dev/null +++ b/vendor/github.com/lunny/nodb/info.go @@ -0,0 +1,24 @@ +package nodb + +// todo, add info + +// type Keyspace struct { +// Kvs int `json:"kvs"` +// KvExpires int `json:"kv_expires"` + +// Lists int `json:"lists"` +// ListExpires int `json:"list_expires"` + +// Bitmaps int `json:"bitmaps"` +// BitmapExpires int `json:"bitmap_expires"` + +// ZSets int `json:"zsets"` +// ZSetExpires int `json:"zset_expires"` + +// Hashes int `json:"hashes"` +// HashExpires int `json:"hahsh_expires"` +// } + +// type Info struct { +// KeySpaces [MaxDBNumber]Keyspace +// } diff --git a/vendor/github.com/lunny/nodb/multi.go b/vendor/github.com/lunny/nodb/multi.go new file mode 100644 index 000000000000..ca581ce9a232 --- /dev/null +++ b/vendor/github.com/lunny/nodb/multi.go @@ -0,0 +1,73 @@ +package nodb + +import ( + "errors" + "fmt" +) + +var ( + ErrNestMulti = errors.New("nest multi not supported") + ErrMultiDone = errors.New("multi has been closed") +) + +type Multi struct { + *DB +} + +func (db *DB) IsInMulti() bool { + return db.status == DBInMulti +} + +// begin a mutli to execute commands, +// it will block any other write operations before you close the multi, unlike transaction, mutli can not rollback +func (db *DB) Multi() (*Multi, error) { + if db.IsInMulti() { + return nil, ErrNestMulti + } + + m := new(Multi) + + m.DB = new(DB) + m.DB.status = DBInMulti + + m.DB.l = db.l + + m.l.wLock.Lock() + + m.DB.sdb = db.sdb + + m.DB.bucket = db.sdb + + m.DB.index = db.index + + m.DB.kvBatch = m.newBatch() + m.DB.listBatch = m.newBatch() + m.DB.hashBatch = m.newBatch() + m.DB.zsetBatch = m.newBatch() + m.DB.binBatch = m.newBatch() + m.DB.setBatch = m.newBatch() + + return m, nil +} + +func (m *Multi) newBatch() *batch { + return m.l.newBatch(m.bucket.NewWriteBatch(), &multiBatchLocker{}, nil) +} + +func (m *Multi) Close() error { + if m.bucket == nil { + return ErrMultiDone + } + m.l.wLock.Unlock() + m.bucket = nil + return nil +} + +func (m *Multi) Select(index int) error { + if index < 0 || index >= int(MaxDBNumber) { + return fmt.Errorf("invalid db index %d", index) + } + + m.DB.index = uint8(index) + return nil +} diff --git a/vendor/github.com/lunny/nodb/nodb.go b/vendor/github.com/lunny/nodb/nodb.go new file mode 100644 index 000000000000..fdd0272c94ef --- /dev/null +++ b/vendor/github.com/lunny/nodb/nodb.go @@ -0,0 +1,128 @@ +package nodb + +import ( + "fmt" + "sync" + "time" + + "github.com/lunny/log" + "github.com/lunny/nodb/config" + "github.com/lunny/nodb/store" +) + +type Nodb struct { + cfg *config.Config + + ldb *store.DB + dbs [MaxDBNumber]*DB + + quit chan struct{} + jobs *sync.WaitGroup + + binlog *BinLog + + wLock sync.RWMutex //allow one write at same time + commitLock sync.Mutex //allow one write commit at same time +} + +func Open(cfg *config.Config) (*Nodb, error) { + if len(cfg.DataDir) == 0 { + cfg.DataDir = config.DefaultDataDir + } + + ldb, err := store.Open(cfg) + if 
err != nil { + return nil, err + } + + l := new(Nodb) + + l.quit = make(chan struct{}) + l.jobs = new(sync.WaitGroup) + + l.ldb = ldb + + if cfg.BinLog.MaxFileNum > 0 && cfg.BinLog.MaxFileSize > 0 { + l.binlog, err = NewBinLog(cfg) + if err != nil { + return nil, err + } + } else { + l.binlog = nil + } + + for i := uint8(0); i < MaxDBNumber; i++ { + l.dbs[i] = l.newDB(i) + } + + l.activeExpireCycle() + + return l, nil +} + +func (l *Nodb) Close() { + close(l.quit) + l.jobs.Wait() + + l.ldb.Close() + + if l.binlog != nil { + l.binlog.Close() + l.binlog = nil + } +} + +func (l *Nodb) Select(index int) (*DB, error) { + if index < 0 || index >= int(MaxDBNumber) { + return nil, fmt.Errorf("invalid db index %d", index) + } + + return l.dbs[index], nil +} + +func (l *Nodb) FlushAll() error { + for index, db := range l.dbs { + if _, err := db.FlushAll(); err != nil { + log.Error("flush db %d error %s", index, err.Error()) + } + } + + return nil +} + +// very dangerous to use +func (l *Nodb) DataDB() *store.DB { + return l.ldb +} + +func (l *Nodb) activeExpireCycle() { + var executors []*elimination = make([]*elimination, len(l.dbs)) + for i, db := range l.dbs { + executors[i] = db.newEliminator() + } + + l.jobs.Add(1) + go func() { + tick := time.NewTicker(1 * time.Second) + end := false + done := make(chan struct{}) + for !end { + select { + case <-tick.C: + go func() { + for _, eli := range executors { + eli.active() + } + done <- struct{}{} + }() + <-done + case <-l.quit: + end = true + break + } + } + + tick.Stop() + l.jobs.Done() + }() +} diff --git a/vendor/github.com/lunny/nodb/nodb_db.go b/vendor/github.com/lunny/nodb/nodb_db.go new file mode 100644 index 000000000000..f68ebaa0d464 --- /dev/null +++ b/vendor/github.com/lunny/nodb/nodb_db.go @@ -0,0 +1,171 @@ +package nodb + +import ( + "fmt" + "sync" + + "github.com/lunny/nodb/store" +) + +type ibucket interface { + Get(key []byte) ([]byte, error) + + Put(key []byte, value []byte) error + Delete(key []byte) error + + NewIterator() *store.Iterator + + NewWriteBatch() store.WriteBatch + + RangeIterator(min []byte, max []byte, rangeType uint8) *store.RangeLimitIterator + RevRangeIterator(min []byte, max []byte, rangeType uint8) *store.RangeLimitIterator + RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *store.RangeLimitIterator + RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *store.RangeLimitIterator +} + +type DB struct { + l *Nodb + + sdb *store.DB + + bucket ibucket + + index uint8 + + kvBatch *batch + listBatch *batch + hashBatch *batch + zsetBatch *batch + binBatch *batch + setBatch *batch + + status uint8 +} + +func (l *Nodb) newDB(index uint8) *DB { + d := new(DB) + + d.l = l + + d.sdb = l.ldb + + d.bucket = d.sdb + + d.status = DBAutoCommit + d.index = index + + d.kvBatch = d.newBatch() + d.listBatch = d.newBatch() + d.hashBatch = d.newBatch() + d.zsetBatch = d.newBatch() + d.binBatch = d.newBatch() + d.setBatch = d.newBatch() + + return d +} + +func (db *DB) newBatch() *batch { + return db.l.newBatch(db.bucket.NewWriteBatch(), &dbBatchLocker{l: &sync.Mutex{}, wrLock: &db.l.wLock}, nil) +} + +func (db *DB) Index() int { + return int(db.index) +} + +func (db *DB) IsAutoCommit() bool { + return db.status == DBAutoCommit +} + +func (db *DB) FlushAll() (drop int64, err error) { + all := [...](func() (int64, error)){ + db.flush, + db.lFlush, + db.hFlush, + db.zFlush, + db.bFlush, + db.sFlush} + + for _, flush := range all { + if n, e := flush(); e != nil { + err = e 
+ return + } else { + drop += n + } + } + + return +} + +func (db *DB) newEliminator() *elimination { + eliminator := newEliminator(db) + + eliminator.regRetireContext(KVType, db.kvBatch, db.delete) + eliminator.regRetireContext(ListType, db.listBatch, db.lDelete) + eliminator.regRetireContext(HashType, db.hashBatch, db.hDelete) + eliminator.regRetireContext(ZSetType, db.zsetBatch, db.zDelete) + eliminator.regRetireContext(BitType, db.binBatch, db.bDelete) + eliminator.regRetireContext(SetType, db.setBatch, db.sDelete) + + return eliminator +} + +func (db *DB) flushRegion(t *batch, minKey []byte, maxKey []byte) (drop int64, err error) { + it := db.bucket.RangeIterator(minKey, maxKey, store.RangeROpen) + for ; it.Valid(); it.Next() { + t.Delete(it.RawKey()) + drop++ + if drop&1023 == 0 { + if err = t.Commit(); err != nil { + return + } + } + } + it.Close() + return +} + +func (db *DB) flushType(t *batch, dataType byte) (drop int64, err error) { + var deleteFunc func(t *batch, key []byte) int64 + var metaDataType byte + switch dataType { + case KVType: + deleteFunc = db.delete + metaDataType = KVType + case ListType: + deleteFunc = db.lDelete + metaDataType = LMetaType + case HashType: + deleteFunc = db.hDelete + metaDataType = HSizeType + case ZSetType: + deleteFunc = db.zDelete + metaDataType = ZSizeType + case BitType: + deleteFunc = db.bDelete + metaDataType = BitMetaType + case SetType: + deleteFunc = db.sDelete + metaDataType = SSizeType + default: + return 0, fmt.Errorf("invalid data type: %s", TypeName[dataType]) + } + + var keys [][]byte + keys, err = db.scan(metaDataType, nil, 1024, false, "") + for len(keys) != 0 || err != nil { + for _, key := range keys { + deleteFunc(t, key) + db.rmExpire(t, dataType, key) + + } + + if err = t.Commit(); err != nil { + return + } else { + drop += int64(len(keys)) + } + keys, err = db.scan(metaDataType, nil, 1024, false, "") + } + return +} diff --git a/vendor/github.com/lunny/nodb/replication.go b/vendor/github.com/lunny/nodb/replication.go new file mode 100644 index 000000000000..f9bc95108509 --- /dev/null +++ b/vendor/github.com/lunny/nodb/replication.go @@ -0,0 +1,312 @@ +package nodb + +import ( + "bufio" + "bytes" + "errors" + "io" + "os" + "time" + + "github.com/lunny/log" + "github.com/lunny/nodb/store/driver" +) + +const ( + maxReplBatchNum = 100 + maxReplLogSize = 1 * 1024 * 1024 +) + +var ( + ErrSkipEvent = errors.New("skip to next event") +) + +var ( + errInvalidBinLogEvent = errors.New("invalid binglog event") + errInvalidBinLogFile = errors.New("invalid binlog file") +) + +type replBatch struct { + wb driver.IWriteBatch + events [][]byte + l *Nodb + + lastHead *BinLogHead +} + +func (b *replBatch) Commit() error { + b.l.commitLock.Lock() + defer b.l.commitLock.Unlock() + + err := b.wb.Commit() + if err != nil { + b.Rollback() + return err + } + + if b.l.binlog != nil { + if err = b.l.binlog.Log(b.events...); err != nil { + b.Rollback() + return err + } + } + + b.events = [][]byte{} + b.lastHead = nil + + return nil +} + +func (b *replBatch) Rollback() error { + b.wb.Rollback() + b.events = [][]byte{} + b.lastHead = nil + return nil +} + +func (l *Nodb) replicateEvent(b *replBatch, event []byte) error { + if len(event) == 0 { + return errInvalidBinLogEvent + } + + b.events = append(b.events, event) + + logType := uint8(event[0]) + switch logType { + case BinLogTypePut: + return l.replicatePutEvent(b, event) + case BinLogTypeDeletion: + return l.replicateDeleteEvent(b, event) + default: + return errInvalidBinLogEvent + } +} + +func (l 
*Nodb) replicatePutEvent(b *replBatch, event []byte) error { + key, value, err := decodeBinLogPut(event) + if err != nil { + return err + } + + b.wb.Put(key, value) + + return nil +} + +func (l *Nodb) replicateDeleteEvent(b *replBatch, event []byte) error { + key, err := decodeBinLogDelete(event) + if err != nil { + return err + } + + b.wb.Delete(key) + + return nil +} + +func ReadEventFromReader(rb io.Reader, f func(head *BinLogHead, event []byte) error) error { + head := &BinLogHead{} + var err error + + for { + if err = head.Read(rb); err != nil { + if err == io.EOF { + break + } else { + return err + } + } + + var dataBuf bytes.Buffer + + if _, err = io.CopyN(&dataBuf, rb, int64(head.PayloadLen)); err != nil { + return err + } + + err = f(head, dataBuf.Bytes()) + if err != nil && err != ErrSkipEvent { + return err + } + } + + return nil +} + +func (l *Nodb) ReplicateFromReader(rb io.Reader) error { + b := new(replBatch) + + b.wb = l.ldb.NewWriteBatch() + b.l = l + + f := func(head *BinLogHead, event []byte) error { + if b.lastHead == nil { + b.lastHead = head + } else if !b.lastHead.InSameBatch(head) { + if err := b.Commit(); err != nil { + log.Fatal("replication error %s, skip to next", err.Error()) + return ErrSkipEvent + } + b.lastHead = head + } + + err := l.replicateEvent(b, event) + if err != nil { + log.Fatal("replication error %s, skip to next", err.Error()) + return ErrSkipEvent + } + return nil + } + + err := ReadEventFromReader(rb, f) + if err != nil { + b.Rollback() + return err + } + return b.Commit() +} + +func (l *Nodb) ReplicateFromData(data []byte) error { + rb := bytes.NewReader(data) + + err := l.ReplicateFromReader(rb) + + return err +} + +func (l *Nodb) ReplicateFromBinLog(filePath string) error { + f, err := os.Open(filePath) + if err != nil { + return err + } + + rb := bufio.NewReaderSize(f, 4096) + + err = l.ReplicateFromReader(rb) + + f.Close() + + return err +} + +// try to read events, if no events read, try to wait the new event singal until timeout seconds +func (l *Nodb) ReadEventsToTimeout(info *BinLogAnchor, w io.Writer, timeout int) (n int, err error) { + lastIndex := info.LogFileIndex + lastPos := info.LogPos + + n = 0 + if l.binlog == nil { + //binlog not supported + info.LogFileIndex = 0 + info.LogPos = 0 + return + } + + n, err = l.ReadEventsTo(info, w) + if err == nil && info.LogFileIndex == lastIndex && info.LogPos == lastPos { + //no events read + select { + case <-l.binlog.Wait(): + case <-time.After(time.Duration(timeout) * time.Second): + } + return l.ReadEventsTo(info, w) + } + return +} + +func (l *Nodb) ReadEventsTo(info *BinLogAnchor, w io.Writer) (n int, err error) { + n = 0 + if l.binlog == nil { + //binlog not supported + info.LogFileIndex = 0 + info.LogPos = 0 + return + } + + index := info.LogFileIndex + offset := info.LogPos + + filePath := l.binlog.FormatLogFilePath(index) + + var f *os.File + f, err = os.Open(filePath) + if os.IsNotExist(err) { + lastIndex := l.binlog.LogFileIndex() + + if index == lastIndex { + //no binlog at all + info.LogPos = 0 + } else { + //slave binlog info had lost + info.LogFileIndex = -1 + } + } + + if err != nil { + if os.IsNotExist(err) { + err = nil + } + return + } + + defer f.Close() + + var fileSize int64 + st, _ := f.Stat() + fileSize = st.Size() + + if fileSize == info.LogPos { + return + } + + if _, err = f.Seek(offset, os.SEEK_SET); err != nil { + //may be invliad seek offset + return + } + + var lastHead *BinLogHead = nil + + head := &BinLogHead{} + + batchNum := 0 + + for { + if err = 
head.Read(f); err != nil { + if err == io.EOF { + //we will try to use next binlog + if index < l.binlog.LogFileIndex() { + info.LogFileIndex += 1 + info.LogPos = 0 + } + err = nil + return + } else { + return + } + + } + + if lastHead == nil { + lastHead = head + batchNum++ + } else if !lastHead.InSameBatch(head) { + lastHead = head + batchNum++ + if batchNum > maxReplBatchNum || n > maxReplLogSize { + return + } + } + + if err = head.Write(w); err != nil { + return + } + + if _, err = io.CopyN(w, f, int64(head.PayloadLen)); err != nil { + return + } + + n += (head.Len() + int(head.PayloadLen)) + info.LogPos = info.LogPos + int64(head.Len()) + int64(head.PayloadLen) + } + + return +} diff --git a/vendor/github.com/lunny/nodb/scan.go b/vendor/github.com/lunny/nodb/scan.go new file mode 100644 index 000000000000..e989db3fed72 --- /dev/null +++ b/vendor/github.com/lunny/nodb/scan.go @@ -0,0 +1,144 @@ +package nodb + +import ( + "bytes" + "errors" + "regexp" + + "github.com/lunny/nodb/store" +) + +var errDataType = errors.New("error data type") +var errMetaKey = errors.New("error meta key") + +// Seek search the prefix key +func (db *DB) Seek(key []byte) (*store.Iterator, error) { + return db.seek(KVType, key) +} + +func (db *DB) seek(dataType byte, key []byte) (*store.Iterator, error) { + var minKey []byte + var err error + + if len(key) > 0 { + if err = checkKeySize(key); err != nil { + return nil, err + } + if minKey, err = db.encodeMetaKey(dataType, key); err != nil { + return nil, err + } + + } else { + if minKey, err = db.encodeMinKey(dataType); err != nil { + return nil, err + } + } + + it := db.bucket.NewIterator() + it.Seek(minKey) + return it, nil +} + +func (db *DB) MaxKey() ([]byte, error) { + return db.encodeMaxKey(KVType) +} + +func (db *DB) Key(it *store.Iterator) ([]byte, error) { + return db.decodeMetaKey(KVType, it.Key()) +} + +func (db *DB) scan(dataType byte, key []byte, count int, inclusive bool, match string) ([][]byte, error) { + var minKey, maxKey []byte + var err error + var r *regexp.Regexp + + if len(match) > 0 { + if r, err = regexp.Compile(match); err != nil { + return nil, err + } + } + + if len(key) > 0 { + if err = checkKeySize(key); err != nil { + return nil, err + } + if minKey, err = db.encodeMetaKey(dataType, key); err != nil { + return nil, err + } + + } else { + if minKey, err = db.encodeMinKey(dataType); err != nil { + return nil, err + } + } + + if maxKey, err = db.encodeMaxKey(dataType); err != nil { + return nil, err + } + + if count <= 0 { + count = defaultScanCount + } + + v := make([][]byte, 0, count) + + it := db.bucket.NewIterator() + it.Seek(minKey) + + if !inclusive { + if it.Valid() && bytes.Equal(it.RawKey(), minKey) { + it.Next() + } + } + + for i := 0; it.Valid() && i < count && bytes.Compare(it.RawKey(), maxKey) < 0; it.Next() { + if k, err := db.decodeMetaKey(dataType, it.Key()); err != nil { + continue + } else if r != nil && !r.Match(k) { + continue + } else { + v = append(v, k) + i++ + } + } + it.Close() + return v, nil +} + +func (db *DB) encodeMinKey(dataType byte) ([]byte, error) { + return db.encodeMetaKey(dataType, nil) +} + +func (db *DB) encodeMaxKey(dataType byte) ([]byte, error) { + k, err := db.encodeMetaKey(dataType, nil) + if err != nil { + return nil, err + } + k[len(k)-1] = dataType + 1 + return k, nil +} + +func (db *DB) encodeMetaKey(dataType byte, key []byte) ([]byte, error) { + switch dataType { + case KVType: + return db.encodeKVKey(key), nil + case LMetaType: + return db.lEncodeMetaKey(key), nil + case HSizeType: + 
return db.hEncodeSizeKey(key), nil + case ZSizeType: + return db.zEncodeSizeKey(key), nil + case BitMetaType: + return db.bEncodeMetaKey(key), nil + case SSizeType: + return db.sEncodeSizeKey(key), nil + default: + return nil, errDataType + } +} +func (db *DB) decodeMetaKey(dataType byte, ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != dataType { + return nil, errMetaKey + } + return ek[2:], nil +} diff --git a/vendor/github.com/lunny/nodb/store/db.go b/vendor/github.com/lunny/nodb/store/db.go new file mode 100644 index 000000000000..00a8831a67ea --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/db.go @@ -0,0 +1,61 @@ +package store + +import ( + "github.com/lunny/nodb/store/driver" +) + +type DB struct { + driver.IDB +} + +func (db *DB) NewIterator() *Iterator { + it := new(Iterator) + it.it = db.IDB.NewIterator() + + return it +} + +func (db *DB) NewWriteBatch() WriteBatch { + return db.IDB.NewWriteBatch() +} + +func (db *DB) NewSnapshot() (*Snapshot, error) { + var err error + s := &Snapshot{} + if s.ISnapshot, err = db.IDB.NewSnapshot(); err != nil { + return nil, err + } + + return s, nil +} + +func (db *DB) RangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { + return NewRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) +} + +func (db *DB) RevRangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { + return NewRevRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) +} + +//count < 0, unlimit. +// +//offset must >= 0, if < 0, will get nothing. +func (db *DB) RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { + return NewRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) +} + +//count < 0, unlimit. +// +//offset must >= 0, if < 0, will get nothing. 
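+//
+// For example (illustrative; assumes a *DB value db and byte-slice bounds min and max):
+//
+//	it := db.RevRangeLimitIterator(min, max, RangeClose, 0, 10)
+//	for ; it.Valid(); it.Next() {
+//		// it.Key()/it.Value() return copies; RawKey()/RawValue() return references
+//	}
+//	it.Close()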
+func (db *DB) RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { + return NewRevRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) +} + +func (db *DB) Begin() (*Tx, error) { + tx, err := db.IDB.Begin() + if err != nil { + return nil, err + } + + return &Tx{tx}, nil +} diff --git a/vendor/github.com/lunny/nodb/store/driver/batch.go b/vendor/github.com/lunny/nodb/store/driver/batch.go new file mode 100644 index 000000000000..6b79c21c48d1 --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/driver/batch.go @@ -0,0 +1,39 @@ +package driver + +type BatchPuter interface { + BatchPut([]Write) error +} + +type Write struct { + Key []byte + Value []byte +} + +type WriteBatch struct { + batch BatchPuter + wb []Write +} + +func (w *WriteBatch) Put(key, value []byte) { + if value == nil { + value = []byte{} + } + w.wb = append(w.wb, Write{key, value}) +} + +func (w *WriteBatch) Delete(key []byte) { + w.wb = append(w.wb, Write{key, nil}) +} + +func (w *WriteBatch) Commit() error { + return w.batch.BatchPut(w.wb) +} + +func (w *WriteBatch) Rollback() error { + w.wb = w.wb[0:0] + return nil +} + +func NewWriteBatch(puter BatchPuter) IWriteBatch { + return &WriteBatch{puter, []Write{}} +} diff --git a/vendor/github.com/lunny/nodb/store/driver/driver.go b/vendor/github.com/lunny/nodb/store/driver/driver.go new file mode 100644 index 000000000000..6da67df08362 --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/driver/driver.go @@ -0,0 +1,67 @@ +package driver + +import ( + "errors" +) + +var ( + ErrTxSupport = errors.New("transaction is not supported") +) + +type IDB interface { + Close() error + + Get(key []byte) ([]byte, error) + + Put(key []byte, value []byte) error + Delete(key []byte) error + + NewIterator() IIterator + + NewWriteBatch() IWriteBatch + + NewSnapshot() (ISnapshot, error) + + Begin() (Tx, error) +} + +type ISnapshot interface { + Get(key []byte) ([]byte, error) + NewIterator() IIterator + Close() +} + +type IIterator interface { + Close() error + + First() + Last() + Seek(key []byte) + + Next() + Prev() + + Valid() bool + + Key() []byte + Value() []byte +} + +type IWriteBatch interface { + Put(key []byte, value []byte) + Delete(key []byte) + Commit() error + Rollback() error +} + +type Tx interface { + Get(key []byte) ([]byte, error) + Put(key []byte, value []byte) error + Delete(key []byte) error + + NewIterator() IIterator + NewWriteBatch() IWriteBatch + + Commit() error + Rollback() error +} diff --git a/vendor/github.com/lunny/nodb/store/driver/store.go b/vendor/github.com/lunny/nodb/store/driver/store.go new file mode 100644 index 000000000000..173431d4c10c --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/driver/store.go @@ -0,0 +1,46 @@ +package driver + +import ( + "fmt" + + "github.com/lunny/nodb/config" +) + +type Store interface { + String() string + Open(path string, cfg *config.Config) (IDB, error) + Repair(path string, cfg *config.Config) error +} + +var dbs = map[string]Store{} + +func Register(s Store) { + name := s.String() + if _, ok := dbs[name]; ok { + panic(fmt.Errorf("store %s is registered", s)) + } + + dbs[name] = s +} + +func ListStores() []string { + s := []string{} + for k, _ := range dbs { + s = append(s, k) + } + + return s +} + +func GetStore(cfg *config.Config) (Store, error) { + if len(cfg.DBName) == 0 { + cfg.DBName = config.DefaultDBName + } + + s, ok := dbs[cfg.DBName] + if !ok { + return nil, fmt.Errorf("store %s is not registered", cfg.DBName) + } 
+ + return s, nil +} diff --git a/vendor/github.com/lunny/nodb/store/goleveldb/batch.go b/vendor/github.com/lunny/nodb/store/goleveldb/batch.go new file mode 100644 index 000000000000..b17e85e750d7 --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/goleveldb/batch.go @@ -0,0 +1,27 @@ +package goleveldb + +import ( + "github.com/syndtr/goleveldb/leveldb" +) + +type WriteBatch struct { + db *DB + wbatch *leveldb.Batch +} + +func (w *WriteBatch) Put(key, value []byte) { + w.wbatch.Put(key, value) +} + +func (w *WriteBatch) Delete(key []byte) { + w.wbatch.Delete(key) +} + +func (w *WriteBatch) Commit() error { + return w.db.db.Write(w.wbatch, nil) +} + +func (w *WriteBatch) Rollback() error { + w.wbatch.Reset() + return nil +} diff --git a/vendor/github.com/lunny/nodb/store/goleveldb/const.go b/vendor/github.com/lunny/nodb/store/goleveldb/const.go new file mode 100644 index 000000000000..2fffa7c82bb4 --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/goleveldb/const.go @@ -0,0 +1,4 @@ +package goleveldb + +const DBName = "goleveldb" +const MemDBName = "memory" diff --git a/vendor/github.com/lunny/nodb/store/goleveldb/db.go b/vendor/github.com/lunny/nodb/store/goleveldb/db.go new file mode 100644 index 000000000000..a36e87f62858 --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/goleveldb/db.go @@ -0,0 +1,187 @@ +package goleveldb + +import ( + "github.com/syndtr/goleveldb/leveldb" + "github.com/syndtr/goleveldb/leveldb/cache" + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + + "github.com/lunny/nodb/config" + "github.com/lunny/nodb/store/driver" + + "os" +) + +const defaultFilterBits int = 10 + +type Store struct { +} + +func (s Store) String() string { + return DBName +} + +type MemStore struct { +} + +func (s MemStore) String() string { + return MemDBName +} + +type DB struct { + path string + + cfg *config.LevelDBConfig + + db *leveldb.DB + + opts *opt.Options + + iteratorOpts *opt.ReadOptions + + cache cache.Cache + + filter filter.Filter +} + +func (s Store) Open(path string, cfg *config.Config) (driver.IDB, error) { + if err := os.MkdirAll(path, os.ModePerm); err != nil { + return nil, err + } + + db := new(DB) + db.path = path + db.cfg = &cfg.LevelDB + + db.initOpts() + + var err error + db.db, err = leveldb.OpenFile(db.path, db.opts) + + if err != nil { + return nil, err + } + + return db, nil +} + +func (s Store) Repair(path string, cfg *config.Config) error { + db, err := leveldb.RecoverFile(path, newOptions(&cfg.LevelDB)) + if err != nil { + return err + } + + db.Close() + return nil +} + +func (s MemStore) Open(path string, cfg *config.Config) (driver.IDB, error) { + db := new(DB) + db.path = path + db.cfg = &cfg.LevelDB + + db.initOpts() + + var err error + db.db, err = leveldb.Open(storage.NewMemStorage(), db.opts) + if err != nil { + return nil, err + } + + return db, nil +} + +func (s MemStore) Repair(path string, cfg *config.Config) error { + return nil +} + +func (db *DB) initOpts() { + db.opts = newOptions(db.cfg) + + db.iteratorOpts = &opt.ReadOptions{} + db.iteratorOpts.DontFillCache = true +} + +func newOptions(cfg *config.LevelDBConfig) *opt.Options { + opts := &opt.Options{} + opts.ErrorIfMissing = false + + cfg.Adjust() + + //opts.BlockCacher = cache.NewLRU(cfg.CacheSize) + opts.BlockCacheCapacity = cfg.CacheSize + + //we must use bloomfilter + opts.Filter = filter.NewBloomFilter(defaultFilterBits) + + if !cfg.Compression { + opts.Compression = opt.NoCompression + } 
else { + opts.Compression = opt.SnappyCompression + } + + opts.BlockSize = cfg.BlockSize + opts.WriteBuffer = cfg.WriteBufferSize + + return opts +} + +func (db *DB) Close() error { + return db.db.Close() +} + +func (db *DB) Put(key, value []byte) error { + return db.db.Put(key, value, nil) +} + +func (db *DB) Get(key []byte) ([]byte, error) { + v, err := db.db.Get(key, nil) + if err == leveldb.ErrNotFound { + return nil, nil + } + return v, nil +} + +func (db *DB) Delete(key []byte) error { + return db.db.Delete(key, nil) +} + +func (db *DB) NewWriteBatch() driver.IWriteBatch { + wb := &WriteBatch{ + db: db, + wbatch: new(leveldb.Batch), + } + return wb +} + +func (db *DB) NewIterator() driver.IIterator { + it := &Iterator{ + db.db.NewIterator(nil, db.iteratorOpts), + } + + return it +} + +func (db *DB) Begin() (driver.Tx, error) { + return nil, driver.ErrTxSupport +} + +func (db *DB) NewSnapshot() (driver.ISnapshot, error) { + snapshot, err := db.db.GetSnapshot() + if err != nil { + return nil, err + } + + s := &Snapshot{ + db: db, + snp: snapshot, + } + + return s, nil +} + +func init() { + driver.Register(Store{}) + driver.Register(MemStore{}) +} diff --git a/vendor/github.com/lunny/nodb/store/goleveldb/iterator.go b/vendor/github.com/lunny/nodb/store/goleveldb/iterator.go new file mode 100644 index 000000000000..c1fd8b5573bb --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/goleveldb/iterator.go @@ -0,0 +1,49 @@ +package goleveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/iterator" +) + +type Iterator struct { + it iterator.Iterator +} + +func (it *Iterator) Key() []byte { + return it.it.Key() +} + +func (it *Iterator) Value() []byte { + return it.it.Value() +} + +func (it *Iterator) Close() error { + if it.it != nil { + it.it.Release() + it.it = nil + } + return nil +} + +func (it *Iterator) Valid() bool { + return it.it.Valid() +} + +func (it *Iterator) Next() { + it.it.Next() +} + +func (it *Iterator) Prev() { + it.it.Prev() +} + +func (it *Iterator) First() { + it.it.First() +} + +func (it *Iterator) Last() { + it.it.Last() +} + +func (it *Iterator) Seek(key []byte) { + it.it.Seek(key) +} diff --git a/vendor/github.com/lunny/nodb/store/goleveldb/snapshot.go b/vendor/github.com/lunny/nodb/store/goleveldb/snapshot.go new file mode 100644 index 000000000000..fe2b409c3f71 --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/goleveldb/snapshot.go @@ -0,0 +1,26 @@ +package goleveldb + +import ( + "github.com/lunny/nodb/store/driver" + "github.com/syndtr/goleveldb/leveldb" +) + +type Snapshot struct { + db *DB + snp *leveldb.Snapshot +} + +func (s *Snapshot) Get(key []byte) ([]byte, error) { + return s.snp.Get(key, s.db.iteratorOpts) +} + +func (s *Snapshot) NewIterator() driver.IIterator { + it := &Iterator{ + s.snp.NewIterator(nil, s.db.iteratorOpts), + } + return it +} + +func (s *Snapshot) Close() { + s.snp.Release() +} diff --git a/vendor/github.com/lunny/nodb/store/iterator.go b/vendor/github.com/lunny/nodb/store/iterator.go new file mode 100644 index 000000000000..27bf689da2fe --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/iterator.go @@ -0,0 +1,327 @@ +package store + +import ( + "bytes" + + "github.com/lunny/nodb/store/driver" +) + +const ( + IteratorForward uint8 = 0 + IteratorBackward uint8 = 1 +) + +const ( + RangeClose uint8 = 0x00 + RangeLOpen uint8 = 0x01 + RangeROpen uint8 = 0x10 + RangeOpen uint8 = 0x11 +) + +// min must less or equal than max +// +// range type: +// +// close: [min, max] +// open: (min, max) +// lopen: (min, max] +// ropen: [min, 
max) +// +type Range struct { + Min []byte + Max []byte + + Type uint8 +} + +type Limit struct { + Offset int + Count int +} + +type Iterator struct { + it driver.IIterator +} + +// Returns a copy of key. +func (it *Iterator) Key() []byte { + k := it.it.Key() + if k == nil { + return nil + } + + return append([]byte{}, k...) +} + +// Returns a copy of value. +func (it *Iterator) Value() []byte { + v := it.it.Value() + if v == nil { + return nil + } + + return append([]byte{}, v...) +} + +// Returns a reference of key. +// you must be careful that it will be changed after next iterate. +func (it *Iterator) RawKey() []byte { + return it.it.Key() +} + +// Returns a reference of value. +// you must be careful that it will be changed after next iterate. +func (it *Iterator) RawValue() []byte { + return it.it.Value() +} + +// Copy key to b, if b len is small or nil, returns a new one. +func (it *Iterator) BufKey(b []byte) []byte { + k := it.RawKey() + if k == nil { + return nil + } + if b == nil { + b = []byte{} + } + + b = b[0:0] + return append(b, k...) +} + +// Copy value to b, if b len is small or nil, returns a new one. +func (it *Iterator) BufValue(b []byte) []byte { + v := it.RawValue() + if v == nil { + return nil + } + + if b == nil { + b = []byte{} + } + + b = b[0:0] + return append(b, v...) +} + +func (it *Iterator) Close() { + if it.it != nil { + it.it.Close() + it.it = nil + } +} + +func (it *Iterator) Valid() bool { + return it.it.Valid() +} + +func (it *Iterator) Next() { + it.it.Next() +} + +func (it *Iterator) Prev() { + it.it.Prev() +} + +func (it *Iterator) SeekToFirst() { + it.it.First() +} + +func (it *Iterator) SeekToLast() { + it.it.Last() +} + +func (it *Iterator) Seek(key []byte) { + it.it.Seek(key) +} + +// Finds by key, if not found, nil returns. +func (it *Iterator) Find(key []byte) []byte { + it.Seek(key) + if it.Valid() { + k := it.RawKey() + if k == nil { + return nil + } else if bytes.Equal(k, key) { + return it.Value() + } + } + + return nil +} + +// Finds by key, if not found, nil returns, else a reference of value returns. +// you must be careful that it will be changed after next iterate. 
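+//
+// For example (illustrative; assumes an *Iterator value it and a []byte key):
+//
+//	if v := it.RawFind(key); v != nil {
+//		v = append([]byte{}, v...) // copy if the value must outlive the next Seek/Next
+//	}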
+func (it *Iterator) RawFind(key []byte) []byte { + it.Seek(key) + if it.Valid() { + k := it.RawKey() + if k == nil { + return nil + } else if bytes.Equal(k, key) { + return it.RawValue() + } + } + + return nil +} + +type RangeLimitIterator struct { + it *Iterator + + r *Range + l *Limit + + step int + + //0 for IteratorForward, 1 for IteratorBackward + direction uint8 +} + +func (it *RangeLimitIterator) Key() []byte { + return it.it.Key() +} + +func (it *RangeLimitIterator) Value() []byte { + return it.it.Value() +} + +func (it *RangeLimitIterator) RawKey() []byte { + return it.it.RawKey() +} + +func (it *RangeLimitIterator) RawValue() []byte { + return it.it.RawValue() +} + +func (it *RangeLimitIterator) BufKey(b []byte) []byte { + return it.it.BufKey(b) +} + +func (it *RangeLimitIterator) BufValue(b []byte) []byte { + return it.it.BufValue(b) +} + +func (it *RangeLimitIterator) Valid() bool { + if it.l.Offset < 0 { + return false + } else if !it.it.Valid() { + return false + } else if it.l.Count >= 0 && it.step >= it.l.Count { + return false + } + + if it.direction == IteratorForward { + if it.r.Max != nil { + r := bytes.Compare(it.it.RawKey(), it.r.Max) + if it.r.Type&RangeROpen > 0 { + return !(r >= 0) + } else { + return !(r > 0) + } + } + } else { + if it.r.Min != nil { + r := bytes.Compare(it.it.RawKey(), it.r.Min) + if it.r.Type&RangeLOpen > 0 { + return !(r <= 0) + } else { + return !(r < 0) + } + } + } + + return true +} + +func (it *RangeLimitIterator) Next() { + it.step++ + + if it.direction == IteratorForward { + it.it.Next() + } else { + it.it.Prev() + } +} + +func (it *RangeLimitIterator) Close() { + it.it.Close() +} + +func NewRangeLimitIterator(i *Iterator, r *Range, l *Limit) *RangeLimitIterator { + return rangeLimitIterator(i, r, l, IteratorForward) +} + +func NewRevRangeLimitIterator(i *Iterator, r *Range, l *Limit) *RangeLimitIterator { + return rangeLimitIterator(i, r, l, IteratorBackward) +} + +func NewRangeIterator(i *Iterator, r *Range) *RangeLimitIterator { + return rangeLimitIterator(i, r, &Limit{0, -1}, IteratorForward) +} + +func NewRevRangeIterator(i *Iterator, r *Range) *RangeLimitIterator { + return rangeLimitIterator(i, r, &Limit{0, -1}, IteratorBackward) +} + +func rangeLimitIterator(i *Iterator, r *Range, l *Limit, direction uint8) *RangeLimitIterator { + it := new(RangeLimitIterator) + + it.it = i + + it.r = r + it.l = l + it.direction = direction + + it.step = 0 + + if l.Offset < 0 { + return it + } + + if direction == IteratorForward { + if r.Min == nil { + it.it.SeekToFirst() + } else { + it.it.Seek(r.Min) + + if r.Type&RangeLOpen > 0 { + if it.it.Valid() && bytes.Equal(it.it.RawKey(), r.Min) { + it.it.Next() + } + } + } + } else { + if r.Max == nil { + it.it.SeekToLast() + } else { + it.it.Seek(r.Max) + + if !it.it.Valid() { + it.it.SeekToLast() + } else { + if !bytes.Equal(it.it.RawKey(), r.Max) { + it.it.Prev() + } + } + + if r.Type&RangeROpen > 0 { + if it.it.Valid() && bytes.Equal(it.it.RawKey(), r.Max) { + it.it.Prev() + } + } + } + } + + for i := 0; i < l.Offset; i++ { + if it.it.Valid() { + if it.direction == IteratorForward { + it.it.Next() + } else { + it.it.Prev() + } + } + } + + return it +} diff --git a/vendor/github.com/lunny/nodb/store/snapshot.go b/vendor/github.com/lunny/nodb/store/snapshot.go new file mode 100644 index 000000000000..75ba0497db3e --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/snapshot.go @@ -0,0 +1,16 @@ +package store + +import ( + "github.com/lunny/nodb/store/driver" +) + +type Snapshot struct { + 
driver.ISnapshot +} + +func (s *Snapshot) NewIterator() *Iterator { + it := new(Iterator) + it.it = s.ISnapshot.NewIterator() + + return it +} diff --git a/vendor/github.com/lunny/nodb/store/store.go b/vendor/github.com/lunny/nodb/store/store.go new file mode 100644 index 000000000000..5d0ade1bf0f0 --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/store.go @@ -0,0 +1,51 @@ +package store + +import ( + "fmt" + "os" + "path" + "github.com/lunny/nodb/config" + "github.com/lunny/nodb/store/driver" + + _ "github.com/lunny/nodb/store/goleveldb" +) + +func getStorePath(cfg *config.Config) string { + return path.Join(cfg.DataDir, fmt.Sprintf("%s_data", cfg.DBName)) +} + +func Open(cfg *config.Config) (*DB, error) { + s, err := driver.GetStore(cfg) + if err != nil { + return nil, err + } + + path := getStorePath(cfg) + + if err := os.MkdirAll(path, os.ModePerm); err != nil { + return nil, err + } + + idb, err := s.Open(path, cfg) + if err != nil { + return nil, err + } + + db := &DB{idb} + + return db, nil +} + +func Repair(cfg *config.Config) error { + s, err := driver.GetStore(cfg) + if err != nil { + return err + } + + path := getStorePath(cfg) + + return s.Repair(path, cfg) +} + +func init() { +} diff --git a/vendor/github.com/lunny/nodb/store/tx.go b/vendor/github.com/lunny/nodb/store/tx.go new file mode 100644 index 000000000000..32bcbcda4bae --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/tx.go @@ -0,0 +1,42 @@ +package store + +import ( + "github.com/lunny/nodb/store/driver" +) + +type Tx struct { + driver.Tx +} + +func (tx *Tx) NewIterator() *Iterator { + it := new(Iterator) + it.it = tx.Tx.NewIterator() + + return it +} + +func (tx *Tx) NewWriteBatch() WriteBatch { + return tx.Tx.NewWriteBatch() +} + +func (tx *Tx) RangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { + return NewRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) +} + +func (tx *Tx) RevRangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { + return NewRevRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) +} + +//count < 0, unlimit. +// +//offset must >= 0, if < 0, will get nothing. +func (tx *Tx) RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { + return NewRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) +} + +//count < 0, unlimit. +// +//offset must >= 0, if < 0, will get nothing. 
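+//
+// An illustrative sketch of a transactional range scan (assumes a driver with transaction
+// support; the bundled goleveldb backend returns driver.ErrTxSupport from Begin):
+//
+//	tx, err := db.Begin()
+//	if err != nil {
+//		// e.g. driver.ErrTxSupport
+//	}
+//	it := tx.RangeLimitIterator(min, max, RangeClose, 0, -1)
+//	for ; it.Valid(); it.Next() {
+//		// read it.Key() / it.Value()
+//	}
+//	it.Close()
+//	_ = tx.Commit()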
+func (tx *Tx) RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { + return NewRevRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) +} diff --git a/vendor/github.com/lunny/nodb/store/writebatch.go b/vendor/github.com/lunny/nodb/store/writebatch.go new file mode 100644 index 000000000000..23e079eba61b --- /dev/null +++ b/vendor/github.com/lunny/nodb/store/writebatch.go @@ -0,0 +1,9 @@ +package store + +import ( + "github.com/lunny/nodb/store/driver" +) + +type WriteBatch interface { + driver.IWriteBatch +} diff --git a/vendor/github.com/lunny/nodb/t_bit.go b/vendor/github.com/lunny/nodb/t_bit.go new file mode 100644 index 000000000000..930d4ba5684d --- /dev/null +++ b/vendor/github.com/lunny/nodb/t_bit.go @@ -0,0 +1,922 @@ +package nodb + +import ( + "encoding/binary" + "errors" + "sort" + "time" + + "github.com/lunny/nodb/store" +) + +const ( + OPand uint8 = iota + 1 + OPor + OPxor + OPnot +) + +type BitPair struct { + Pos int32 + Val uint8 +} + +type segBitInfo struct { + Seq uint32 + Off uint32 + Val uint8 +} + +type segBitInfoArray []segBitInfo + +const ( + // byte + segByteWidth uint32 = 9 + segByteSize uint32 = 1 << segByteWidth + + // bit + segBitWidth uint32 = segByteWidth + 3 + segBitSize uint32 = segByteSize << 3 + + maxByteSize uint32 = 8 << 20 + maxSegCount uint32 = maxByteSize / segByteSize + + minSeq uint32 = 0 + maxSeq uint32 = uint32((maxByteSize << 3) - 1) +) + +var bitsInByte = [256]int32{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, + 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, + 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, + 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, + 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, + 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, + 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, + 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, + 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, + 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8} + +var fillBits = [...]uint8{1, 3, 7, 15, 31, 63, 127, 255} + +var emptySegment []byte = make([]byte, segByteSize, segByteSize) + +var fillSegment []byte = func() []byte { + data := make([]byte, segByteSize, segByteSize) + for i := uint32(0); i < segByteSize; i++ { + data[i] = 0xff + } + return data +}() + +var errBinKey = errors.New("invalid bin key") +var errOffset = errors.New("invalid offset") +var errDuplicatePos = errors.New("duplicate bit pos") + +func getBit(sz []byte, offset uint32) uint8 { + index := offset >> 3 + if index >= uint32(len(sz)) { + return 0 // error("overflow") + } + + offset -= index << 3 + return sz[index] >> offset & 1 +} + +func setBit(sz []byte, offset uint32, val uint8) bool { + if val != 1 && val != 0 { + return false // error("invalid val") + } + + index := offset >> 3 + if index >= uint32(len(sz)) { + return false // error("overflow") + } + + offset -= index << 3 + if sz[index]>>offset&1 != val { + sz[index] ^= (1 << offset) + } + return true +} + +func (datas segBitInfoArray) Len() int { + return len(datas) +} + +func (datas segBitInfoArray) Less(i, j int) bool { + res := (datas)[i].Seq < (datas)[j].Seq + if !res && (datas)[i].Seq == (datas)[j].Seq { + res = (datas)[i].Off < 
(datas)[j].Off + } + return res +} + +func (datas segBitInfoArray) Swap(i, j int) { + datas[i], datas[j] = datas[j], datas[i] +} + +func (db *DB) bEncodeMetaKey(key []byte) []byte { + mk := make([]byte, len(key)+2) + mk[0] = db.index + mk[1] = BitMetaType + + copy(mk[2:], key) + return mk +} + +func (db *DB) bDecodeMetaKey(bkey []byte) ([]byte, error) { + if len(bkey) < 2 || bkey[0] != db.index || bkey[1] != BitMetaType { + return nil, errBinKey + } + + return bkey[2:], nil +} + +func (db *DB) bEncodeBinKey(key []byte, seq uint32) []byte { + bk := make([]byte, len(key)+8) + + pos := 0 + bk[pos] = db.index + pos++ + bk[pos] = BitType + pos++ + + binary.BigEndian.PutUint16(bk[pos:], uint16(len(key))) + pos += 2 + + copy(bk[pos:], key) + pos += len(key) + + binary.BigEndian.PutUint32(bk[pos:], seq) + + return bk +} + +func (db *DB) bDecodeBinKey(bkey []byte) (key []byte, seq uint32, err error) { + if len(bkey) < 8 || bkey[0] != db.index { + err = errBinKey + return + } + + keyLen := binary.BigEndian.Uint16(bkey[2:4]) + if int(keyLen+8) != len(bkey) { + err = errBinKey + return + } + + key = bkey[4 : 4+keyLen] + seq = uint32(binary.BigEndian.Uint32(bkey[4+keyLen:])) + return +} + +func (db *DB) bCapByteSize(seq uint32, off uint32) uint32 { + var offByteSize uint32 = (off >> 3) + 1 + if offByteSize > segByteSize { + offByteSize = segByteSize + } + + return seq<= 0 { + offset += int32((uint32(tailSeq)<> segBitWidth + off &= (segBitSize - 1) + return +} + +func (db *DB) bGetMeta(key []byte) (tailSeq int32, tailOff int32, err error) { + var v []byte + + mk := db.bEncodeMetaKey(key) + v, err = db.bucket.Get(mk) + if err != nil { + return + } + + if v != nil { + tailSeq = int32(binary.LittleEndian.Uint32(v[0:4])) + tailOff = int32(binary.LittleEndian.Uint32(v[4:8])) + } else { + tailSeq = -1 + tailOff = -1 + } + return +} + +func (db *DB) bSetMeta(t *batch, key []byte, tailSeq uint32, tailOff uint32) { + ek := db.bEncodeMetaKey(key) + + buf := make([]byte, 8) + binary.LittleEndian.PutUint32(buf[0:4], tailSeq) + binary.LittleEndian.PutUint32(buf[4:8], tailOff) + + t.Put(ek, buf) + return +} + +func (db *DB) bUpdateMeta(t *batch, key []byte, seq uint32, off uint32) (tailSeq uint32, tailOff uint32, err error) { + var tseq, toff int32 + var update bool = false + + if tseq, toff, err = db.bGetMeta(key); err != nil { + return + } else if tseq < 0 { + update = true + } else { + tailSeq = uint32(MaxInt32(tseq, 0)) + tailOff = uint32(MaxInt32(toff, 0)) + update = (seq > tailSeq || (seq == tailSeq && off > tailOff)) + } + + if update { + db.bSetMeta(t, key, seq, off) + tailSeq = seq + tailOff = off + } + return +} + +func (db *DB) bDelete(t *batch, key []byte) (drop int64) { + mk := db.bEncodeMetaKey(key) + t.Delete(mk) + + minKey := db.bEncodeBinKey(key, minSeq) + maxKey := db.bEncodeBinKey(key, maxSeq) + it := db.bucket.RangeIterator(minKey, maxKey, store.RangeClose) + for ; it.Valid(); it.Next() { + t.Delete(it.RawKey()) + drop++ + } + it.Close() + + return drop +} + +func (db *DB) bGetSegment(key []byte, seq uint32) ([]byte, []byte, error) { + bk := db.bEncodeBinKey(key, seq) + segment, err := db.bucket.Get(bk) + if err != nil { + return bk, nil, err + } + return bk, segment, nil +} + +func (db *DB) bAllocateSegment(key []byte, seq uint32) ([]byte, []byte, error) { + bk, segment, err := db.bGetSegment(key, seq) + if err == nil && segment == nil { + segment = make([]byte, segByteSize, segByteSize) + } + return bk, segment, err +} + +func (db *DB) bIterator(key []byte) *store.RangeLimitIterator { + sk := 
db.bEncodeBinKey(key, minSeq) + ek := db.bEncodeBinKey(key, maxSeq) + return db.bucket.RangeIterator(sk, ek, store.RangeClose) +} + +func (db *DB) bSegAnd(a []byte, b []byte, res *[]byte) { + if a == nil || b == nil { + *res = nil + return + } + + data := *res + if data == nil { + data = make([]byte, segByteSize, segByteSize) + *res = data + } + + for i := uint32(0); i < segByteSize; i++ { + data[i] = a[i] & b[i] + } + return +} + +func (db *DB) bSegOr(a []byte, b []byte, res *[]byte) { + if a == nil || b == nil { + if a == nil && b == nil { + *res = nil + } else if a == nil { + *res = b + } else { + *res = a + } + return + } + + data := *res + if data == nil { + data = make([]byte, segByteSize, segByteSize) + *res = data + } + + for i := uint32(0); i < segByteSize; i++ { + data[i] = a[i] | b[i] + } + return +} + +func (db *DB) bSegXor(a []byte, b []byte, res *[]byte) { + if a == nil && b == nil { + *res = fillSegment + return + } + + if a == nil { + a = emptySegment + } + + if b == nil { + b = emptySegment + } + + data := *res + if data == nil { + data = make([]byte, segByteSize, segByteSize) + *res = data + } + + for i := uint32(0); i < segByteSize; i++ { + data[i] = a[i] ^ b[i] + } + + return +} + +func (db *DB) bExpireAt(key []byte, when int64) (int64, error) { + t := db.binBatch + t.Lock() + defer t.Unlock() + + if seq, _, err := db.bGetMeta(key); err != nil || seq < 0 { + return 0, err + } else { + db.expireAt(t, BitType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) bCountByte(val byte, soff uint32, eoff uint32) int32 { + if soff > eoff { + soff, eoff = eoff, soff + } + + mask := uint8(0) + if soff > 0 { + mask |= fillBits[soff-1] + } + if eoff < 7 { + mask |= (fillBits[7] ^ fillBits[eoff]) + } + mask = fillBits[7] ^ mask + + return bitsInByte[val&mask] +} + +func (db *DB) bCountSeg(key []byte, seq uint32, soff uint32, eoff uint32) (cnt int32, err error) { + if soff >= segBitSize || soff < 0 || + eoff >= segBitSize || eoff < 0 { + return + } + + var segment []byte + if _, segment, err = db.bGetSegment(key, seq); err != nil { + return + } + + if segment == nil { + return + } + + if soff > eoff { + soff, eoff = eoff, soff + } + + headIdx := int(soff >> 3) + endIdx := int(eoff >> 3) + sByteOff := soff - ((soff >> 3) << 3) + eByteOff := eoff - ((eoff >> 3) << 3) + + if headIdx == endIdx { + cnt = db.bCountByte(segment[headIdx], sByteOff, eByteOff) + } else { + cnt = db.bCountByte(segment[headIdx], sByteOff, 7) + + db.bCountByte(segment[endIdx], 0, eByteOff) + } + + // sum up following bytes + for idx, end := headIdx+1, endIdx-1; idx <= end; idx += 1 { + cnt += bitsInByte[segment[idx]] + if idx == end { + break + } + } + + return +} + +func (db *DB) BGet(key []byte) (data []byte, err error) { + if err = checkKeySize(key); err != nil { + return + } + + var ts, to int32 + if ts, to, err = db.bGetMeta(key); err != nil || ts < 0 { + return + } + + var tailSeq, tailOff = uint32(ts), uint32(to) + var capByteSize uint32 = db.bCapByteSize(tailSeq, tailOff) + data = make([]byte, capByteSize, capByteSize) + + minKey := db.bEncodeBinKey(key, minSeq) + maxKey := db.bEncodeBinKey(key, tailSeq) + it := db.bucket.RangeIterator(minKey, maxKey, store.RangeClose) + + var seq, s, e uint32 + for ; it.Valid(); it.Next() { + if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { + data = nil + break + } + + s = seq << segByteWidth + e = MinUInt32(s+segByteSize, capByteSize) + copy(data[s:e], it.RawValue()) + } + it.Close() + + return +} + 
+func (db *DB) BDelete(key []byte) (drop int64, err error) { + if err = checkKeySize(key); err != nil { + return + } + + t := db.binBatch + t.Lock() + defer t.Unlock() + + drop = db.bDelete(t, key) + db.rmExpire(t, BitType, key) + + err = t.Commit() + return +} + +func (db *DB) BSetBit(key []byte, offset int32, val uint8) (ori uint8, err error) { + if err = checkKeySize(key); err != nil { + return + } + + // todo : check offset + var seq, off uint32 + if seq, off, err = db.bParseOffset(key, offset); err != nil { + return 0, err + } + + var bk, segment []byte + if bk, segment, err = db.bAllocateSegment(key, seq); err != nil { + return 0, err + } + + if segment != nil { + ori = getBit(segment, off) + if setBit(segment, off, val) { + t := db.binBatch + t.Lock() + defer t.Unlock() + + t.Put(bk, segment) + if _, _, e := db.bUpdateMeta(t, key, seq, off); e != nil { + err = e + return + } + + err = t.Commit() + } + } + + return +} + +func (db *DB) BMSetBit(key []byte, args ...BitPair) (place int64, err error) { + if err = checkKeySize(key); err != nil { + return + } + + // (ps : so as to aviod wasting memory copy while calling db.Get() and batch.Put(), + // here we sequence the params by pos, so that we can merge the execution of + // diff pos setting which targets on the same segment respectively. ) + + // #1 : sequence request data + var argCnt = len(args) + var bitInfos segBitInfoArray = make(segBitInfoArray, argCnt) + var seq, off uint32 + + for i, info := range args { + if seq, off, err = db.bParseOffset(key, info.Pos); err != nil { + return + } + + bitInfos[i].Seq = seq + bitInfos[i].Off = off + bitInfos[i].Val = info.Val + } + + sort.Sort(bitInfos) + + for i := 1; i < argCnt; i++ { + if bitInfos[i].Seq == bitInfos[i-1].Seq && bitInfos[i].Off == bitInfos[i-1].Off { + return 0, errDuplicatePos + } + } + + // #2 : execute bit set in order + t := db.binBatch + t.Lock() + defer t.Unlock() + + var curBinKey, curSeg []byte + var curSeq, maxSeq, maxOff uint32 + + for _, info := range bitInfos { + if curSeg != nil && info.Seq != curSeq { + t.Put(curBinKey, curSeg) + curSeg = nil + } + + if curSeg == nil { + curSeq = info.Seq + if curBinKey, curSeg, err = db.bAllocateSegment(key, info.Seq); err != nil { + return + } + + if curSeg == nil { + continue + } + } + + if setBit(curSeg, info.Off, info.Val) { + maxSeq = info.Seq + maxOff = info.Off + place++ + } + } + + if curSeg != nil { + t.Put(curBinKey, curSeg) + } + + // finally, update meta + if place > 0 { + if _, _, err = db.bUpdateMeta(t, key, maxSeq, maxOff); err != nil { + return + } + + err = t.Commit() + } + + return +} + +func (db *DB) BGetBit(key []byte, offset int32) (uint8, error) { + if seq, off, err := db.bParseOffset(key, offset); err != nil { + return 0, err + } else { + _, segment, err := db.bGetSegment(key, seq) + if err != nil { + return 0, err + } + + if segment == nil { + return 0, nil + } else { + return getBit(segment, off), nil + } + } +} + +// func (db *DB) BGetRange(key []byte, start int32, end int32) ([]byte, error) { +// section := make([]byte) + +// return +// } + +func (db *DB) BCount(key []byte, start int32, end int32) (cnt int32, err error) { + var sseq, soff uint32 + if sseq, soff, err = db.bParseOffset(key, start); err != nil { + return + } + + var eseq, eoff uint32 + if eseq, eoff, err = db.bParseOffset(key, end); err != nil { + return + } + + if sseq > eseq || (sseq == eseq && soff > eoff) { + sseq, eseq = eseq, sseq + soff, eoff = eoff, soff + } + + var segCnt int32 + if eseq == sseq { + if segCnt, err = 
db.bCountSeg(key, sseq, soff, eoff); err != nil { + return 0, err + } + + cnt = segCnt + + } else { + if segCnt, err = db.bCountSeg(key, sseq, soff, segBitSize-1); err != nil { + return 0, err + } else { + cnt += segCnt + } + + if segCnt, err = db.bCountSeg(key, eseq, 0, eoff); err != nil { + return 0, err + } else { + cnt += segCnt + } + } + + // middle segs + var segment []byte + skey := db.bEncodeBinKey(key, sseq) + ekey := db.bEncodeBinKey(key, eseq) + + it := db.bucket.RangeIterator(skey, ekey, store.RangeOpen) + for ; it.Valid(); it.Next() { + segment = it.RawValue() + for _, bt := range segment { + cnt += bitsInByte[bt] + } + } + it.Close() + + return +} + +func (db *DB) BTail(key []byte) (int32, error) { + // effective length of data, the highest bit-pos set in history + tailSeq, tailOff, err := db.bGetMeta(key) + if err != nil { + return 0, err + } + + tail := int32(-1) + if tailSeq >= 0 { + tail = int32(uint32(tailSeq)< maxDstSeq || (seq == maxDstSeq && off > maxDstOff) { + maxDstSeq = seq + maxDstOff = off + } + } + + if (op == OPnot && validKeyNum != 1) || + (op != OPnot && validKeyNum < 2) { + return // with not enough existing source key + } + + var srcIdx int + for srcIdx = 0; srcIdx < keyNum; srcIdx++ { + if srckeys[srcIdx] != nil { + break + } + } + + // init - data + var segments = make([][]byte, maxDstSeq+1) + + if op == OPnot { + // ps : + // ( ~num == num ^ 0x11111111 ) + // we init the result segments with all bit set, + // then we can calculate through the way of 'xor'. + + // ahead segments bin format : 1111 ... 1111 + for i := uint32(0); i < maxDstSeq; i++ { + segments[i] = fillSegment + } + + // last segment bin format : 1111..1100..0000 + var tailSeg = make([]byte, segByteSize, segByteSize) + var fillByte = fillBits[7] + var tailSegLen = db.bCapByteSize(uint32(0), maxDstOff) + for i := uint32(0); i < tailSegLen-1; i++ { + tailSeg[i] = fillByte + } + tailSeg[tailSegLen-1] = fillBits[maxDstOff-(tailSegLen-1)<<3] + segments[maxDstSeq] = tailSeg + + } else { + // ps : init segments by data corresponding to the 1st valid source key + it := db.bIterator(srckeys[srcIdx]) + for ; it.Valid(); it.Next() { + if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { + // to do ... + it.Close() + return + } + segments[seq] = it.Value() + } + it.Close() + srcIdx++ + } + + // operation with following keys + var res []byte + for i := srcIdx; i < keyNum; i++ { + if srckeys[i] == nil { + continue + } + + it := db.bIterator(srckeys[i]) + for idx, end := uint32(0), false; !end; it.Next() { + end = !it.Valid() + if !end { + if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { + // to do ... 
+ it.Close() + return + } + } else { + seq = maxDstSeq + 1 + } + + // todo : + // operation 'and' can be optimize here : + // if seq > max_segments_idx, this loop can be break, + // which can avoid cost from Key() and bDecodeBinKey() + + for ; idx < seq; idx++ { + res = nil + exeOp(segments[idx], nil, &res) + segments[idx] = res + } + + if !end { + res = it.Value() + exeOp(segments[seq], res, &res) + segments[seq] = res + idx++ + } + } + it.Close() + } + + // clear the old data in case + db.bDelete(t, dstkey) + db.rmExpire(t, BitType, dstkey) + + // set data + db.bSetMeta(t, dstkey, maxDstSeq, maxDstOff) + + var bk []byte + for seq, segt := range segments { + if segt != nil { + bk = db.bEncodeBinKey(dstkey, uint32(seq)) + t.Put(bk, segt) + } + } + + err = t.Commit() + if err == nil { + // blen = int32(db.bCapByteSize(maxDstOff, maxDstOff)) + blen = int32(maxDstSeq< MaxKeySize || len(key) == 0 { + return errKeySize + } else if len(field) > MaxHashFieldSize || len(field) == 0 { + return errHashFieldSize + } + return nil +} + +func (db *DB) hEncodeSizeKey(key []byte) []byte { + buf := make([]byte, len(key)+2) + + buf[0] = db.index + buf[1] = HSizeType + + copy(buf[2:], key) + return buf +} + +func (db *DB) hDecodeSizeKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != HSizeType { + return nil, errHSizeKey + } + + return ek[2:], nil +} + +func (db *DB) hEncodeHashKey(key []byte, field []byte) []byte { + buf := make([]byte, len(key)+len(field)+1+1+2+1) + + pos := 0 + buf[pos] = db.index + pos++ + buf[pos] = HashType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + buf[pos] = hashStartSep + pos++ + copy(buf[pos:], field) + + return buf +} + +func (db *DB) hDecodeHashKey(ek []byte) ([]byte, []byte, error) { + if len(ek) < 5 || ek[0] != db.index || ek[1] != HashType { + return nil, nil, errHashKey + } + + pos := 2 + keyLen := int(binary.BigEndian.Uint16(ek[pos:])) + pos += 2 + + if keyLen+5 > len(ek) { + return nil, nil, errHashKey + } + + key := ek[pos : pos+keyLen] + pos += keyLen + + if ek[pos] != hashStartSep { + return nil, nil, errHashKey + } + + pos++ + field := ek[pos:] + return key, field, nil +} + +func (db *DB) hEncodeStartKey(key []byte) []byte { + return db.hEncodeHashKey(key, nil) +} + +func (db *DB) hEncodeStopKey(key []byte) []byte { + k := db.hEncodeHashKey(key, nil) + + k[len(k)-1] = hashStopSep + + return k +} + +func (db *DB) hSetItem(key []byte, field []byte, value []byte) (int64, error) { + t := db.hashBatch + + ek := db.hEncodeHashKey(key, field) + + var n int64 = 1 + if v, _ := db.bucket.Get(ek); v != nil { + n = 0 + } else { + if _, err := db.hIncrSize(key, 1); err != nil { + return 0, err + } + } + + t.Put(ek, value) + return n, nil +} + +// ps : here just focus on deleting the hash data, +// any other likes expire is ignore. 
+func (db *DB) hDelete(t *batch, key []byte) int64 { + sk := db.hEncodeSizeKey(key) + start := db.hEncodeStartKey(key) + stop := db.hEncodeStopKey(key) + + var num int64 = 0 + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + t.Delete(it.Key()) + num++ + } + it.Close() + + t.Delete(sk) + return num +} + +func (db *DB) hExpireAt(key []byte, when int64) (int64, error) { + t := db.hashBatch + t.Lock() + defer t.Unlock() + + if hlen, err := db.HLen(key); err != nil || hlen == 0 { + return 0, err + } else { + db.expireAt(t, HashType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) HLen(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + return Int64(db.bucket.Get(db.hEncodeSizeKey(key))) +} + +func (db *DB) HSet(key []byte, field []byte, value []byte) (int64, error) { + if err := checkHashKFSize(key, field); err != nil { + return 0, err + } else if err := checkValueSize(value); err != nil { + return 0, err + } + + t := db.hashBatch + t.Lock() + defer t.Unlock() + + n, err := db.hSetItem(key, field, value) + if err != nil { + return 0, err + } + + //todo add binlog + + err = t.Commit() + return n, err +} + +func (db *DB) HGet(key []byte, field []byte) ([]byte, error) { + if err := checkHashKFSize(key, field); err != nil { + return nil, err + } + + return db.bucket.Get(db.hEncodeHashKey(key, field)) +} + +func (db *DB) HMset(key []byte, args ...FVPair) error { + t := db.hashBatch + t.Lock() + defer t.Unlock() + + var err error + var ek []byte + var num int64 = 0 + for i := 0; i < len(args); i++ { + if err := checkHashKFSize(key, args[i].Field); err != nil { + return err + } else if err := checkValueSize(args[i].Value); err != nil { + return err + } + + ek = db.hEncodeHashKey(key, args[i].Field) + + if v, err := db.bucket.Get(ek); err != nil { + return err + } else if v == nil { + num++ + } + + t.Put(ek, args[i].Value) + } + + if _, err = db.hIncrSize(key, num); err != nil { + return err + } + + //todo add binglog + err = t.Commit() + return err +} + +func (db *DB) HMget(key []byte, args ...[]byte) ([][]byte, error) { + var ek []byte + + it := db.bucket.NewIterator() + defer it.Close() + + r := make([][]byte, len(args)) + for i := 0; i < len(args); i++ { + if err := checkHashKFSize(key, args[i]); err != nil { + return nil, err + } + + ek = db.hEncodeHashKey(key, args[i]) + + r[i] = it.Find(ek) + } + + return r, nil +} + +func (db *DB) HDel(key []byte, args ...[]byte) (int64, error) { + t := db.hashBatch + + var ek []byte + var v []byte + var err error + + t.Lock() + defer t.Unlock() + + it := db.bucket.NewIterator() + defer it.Close() + + var num int64 = 0 + for i := 0; i < len(args); i++ { + if err := checkHashKFSize(key, args[i]); err != nil { + return 0, err + } + + ek = db.hEncodeHashKey(key, args[i]) + + v = it.RawFind(ek) + if v == nil { + continue + } else { + num++ + t.Delete(ek) + } + } + + if _, err = db.hIncrSize(key, -num); err != nil { + return 0, err + } + + err = t.Commit() + + return num, err +} + +func (db *DB) hIncrSize(key []byte, delta int64) (int64, error) { + t := db.hashBatch + sk := db.hEncodeSizeKey(key) + + var err error + var size int64 = 0 + if size, err = Int64(db.bucket.Get(sk)); err != nil { + return 0, err + } else { + size += delta + if size <= 0 { + size = 0 + t.Delete(sk) + db.rmExpire(t, HashType, key) + } else { + t.Put(sk, PutInt64(size)) + } + } + + return size, nil +} + +func (db *DB) HIncrBy(key 
[]byte, field []byte, delta int64) (int64, error) { + if err := checkHashKFSize(key, field); err != nil { + return 0, err + } + + t := db.hashBatch + var ek []byte + var err error + + t.Lock() + defer t.Unlock() + + ek = db.hEncodeHashKey(key, field) + + var n int64 = 0 + if n, err = StrInt64(db.bucket.Get(ek)); err != nil { + return 0, err + } + + n += delta + + _, err = db.hSetItem(key, field, StrPutInt64(n)) + if err != nil { + return 0, err + } + + err = t.Commit() + + return n, err +} + +func (db *DB) HGetAll(key []byte) ([]FVPair, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + start := db.hEncodeStartKey(key) + stop := db.hEncodeStopKey(key) + + v := make([]FVPair, 0, 16) + + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + _, f, err := db.hDecodeHashKey(it.Key()) + if err != nil { + return nil, err + } + + v = append(v, FVPair{Field: f, Value: it.Value()}) + } + + it.Close() + + return v, nil +} + +func (db *DB) HKeys(key []byte) ([][]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + start := db.hEncodeStartKey(key) + stop := db.hEncodeStopKey(key) + + v := make([][]byte, 0, 16) + + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + _, f, err := db.hDecodeHashKey(it.Key()) + if err != nil { + return nil, err + } + v = append(v, f) + } + + it.Close() + + return v, nil +} + +func (db *DB) HValues(key []byte) ([][]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + start := db.hEncodeStartKey(key) + stop := db.hEncodeStopKey(key) + + v := make([][]byte, 0, 16) + + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + _, _, err := db.hDecodeHashKey(it.Key()) + if err != nil { + return nil, err + } + + v = append(v, it.Value()) + } + + it.Close() + + return v, nil +} + +func (db *DB) HClear(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.hashBatch + t.Lock() + defer t.Unlock() + + num := db.hDelete(t, key) + db.rmExpire(t, HashType, key) + + err := t.Commit() + return num, err +} + +func (db *DB) HMclear(keys ...[]byte) (int64, error) { + t := db.hashBatch + t.Lock() + defer t.Unlock() + + for _, key := range keys { + if err := checkKeySize(key); err != nil { + return 0, err + } + + db.hDelete(t, key) + db.rmExpire(t, HashType, key) + } + + err := t.Commit() + return int64(len(keys)), err +} + +func (db *DB) hFlush() (drop int64, err error) { + t := db.hashBatch + + t.Lock() + defer t.Unlock() + + return db.flushType(t, HashType) +} + +func (db *DB) HScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(HSizeType, key, count, inclusive, match) +} + +func (db *DB) HExpire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.hExpireAt(key, time.Now().Unix()+duration) +} + +func (db *DB) HExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.hExpireAt(key, when) +} + +func (db *DB) HTTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(HashType, key) +} + +func (db *DB) HPersist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.hashBatch + t.Lock() + defer t.Unlock() + + n, err := 
db.rmExpire(t, HashType, key) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} diff --git a/vendor/github.com/lunny/nodb/t_kv.go b/vendor/github.com/lunny/nodb/t_kv.go new file mode 100644 index 000000000000..82a12f7027a5 --- /dev/null +++ b/vendor/github.com/lunny/nodb/t_kv.go @@ -0,0 +1,387 @@ +package nodb + +import ( + "errors" + "time" +) + +type KVPair struct { + Key []byte + Value []byte +} + +var errKVKey = errors.New("invalid encode kv key") + +func checkKeySize(key []byte) error { + if len(key) > MaxKeySize || len(key) == 0 { + return errKeySize + } + return nil +} + +func checkValueSize(value []byte) error { + if len(value) > MaxValueSize { + return errValueSize + } + + return nil +} + +func (db *DB) encodeKVKey(key []byte) []byte { + ek := make([]byte, len(key)+2) + ek[0] = db.index + ek[1] = KVType + copy(ek[2:], key) + return ek +} + +func (db *DB) decodeKVKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != KVType { + return nil, errKVKey + } + + return ek[2:], nil +} + +func (db *DB) encodeKVMinKey() []byte { + ek := db.encodeKVKey(nil) + return ek +} + +func (db *DB) encodeKVMaxKey() []byte { + ek := db.encodeKVKey(nil) + ek[len(ek)-1] = KVType + 1 + return ek +} + +func (db *DB) incr(key []byte, delta int64) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + var err error + key = db.encodeKVKey(key) + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + var n int64 + n, err = StrInt64(db.bucket.Get(key)) + if err != nil { + return 0, err + } + + n += delta + + t.Put(key, StrPutInt64(n)) + + //todo binlog + + err = t.Commit() + return n, err +} + +// ps : here just focus on deleting the key-value data, +// any other likes expire is ignore. 
+func (db *DB) delete(t *batch, key []byte) int64 { + key = db.encodeKVKey(key) + t.Delete(key) + return 1 +} + +func (db *DB) setExpireAt(key []byte, when int64) (int64, error) { + t := db.kvBatch + t.Lock() + defer t.Unlock() + + if exist, err := db.Exists(key); err != nil || exist == 0 { + return 0, err + } else { + db.expireAt(t, KVType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) Decr(key []byte) (int64, error) { + return db.incr(key, -1) +} + +func (db *DB) DecrBy(key []byte, decrement int64) (int64, error) { + return db.incr(key, -decrement) +} + +func (db *DB) Del(keys ...[]byte) (int64, error) { + if len(keys) == 0 { + return 0, nil + } + + codedKeys := make([][]byte, len(keys)) + for i, k := range keys { + codedKeys[i] = db.encodeKVKey(k) + } + + t := db.kvBatch + t.Lock() + defer t.Unlock() + + for i, k := range keys { + t.Delete(codedKeys[i]) + db.rmExpire(t, KVType, k) + } + + err := t.Commit() + return int64(len(keys)), err +} + +func (db *DB) Exists(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + var err error + key = db.encodeKVKey(key) + + var v []byte + v, err = db.bucket.Get(key) + if v != nil && err == nil { + return 1, nil + } + + return 0, err +} + +func (db *DB) Get(key []byte) ([]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + key = db.encodeKVKey(key) + + return db.bucket.Get(key) +} + +func (db *DB) GetSet(key []byte, value []byte) ([]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } else if err := checkValueSize(value); err != nil { + return nil, err + } + + key = db.encodeKVKey(key) + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + oldValue, err := db.bucket.Get(key) + if err != nil { + return nil, err + } + + t.Put(key, value) + //todo, binlog + + err = t.Commit() + + return oldValue, err +} + +func (db *DB) Incr(key []byte) (int64, error) { + return db.incr(key, 1) +} + +func (db *DB) IncrBy(key []byte, increment int64) (int64, error) { + return db.incr(key, increment) +} + +func (db *DB) MGet(keys ...[]byte) ([][]byte, error) { + values := make([][]byte, len(keys)) + + it := db.bucket.NewIterator() + defer it.Close() + + for i := range keys { + if err := checkKeySize(keys[i]); err != nil { + return nil, err + } + + values[i] = it.Find(db.encodeKVKey(keys[i])) + } + + return values, nil +} + +func (db *DB) MSet(args ...KVPair) error { + if len(args) == 0 { + return nil + } + + t := db.kvBatch + + var err error + var key []byte + var value []byte + + t.Lock() + defer t.Unlock() + + for i := 0; i < len(args); i++ { + if err := checkKeySize(args[i].Key); err != nil { + return err + } else if err := checkValueSize(args[i].Value); err != nil { + return err + } + + key = db.encodeKVKey(args[i].Key) + + value = args[i].Value + + t.Put(key, value) + + //todo binlog + } + + err = t.Commit() + return err +} + +func (db *DB) Set(key []byte, value []byte) error { + if err := checkKeySize(key); err != nil { + return err + } else if err := checkValueSize(value); err != nil { + return err + } + + var err error + key = db.encodeKVKey(key) + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + t.Put(key, value) + + err = t.Commit() + + return err +} + +func (db *DB) SetNX(key []byte, value []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } else if err := checkValueSize(value); err != nil { + return 0, err + } + + var err error + key = db.encodeKVKey(key) 
+ + var n int64 = 1 + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + if v, err := db.bucket.Get(key); err != nil { + return 0, err + } else if v != nil { + n = 0 + } else { + t.Put(key, value) + + //todo binlog + + err = t.Commit() + } + + return n, err +} + +func (db *DB) flush() (drop int64, err error) { + t := db.kvBatch + t.Lock() + defer t.Unlock() + return db.flushType(t, KVType) +} + +//if inclusive is true, scan range [key, inf) else (key, inf) +func (db *DB) Scan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(KVType, key, count, inclusive, match) +} + +func (db *DB) Expire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.setExpireAt(key, time.Now().Unix()+duration) +} + +func (db *DB) ExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.setExpireAt(key, when) +} + +func (db *DB) TTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(KVType, key) +} + +func (db *DB) Persist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.kvBatch + t.Lock() + defer t.Unlock() + n, err := db.rmExpire(t, KVType, key) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} + +func (db *DB) Lock() { + t := db.kvBatch + t.Lock() +} + +func (db *DB) Remove(key []byte) bool { + if len(key) == 0 { + return false + } + t := db.kvBatch + t.Delete(db.encodeKVKey(key)) + _, err := db.rmExpire(t, KVType, key) + if err != nil { + return false + } + return true +} + +func (db *DB) Commit() error { + t := db.kvBatch + return t.Commit() +} + +func (db *DB) Unlock() { + t := db.kvBatch + t.Unlock() +} diff --git a/vendor/github.com/lunny/nodb/t_list.go b/vendor/github.com/lunny/nodb/t_list.go new file mode 100644 index 000000000000..5b9d9d9c2162 --- /dev/null +++ b/vendor/github.com/lunny/nodb/t_list.go @@ -0,0 +1,492 @@ +package nodb + +import ( + "encoding/binary" + "errors" + "time" + + "github.com/lunny/nodb/store" +) + +const ( + listHeadSeq int32 = 1 + listTailSeq int32 = 2 + + listMinSeq int32 = 1000 + listMaxSeq int32 = 1<<31 - 1000 + listInitialSeq int32 = listMinSeq + (listMaxSeq-listMinSeq)/2 +) + +var errLMetaKey = errors.New("invalid lmeta key") +var errListKey = errors.New("invalid list key") +var errListSeq = errors.New("invalid list sequence, overflow") + +func (db *DB) lEncodeMetaKey(key []byte) []byte { + buf := make([]byte, len(key)+2) + buf[0] = db.index + buf[1] = LMetaType + + copy(buf[2:], key) + return buf +} + +func (db *DB) lDecodeMetaKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != LMetaType { + return nil, errLMetaKey + } + + return ek[2:], nil +} + +func (db *DB) lEncodeListKey(key []byte, seq int32) []byte { + buf := make([]byte, len(key)+8) + + pos := 0 + buf[pos] = db.index + pos++ + buf[pos] = ListType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + binary.BigEndian.PutUint32(buf[pos:], uint32(seq)) + + return buf +} + +func (db *DB) lDecodeListKey(ek []byte) (key []byte, seq int32, err error) { + if len(ek) < 8 || ek[0] != db.index || ek[1] != ListType { + err = errListKey + return + } + + keyLen := int(binary.BigEndian.Uint16(ek[2:])) + if keyLen+8 != len(ek) { + err = errListKey + return + } + + key = ek[4 : 4+keyLen] + seq = 
int32(binary.BigEndian.Uint32(ek[4+keyLen:])) + return +} + +func (db *DB) lpush(key []byte, whereSeq int32, args ...[]byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + var headSeq int32 + var tailSeq int32 + var size int32 + var err error + + t := db.listBatch + t.Lock() + defer t.Unlock() + + metaKey := db.lEncodeMetaKey(key) + headSeq, tailSeq, size, err = db.lGetMeta(nil, metaKey) + if err != nil { + return 0, err + } + + var pushCnt int = len(args) + if pushCnt == 0 { + return int64(size), nil + } + + var seq int32 = headSeq + var delta int32 = -1 + if whereSeq == listTailSeq { + seq = tailSeq + delta = 1 + } + + // append elements + if size > 0 { + seq += delta + } + + for i := 0; i < pushCnt; i++ { + ek := db.lEncodeListKey(key, seq+int32(i)*delta) + t.Put(ek, args[i]) + } + + seq += int32(pushCnt-1) * delta + if seq <= listMinSeq || seq >= listMaxSeq { + return 0, errListSeq + } + + // set meta info + if whereSeq == listHeadSeq { + headSeq = seq + } else { + tailSeq = seq + } + + db.lSetMeta(metaKey, headSeq, tailSeq) + + err = t.Commit() + return int64(size) + int64(pushCnt), err +} + +func (db *DB) lpop(key []byte, whereSeq int32) ([]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + t := db.listBatch + t.Lock() + defer t.Unlock() + + var headSeq int32 + var tailSeq int32 + var err error + + metaKey := db.lEncodeMetaKey(key) + headSeq, tailSeq, _, err = db.lGetMeta(nil, metaKey) + if err != nil { + return nil, err + } + + var value []byte + + var seq int32 = headSeq + if whereSeq == listTailSeq { + seq = tailSeq + } + + itemKey := db.lEncodeListKey(key, seq) + value, err = db.bucket.Get(itemKey) + if err != nil { + return nil, err + } + + if whereSeq == listHeadSeq { + headSeq += 1 + } else { + tailSeq -= 1 + } + + t.Delete(itemKey) + size := db.lSetMeta(metaKey, headSeq, tailSeq) + if size == 0 { + db.rmExpire(t, HashType, key) + } + + err = t.Commit() + return value, err +} + +// ps : here just focus on deleting the list data, +// any other likes expire is ignore. 
+func (db *DB) lDelete(t *batch, key []byte) int64 { + mk := db.lEncodeMetaKey(key) + + var headSeq int32 + var tailSeq int32 + var err error + + it := db.bucket.NewIterator() + defer it.Close() + + headSeq, tailSeq, _, err = db.lGetMeta(it, mk) + if err != nil { + return 0 + } + + var num int64 = 0 + startKey := db.lEncodeListKey(key, headSeq) + stopKey := db.lEncodeListKey(key, tailSeq) + + rit := store.NewRangeIterator(it, &store.Range{startKey, stopKey, store.RangeClose}) + for ; rit.Valid(); rit.Next() { + t.Delete(rit.RawKey()) + num++ + } + + t.Delete(mk) + + return num +} + +func (db *DB) lGetMeta(it *store.Iterator, ek []byte) (headSeq int32, tailSeq int32, size int32, err error) { + var v []byte + if it != nil { + v = it.Find(ek) + } else { + v, err = db.bucket.Get(ek) + } + if err != nil { + return + } else if v == nil { + headSeq = listInitialSeq + tailSeq = listInitialSeq + size = 0 + return + } else { + headSeq = int32(binary.LittleEndian.Uint32(v[0:4])) + tailSeq = int32(binary.LittleEndian.Uint32(v[4:8])) + size = tailSeq - headSeq + 1 + } + return +} + +func (db *DB) lSetMeta(ek []byte, headSeq int32, tailSeq int32) int32 { + t := db.listBatch + + var size int32 = tailSeq - headSeq + 1 + if size < 0 { + // todo : log error + panic + } else if size == 0 { + t.Delete(ek) + } else { + buf := make([]byte, 8) + + binary.LittleEndian.PutUint32(buf[0:4], uint32(headSeq)) + binary.LittleEndian.PutUint32(buf[4:8], uint32(tailSeq)) + + t.Put(ek, buf) + } + + return size +} + +func (db *DB) lExpireAt(key []byte, when int64) (int64, error) { + t := db.listBatch + t.Lock() + defer t.Unlock() + + if llen, err := db.LLen(key); err != nil || llen == 0 { + return 0, err + } else { + db.expireAt(t, ListType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) LIndex(key []byte, index int32) ([]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + var seq int32 + var headSeq int32 + var tailSeq int32 + var err error + + metaKey := db.lEncodeMetaKey(key) + + it := db.bucket.NewIterator() + defer it.Close() + + headSeq, tailSeq, _, err = db.lGetMeta(it, metaKey) + if err != nil { + return nil, err + } + + if index >= 0 { + seq = headSeq + index + } else { + seq = tailSeq + index + 1 + } + + sk := db.lEncodeListKey(key, seq) + v := it.Find(sk) + + return v, nil +} + +func (db *DB) LLen(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + ek := db.lEncodeMetaKey(key) + _, _, size, err := db.lGetMeta(nil, ek) + return int64(size), err +} + +func (db *DB) LPop(key []byte) ([]byte, error) { + return db.lpop(key, listHeadSeq) +} + +func (db *DB) LPush(key []byte, arg1 []byte, args ...[]byte) (int64, error) { + var argss = [][]byte{arg1} + argss = append(argss, args...) + return db.lpush(key, listHeadSeq, argss...) 
+} + +func (db *DB) LRange(key []byte, start int32, stop int32) ([][]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + var headSeq int32 + var llen int32 + var err error + + metaKey := db.lEncodeMetaKey(key) + + it := db.bucket.NewIterator() + defer it.Close() + + if headSeq, _, llen, err = db.lGetMeta(it, metaKey); err != nil { + return nil, err + } + + if start < 0 { + start = llen + start + } + if stop < 0 { + stop = llen + stop + } + if start < 0 { + start = 0 + } + + if start > stop || start >= llen { + return [][]byte{}, nil + } + + if stop >= llen { + stop = llen - 1 + } + + limit := (stop - start) + 1 + headSeq += start + + v := make([][]byte, 0, limit) + + startKey := db.lEncodeListKey(key, headSeq) + rit := store.NewRangeLimitIterator(it, + &store.Range{ + Min: startKey, + Max: nil, + Type: store.RangeClose}, + &store.Limit{ + Offset: 0, + Count: int(limit)}) + + for ; rit.Valid(); rit.Next() { + v = append(v, rit.Value()) + } + + return v, nil +} + +func (db *DB) RPop(key []byte) ([]byte, error) { + return db.lpop(key, listTailSeq) +} + +func (db *DB) RPush(key []byte, arg1 []byte, args ...[]byte) (int64, error) { + var argss = [][]byte{arg1} + argss = append(argss, args...) + return db.lpush(key, listTailSeq, argss...) +} + +func (db *DB) LClear(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.listBatch + t.Lock() + defer t.Unlock() + + num := db.lDelete(t, key) + db.rmExpire(t, ListType, key) + + err := t.Commit() + return num, err +} + +func (db *DB) LMclear(keys ...[]byte) (int64, error) { + t := db.listBatch + t.Lock() + defer t.Unlock() + + for _, key := range keys { + if err := checkKeySize(key); err != nil { + return 0, err + } + + db.lDelete(t, key) + db.rmExpire(t, ListType, key) + + } + + err := t.Commit() + return int64(len(keys)), err +} + +func (db *DB) lFlush() (drop int64, err error) { + t := db.listBatch + t.Lock() + defer t.Unlock() + return db.flushType(t, ListType) +} + +func (db *DB) LExpire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.lExpireAt(key, time.Now().Unix()+duration) +} + +func (db *DB) LExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.lExpireAt(key, when) +} + +func (db *DB) LTTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(ListType, key) +} + +func (db *DB) LPersist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.listBatch + t.Lock() + defer t.Unlock() + + n, err := db.rmExpire(t, ListType, key) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} + +func (db *DB) LScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(LMetaType, key, count, inclusive, match) +} + +func (db *DB) lEncodeMinKey() []byte { + return db.lEncodeMetaKey(nil) +} + +func (db *DB) lEncodeMaxKey() []byte { + ek := db.lEncodeMetaKey(nil) + ek[len(ek)-1] = LMetaType + 1 + return ek +} diff --git a/vendor/github.com/lunny/nodb/t_set.go b/vendor/github.com/lunny/nodb/t_set.go new file mode 100644 index 000000000000..41ce30e8ce82 --- /dev/null +++ b/vendor/github.com/lunny/nodb/t_set.go @@ -0,0 +1,601 @@ +package nodb + +import ( + "encoding/binary" + "errors" + "time" + + "github.com/lunny/nodb/store" +) + +var errSetKey = errors.New("invalid set key") +var 
errSSizeKey = errors.New("invalid ssize key") + +const ( + setStartSep byte = ':' + setStopSep byte = setStartSep + 1 + UnionType byte = 51 + DiffType byte = 52 + InterType byte = 53 +) + +func checkSetKMSize(key []byte, member []byte) error { + if len(key) > MaxKeySize || len(key) == 0 { + return errKeySize + } else if len(member) > MaxSetMemberSize || len(member) == 0 { + return errSetMemberSize + } + return nil +} + +func (db *DB) sEncodeSizeKey(key []byte) []byte { + buf := make([]byte, len(key)+2) + + buf[0] = db.index + buf[1] = SSizeType + + copy(buf[2:], key) + return buf +} + +func (db *DB) sDecodeSizeKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != SSizeType { + return nil, errSSizeKey + } + + return ek[2:], nil +} + +func (db *DB) sEncodeSetKey(key []byte, member []byte) []byte { + buf := make([]byte, len(key)+len(member)+1+1+2+1) + + pos := 0 + buf[pos] = db.index + pos++ + buf[pos] = SetType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + buf[pos] = setStartSep + pos++ + copy(buf[pos:], member) + + return buf +} + +func (db *DB) sDecodeSetKey(ek []byte) ([]byte, []byte, error) { + if len(ek) < 5 || ek[0] != db.index || ek[1] != SetType { + return nil, nil, errSetKey + } + + pos := 2 + keyLen := int(binary.BigEndian.Uint16(ek[pos:])) + pos += 2 + + if keyLen+5 > len(ek) { + return nil, nil, errSetKey + } + + key := ek[pos : pos+keyLen] + pos += keyLen + + if ek[pos] != hashStartSep { + return nil, nil, errSetKey + } + + pos++ + member := ek[pos:] + return key, member, nil +} + +func (db *DB) sEncodeStartKey(key []byte) []byte { + return db.sEncodeSetKey(key, nil) +} + +func (db *DB) sEncodeStopKey(key []byte) []byte { + k := db.sEncodeSetKey(key, nil) + + k[len(k)-1] = setStopSep + + return k +} + +func (db *DB) sFlush() (drop int64, err error) { + + t := db.setBatch + t.Lock() + defer t.Unlock() + + return db.flushType(t, SetType) +} + +func (db *DB) sDelete(t *batch, key []byte) int64 { + sk := db.sEncodeSizeKey(key) + start := db.sEncodeStartKey(key) + stop := db.sEncodeStopKey(key) + + var num int64 = 0 + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + t.Delete(it.RawKey()) + num++ + } + + it.Close() + + t.Delete(sk) + return num +} + +func (db *DB) sIncrSize(key []byte, delta int64) (int64, error) { + t := db.setBatch + sk := db.sEncodeSizeKey(key) + + var err error + var size int64 = 0 + if size, err = Int64(db.bucket.Get(sk)); err != nil { + return 0, err + } else { + size += delta + if size <= 0 { + size = 0 + t.Delete(sk) + db.rmExpire(t, SetType, key) + } else { + t.Put(sk, PutInt64(size)) + } + } + + return size, nil +} + +func (db *DB) sExpireAt(key []byte, when int64) (int64, error) { + t := db.setBatch + t.Lock() + defer t.Unlock() + + if scnt, err := db.SCard(key); err != nil || scnt == 0 { + return 0, err + } else { + db.expireAt(t, SetType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + + } + + return 1, nil +} + +func (db *DB) sSetItem(key []byte, member []byte) (int64, error) { + t := db.setBatch + ek := db.sEncodeSetKey(key, member) + + var n int64 = 1 + if v, _ := db.bucket.Get(ek); v != nil { + n = 0 + } else { + if _, err := db.sIncrSize(key, 1); err != nil { + return 0, err + } + } + + t.Put(ek, nil) + return n, nil +} + +func (db *DB) SAdd(key []byte, args ...[]byte) (int64, error) { + t := db.setBatch + t.Lock() + defer t.Unlock() + + var err error + var ek 
[]byte + var num int64 = 0 + for i := 0; i < len(args); i++ { + if err := checkSetKMSize(key, args[i]); err != nil { + return 0, err + } + + ek = db.sEncodeSetKey(key, args[i]) + + if v, err := db.bucket.Get(ek); err != nil { + return 0, err + } else if v == nil { + num++ + } + + t.Put(ek, nil) + } + + if _, err = db.sIncrSize(key, num); err != nil { + return 0, err + } + + err = t.Commit() + return num, err + +} + +func (db *DB) SCard(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + sk := db.sEncodeSizeKey(key) + + return Int64(db.bucket.Get(sk)) +} + +func (db *DB) sDiffGeneric(keys ...[]byte) ([][]byte, error) { + destMap := make(map[string]bool) + + members, err := db.SMembers(keys[0]) + if err != nil { + return nil, err + } + + for _, m := range members { + destMap[String(m)] = true + } + + for _, k := range keys[1:] { + members, err := db.SMembers(k) + if err != nil { + return nil, err + } + + for _, m := range members { + if _, ok := destMap[String(m)]; !ok { + continue + } else if ok { + delete(destMap, String(m)) + } + } + // O - A = O, O is zero set. + if len(destMap) == 0 { + return nil, nil + } + } + + slice := make([][]byte, len(destMap)) + idx := 0 + for k, v := range destMap { + if !v { + continue + } + slice[idx] = []byte(k) + idx++ + } + + return slice, nil +} + +func (db *DB) SDiff(keys ...[]byte) ([][]byte, error) { + v, err := db.sDiffGeneric(keys...) + return v, err +} + +func (db *DB) SDiffStore(dstKey []byte, keys ...[]byte) (int64, error) { + n, err := db.sStoreGeneric(dstKey, DiffType, keys...) + return n, err +} + +func (db *DB) sInterGeneric(keys ...[]byte) ([][]byte, error) { + destMap := make(map[string]bool) + + members, err := db.SMembers(keys[0]) + if err != nil { + return nil, err + } + + for _, m := range members { + destMap[String(m)] = true + } + + for _, key := range keys[1:] { + if err := checkKeySize(key); err != nil { + return nil, err + } + + members, err := db.SMembers(key) + if err != nil { + return nil, err + } else if len(members) == 0 { + return nil, err + } + + tempMap := make(map[string]bool) + for _, member := range members { + if err := checkKeySize(member); err != nil { + return nil, err + } + if _, ok := destMap[String(member)]; ok { + tempMap[String(member)] = true //mark this item as selected + } + } + destMap = tempMap //reduce the size of the result set + if len(destMap) == 0 { + return nil, nil + } + } + + slice := make([][]byte, len(destMap)) + idx := 0 + for k, v := range destMap { + if !v { + continue + } + + slice[idx] = []byte(k) + idx++ + } + + return slice, nil + +} + +func (db *DB) SInter(keys ...[]byte) ([][]byte, error) { + v, err := db.sInterGeneric(keys...) + return v, err + +} + +func (db *DB) SInterStore(dstKey []byte, keys ...[]byte) (int64, error) { + n, err := db.sStoreGeneric(dstKey, InterType, keys...) 
+ return n, err +} + +func (db *DB) SIsMember(key []byte, member []byte) (int64, error) { + ek := db.sEncodeSetKey(key, member) + + var n int64 = 1 + if v, err := db.bucket.Get(ek); err != nil { + return 0, err + } else if v == nil { + n = 0 + } + return n, nil +} + +func (db *DB) SMembers(key []byte) ([][]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + start := db.sEncodeStartKey(key) + stop := db.sEncodeStopKey(key) + + v := make([][]byte, 0, 16) + + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + _, m, err := db.sDecodeSetKey(it.Key()) + if err != nil { + return nil, err + } + + v = append(v, m) + } + + it.Close() + + return v, nil +} + +func (db *DB) SRem(key []byte, args ...[]byte) (int64, error) { + t := db.setBatch + t.Lock() + defer t.Unlock() + + var ek []byte + var v []byte + var err error + + it := db.bucket.NewIterator() + defer it.Close() + + var num int64 = 0 + for i := 0; i < len(args); i++ { + if err := checkSetKMSize(key, args[i]); err != nil { + return 0, err + } + + ek = db.sEncodeSetKey(key, args[i]) + + v = it.RawFind(ek) + if v == nil { + continue + } else { + num++ + t.Delete(ek) + } + } + + if _, err = db.sIncrSize(key, -num); err != nil { + return 0, err + } + + err = t.Commit() + return num, err + +} + +func (db *DB) sUnionGeneric(keys ...[]byte) ([][]byte, error) { + dstMap := make(map[string]bool) + + for _, key := range keys { + if err := checkKeySize(key); err != nil { + return nil, err + } + + members, err := db.SMembers(key) + if err != nil { + return nil, err + } + + for _, member := range members { + dstMap[String(member)] = true + } + } + + slice := make([][]byte, len(dstMap)) + idx := 0 + for k, v := range dstMap { + if !v { + continue + } + slice[idx] = []byte(k) + idx++ + } + + return slice, nil +} + +func (db *DB) SUnion(keys ...[]byte) ([][]byte, error) { + v, err := db.sUnionGeneric(keys...) + return v, err +} + +func (db *DB) SUnionStore(dstKey []byte, keys ...[]byte) (int64, error) { + n, err := db.sStoreGeneric(dstKey, UnionType, keys...) + return n, err +} + +func (db *DB) sStoreGeneric(dstKey []byte, optType byte, keys ...[]byte) (int64, error) { + if err := checkKeySize(dstKey); err != nil { + return 0, err + } + + t := db.setBatch + t.Lock() + defer t.Unlock() + + db.sDelete(t, dstKey) + + var err error + var ek []byte + var v [][]byte + + switch optType { + case UnionType: + v, err = db.sUnionGeneric(keys...) + case DiffType: + v, err = db.sDiffGeneric(keys...) + case InterType: + v, err = db.sInterGeneric(keys...) 
+ } + + if err != nil { + return 0, err + } + + for _, m := range v { + if err := checkSetKMSize(dstKey, m); err != nil { + return 0, err + } + + ek = db.sEncodeSetKey(dstKey, m) + + if _, err := db.bucket.Get(ek); err != nil { + return 0, err + } + + t.Put(ek, nil) + } + + var num = int64(len(v)) + sk := db.sEncodeSizeKey(dstKey) + t.Put(sk, PutInt64(num)) + + if err = t.Commit(); err != nil { + return 0, err + } + return num, nil +} + +func (db *DB) SClear(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.setBatch + t.Lock() + defer t.Unlock() + + num := db.sDelete(t, key) + db.rmExpire(t, SetType, key) + + err := t.Commit() + return num, err +} + +func (db *DB) SMclear(keys ...[]byte) (int64, error) { + t := db.setBatch + t.Lock() + defer t.Unlock() + + for _, key := range keys { + if err := checkKeySize(key); err != nil { + return 0, err + } + + db.sDelete(t, key) + db.rmExpire(t, SetType, key) + } + + err := t.Commit() + return int64(len(keys)), err +} + +func (db *DB) SExpire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.sExpireAt(key, time.Now().Unix()+duration) + +} + +func (db *DB) SExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.sExpireAt(key, when) + +} + +func (db *DB) STTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(SetType, key) +} + +func (db *DB) SPersist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.setBatch + t.Lock() + defer t.Unlock() + + n, err := db.rmExpire(t, SetType, key) + if err != nil { + return 0, err + } + err = t.Commit() + return n, err +} + +func (db *DB) SScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(SSizeType, key, count, inclusive, match) +} diff --git a/vendor/github.com/lunny/nodb/t_ttl.go b/vendor/github.com/lunny/nodb/t_ttl.go new file mode 100644 index 000000000000..5c3638891c00 --- /dev/null +++ b/vendor/github.com/lunny/nodb/t_ttl.go @@ -0,0 +1,195 @@ +package nodb + +import ( + "encoding/binary" + "errors" + "time" + + "github.com/lunny/nodb/store" +) + +var ( + errExpMetaKey = errors.New("invalid expire meta key") + errExpTimeKey = errors.New("invalid expire time key") +) + +type retireCallback func(*batch, []byte) int64 + +type elimination struct { + db *DB + exp2Tx []*batch + exp2Retire []retireCallback +} + +var errExpType = errors.New("invalid expire type") + +func (db *DB) expEncodeTimeKey(dataType byte, key []byte, when int64) []byte { + buf := make([]byte, len(key)+11) + + buf[0] = db.index + buf[1] = ExpTimeType + buf[2] = dataType + pos := 3 + + binary.BigEndian.PutUint64(buf[pos:], uint64(when)) + pos += 8 + + copy(buf[pos:], key) + + return buf +} + +func (db *DB) expEncodeMetaKey(dataType byte, key []byte) []byte { + buf := make([]byte, len(key)+3) + + buf[0] = db.index + buf[1] = ExpMetaType + buf[2] = dataType + pos := 3 + + copy(buf[pos:], key) + + return buf +} + +func (db *DB) expDecodeMetaKey(mk []byte) (byte, []byte, error) { + if len(mk) <= 3 || mk[0] != db.index || mk[1] != ExpMetaType { + return 0, nil, errExpMetaKey + } + + return mk[2], mk[3:], nil +} + +func (db *DB) expDecodeTimeKey(tk []byte) (byte, []byte, int64, error) { + if len(tk) < 11 || tk[0] != db.index || tk[1] != ExpTimeType { + return 0, nil, 0, errExpTimeKey + } + + return tk[2], tk[11:], 
int64(binary.BigEndian.Uint64(tk[3:])), nil +} + +func (db *DB) expire(t *batch, dataType byte, key []byte, duration int64) { + db.expireAt(t, dataType, key, time.Now().Unix()+duration) +} + +func (db *DB) expireAt(t *batch, dataType byte, key []byte, when int64) { + mk := db.expEncodeMetaKey(dataType, key) + tk := db.expEncodeTimeKey(dataType, key, when) + + t.Put(tk, mk) + t.Put(mk, PutInt64(when)) +} + +func (db *DB) ttl(dataType byte, key []byte) (t int64, err error) { + mk := db.expEncodeMetaKey(dataType, key) + + if t, err = Int64(db.bucket.Get(mk)); err != nil || t == 0 { + t = -1 + } else { + t -= time.Now().Unix() + if t <= 0 { + t = -1 + } + // if t == -1 : to remove ???? + } + + return t, err +} + +func (db *DB) rmExpire(t *batch, dataType byte, key []byte) (int64, error) { + mk := db.expEncodeMetaKey(dataType, key) + if v, err := db.bucket.Get(mk); err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else if when, err2 := Int64(v, nil); err2 != nil { + return 0, err2 + } else { + tk := db.expEncodeTimeKey(dataType, key, when) + t.Delete(mk) + t.Delete(tk) + return 1, nil + } +} + +func (db *DB) expFlush(t *batch, dataType byte) (err error) { + minKey := make([]byte, 3) + minKey[0] = db.index + minKey[1] = ExpTimeType + minKey[2] = dataType + + maxKey := make([]byte, 3) + maxKey[0] = db.index + maxKey[1] = ExpMetaType + maxKey[2] = dataType + 1 + + _, err = db.flushRegion(t, minKey, maxKey) + err = t.Commit() + return +} + +////////////////////////////////////////////////////////// +// +////////////////////////////////////////////////////////// + +func newEliminator(db *DB) *elimination { + eli := new(elimination) + eli.db = db + eli.exp2Tx = make([]*batch, maxDataType) + eli.exp2Retire = make([]retireCallback, maxDataType) + return eli +} + +func (eli *elimination) regRetireContext(dataType byte, t *batch, onRetire retireCallback) { + + // todo .. need to ensure exist - mapExpMetaType[expType] + + eli.exp2Tx[dataType] = t + eli.exp2Retire[dataType] = onRetire +} + +// call by outside ... 
(from *db to another *db) +func (eli *elimination) active() { + now := time.Now().Unix() + db := eli.db + dbGet := db.bucket.Get + + minKey := db.expEncodeTimeKey(NoneType, nil, 0) + maxKey := db.expEncodeTimeKey(maxDataType, nil, now) + + it := db.bucket.RangeLimitIterator(minKey, maxKey, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + tk := it.RawKey() + mk := it.RawValue() + + dt, k, _, err := db.expDecodeTimeKey(tk) + if err != nil { + continue + } + + t := eli.exp2Tx[dt] + onRetire := eli.exp2Retire[dt] + if tk == nil || onRetire == nil { + continue + } + + t.Lock() + + if exp, err := Int64(dbGet(mk)); err == nil { + // check expire again + if exp <= now { + onRetire(t, k) + t.Delete(tk) + t.Delete(mk) + + t.Commit() + } + + } + + t.Unlock() + } + it.Close() + + return +} diff --git a/vendor/github.com/lunny/nodb/t_zset.go b/vendor/github.com/lunny/nodb/t_zset.go new file mode 100644 index 000000000000..d0ffb7ccf324 --- /dev/null +++ b/vendor/github.com/lunny/nodb/t_zset.go @@ -0,0 +1,943 @@ +package nodb + +import ( + "bytes" + "encoding/binary" + "errors" + "time" + + "github.com/lunny/nodb/store" +) + +const ( + MinScore int64 = -1<<63 + 1 + MaxScore int64 = 1<<63 - 1 + InvalidScore int64 = -1 << 63 + + AggregateSum byte = 0 + AggregateMin byte = 1 + AggregateMax byte = 2 +) + +type ScorePair struct { + Score int64 + Member []byte +} + +var errZSizeKey = errors.New("invalid zsize key") +var errZSetKey = errors.New("invalid zset key") +var errZScoreKey = errors.New("invalid zscore key") +var errScoreOverflow = errors.New("zset score overflow") +var errInvalidAggregate = errors.New("invalid aggregate") +var errInvalidWeightNum = errors.New("invalid weight number") +var errInvalidSrcKeyNum = errors.New("invalid src key number") + +const ( + zsetNScoreSep byte = '<' + zsetPScoreSep byte = zsetNScoreSep + 1 + zsetStopScoreSep byte = zsetPScoreSep + 1 + + zsetStartMemSep byte = ':' + zsetStopMemSep byte = zsetStartMemSep + 1 +) + +func checkZSetKMSize(key []byte, member []byte) error { + if len(key) > MaxKeySize || len(key) == 0 { + return errKeySize + } else if len(member) > MaxZSetMemberSize || len(member) == 0 { + return errZSetMemberSize + } + return nil +} + +func (db *DB) zEncodeSizeKey(key []byte) []byte { + buf := make([]byte, len(key)+2) + buf[0] = db.index + buf[1] = ZSizeType + + copy(buf[2:], key) + return buf +} + +func (db *DB) zDecodeSizeKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != ZSizeType { + return nil, errZSizeKey + } + + return ek[2:], nil +} + +func (db *DB) zEncodeSetKey(key []byte, member []byte) []byte { + buf := make([]byte, len(key)+len(member)+5) + + pos := 0 + buf[pos] = db.index + pos++ + + buf[pos] = ZSetType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + buf[pos] = zsetStartMemSep + pos++ + + copy(buf[pos:], member) + + return buf +} + +func (db *DB) zDecodeSetKey(ek []byte) ([]byte, []byte, error) { + if len(ek) < 5 || ek[0] != db.index || ek[1] != ZSetType { + return nil, nil, errZSetKey + } + + keyLen := int(binary.BigEndian.Uint16(ek[2:])) + if keyLen+5 > len(ek) { + return nil, nil, errZSetKey + } + + key := ek[4 : 4+keyLen] + + if ek[4+keyLen] != zsetStartMemSep { + return nil, nil, errZSetKey + } + + member := ek[5+keyLen:] + return key, member, nil +} + +func (db *DB) zEncodeStartSetKey(key []byte) []byte { + k := db.zEncodeSetKey(key, nil) + return k +} + +func (db *DB) zEncodeStopSetKey(key []byte) []byte { + k := 
db.zEncodeSetKey(key, nil) + k[len(k)-1] = zsetStartMemSep + 1 + return k +} + +func (db *DB) zEncodeScoreKey(key []byte, member []byte, score int64) []byte { + buf := make([]byte, len(key)+len(member)+14) + + pos := 0 + buf[pos] = db.index + pos++ + + buf[pos] = ZScoreType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + if score < 0 { + buf[pos] = zsetNScoreSep + } else { + buf[pos] = zsetPScoreSep + } + + pos++ + binary.BigEndian.PutUint64(buf[pos:], uint64(score)) + pos += 8 + + buf[pos] = zsetStartMemSep + pos++ + + copy(buf[pos:], member) + return buf +} + +func (db *DB) zEncodeStartScoreKey(key []byte, score int64) []byte { + return db.zEncodeScoreKey(key, nil, score) +} + +func (db *DB) zEncodeStopScoreKey(key []byte, score int64) []byte { + k := db.zEncodeScoreKey(key, nil, score) + k[len(k)-1] = zsetStopMemSep + return k +} + +func (db *DB) zDecodeScoreKey(ek []byte) (key []byte, member []byte, score int64, err error) { + if len(ek) < 14 || ek[0] != db.index || ek[1] != ZScoreType { + err = errZScoreKey + return + } + + keyLen := int(binary.BigEndian.Uint16(ek[2:])) + if keyLen+14 > len(ek) { + err = errZScoreKey + return + } + + key = ek[4 : 4+keyLen] + pos := 4 + keyLen + + if (ek[pos] != zsetNScoreSep) && (ek[pos] != zsetPScoreSep) { + err = errZScoreKey + return + } + pos++ + + score = int64(binary.BigEndian.Uint64(ek[pos:])) + pos += 8 + + if ek[pos] != zsetStartMemSep { + err = errZScoreKey + return + } + + pos++ + + member = ek[pos:] + return +} + +func (db *DB) zSetItem(t *batch, key []byte, score int64, member []byte) (int64, error) { + if score <= MinScore || score >= MaxScore { + return 0, errScoreOverflow + } + + var exists int64 = 0 + ek := db.zEncodeSetKey(key, member) + + if v, err := db.bucket.Get(ek); err != nil { + return 0, err + } else if v != nil { + exists = 1 + + if s, err := Int64(v, err); err != nil { + return 0, err + } else { + sk := db.zEncodeScoreKey(key, member, s) + t.Delete(sk) + } + } + + t.Put(ek, PutInt64(score)) + + sk := db.zEncodeScoreKey(key, member, score) + t.Put(sk, []byte{}) + + return exists, nil +} + +func (db *DB) zDelItem(t *batch, key []byte, member []byte, skipDelScore bool) (int64, error) { + ek := db.zEncodeSetKey(key, member) + if v, err := db.bucket.Get(ek); err != nil { + return 0, err + } else if v == nil { + //not exists + return 0, nil + } else { + //exists + if !skipDelScore { + //we must del score + if s, err := Int64(v, err); err != nil { + return 0, err + } else { + sk := db.zEncodeScoreKey(key, member, s) + t.Delete(sk) + } + } + } + + t.Delete(ek) + + return 1, nil +} + +func (db *DB) zDelete(t *batch, key []byte) int64 { + delMembCnt, _ := db.zRemRange(t, key, MinScore, MaxScore, 0, -1) + // todo : log err + return delMembCnt +} + +func (db *DB) zExpireAt(key []byte, when int64) (int64, error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + if zcnt, err := db.ZCard(key); err != nil || zcnt == 0 { + return 0, err + } else { + db.expireAt(t, ZSetType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) ZAdd(key []byte, args ...ScorePair) (int64, error) { + if len(args) == 0 { + return 0, nil + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + var num int64 = 0 + for i := 0; i < len(args); i++ { + score := args[i].Score + member := args[i].Member + + if err := checkZSetKMSize(key, member); err != nil { + return 0, err + } + + if n, err := db.zSetItem(t, key, score, member); 
err != nil { + return 0, err + } else if n == 0 { + //add new + num++ + } + } + + if _, err := db.zIncrSize(t, key, num); err != nil { + return 0, err + } + + //todo add binlog + err := t.Commit() + return num, err +} + +func (db *DB) zIncrSize(t *batch, key []byte, delta int64) (int64, error) { + sk := db.zEncodeSizeKey(key) + + size, err := Int64(db.bucket.Get(sk)) + if err != nil { + return 0, err + } else { + size += delta + if size <= 0 { + size = 0 + t.Delete(sk) + db.rmExpire(t, ZSetType, key) + } else { + t.Put(sk, PutInt64(size)) + } + } + + return size, nil +} + +func (db *DB) ZCard(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + sk := db.zEncodeSizeKey(key) + return Int64(db.bucket.Get(sk)) +} + +func (db *DB) ZScore(key []byte, member []byte) (int64, error) { + if err := checkZSetKMSize(key, member); err != nil { + return InvalidScore, err + } + + var score int64 = InvalidScore + + k := db.zEncodeSetKey(key, member) + if v, err := db.bucket.Get(k); err != nil { + return InvalidScore, err + } else if v == nil { + return InvalidScore, ErrScoreMiss + } else { + if score, err = Int64(v, nil); err != nil { + return InvalidScore, err + } + } + + return score, nil +} + +func (db *DB) ZRem(key []byte, members ...[]byte) (int64, error) { + if len(members) == 0 { + return 0, nil + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + var num int64 = 0 + for i := 0; i < len(members); i++ { + if err := checkZSetKMSize(key, members[i]); err != nil { + return 0, err + } + + if n, err := db.zDelItem(t, key, members[i], false); err != nil { + return 0, err + } else if n == 1 { + num++ + } + } + + if _, err := db.zIncrSize(t, key, -num); err != nil { + return 0, err + } + + err := t.Commit() + return num, err +} + +func (db *DB) ZIncrBy(key []byte, delta int64, member []byte) (int64, error) { + if err := checkZSetKMSize(key, member); err != nil { + return InvalidScore, err + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + ek := db.zEncodeSetKey(key, member) + + var oldScore int64 = 0 + v, err := db.bucket.Get(ek) + if err != nil { + return InvalidScore, err + } else if v == nil { + db.zIncrSize(t, key, 1) + } else { + if oldScore, err = Int64(v, err); err != nil { + return InvalidScore, err + } + } + + newScore := oldScore + delta + if newScore >= MaxScore || newScore <= MinScore { + return InvalidScore, errScoreOverflow + } + + sk := db.zEncodeScoreKey(key, member, newScore) + t.Put(sk, []byte{}) + t.Put(ek, PutInt64(newScore)) + + if v != nil { + // so as to update score, we must delete the old one + oldSk := db.zEncodeScoreKey(key, member, oldScore) + t.Delete(oldSk) + } + + err = t.Commit() + return newScore, err +} + +func (db *DB) ZCount(key []byte, min int64, max int64) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + minKey := db.zEncodeStartScoreKey(key, min) + maxKey := db.zEncodeStopScoreKey(key, max) + + rangeType := store.RangeROpen + + it := db.bucket.RangeLimitIterator(minKey, maxKey, rangeType, 0, -1) + var n int64 = 0 + for ; it.Valid(); it.Next() { + n++ + } + it.Close() + + return n, nil +} + +func (db *DB) zrank(key []byte, member []byte, reverse bool) (int64, error) { + if err := checkZSetKMSize(key, member); err != nil { + return 0, err + } + + k := db.zEncodeSetKey(key, member) + + it := db.bucket.NewIterator() + defer it.Close() + + if v := it.Find(k); v == nil { + return -1, nil + } else { + if s, err := Int64(v, nil); err != nil { + return 0, err + } else { + var rit 
*store.RangeLimitIterator + + sk := db.zEncodeScoreKey(key, member, s) + + if !reverse { + minKey := db.zEncodeStartScoreKey(key, MinScore) + + rit = store.NewRangeIterator(it, &store.Range{minKey, sk, store.RangeClose}) + } else { + maxKey := db.zEncodeStopScoreKey(key, MaxScore) + rit = store.NewRevRangeIterator(it, &store.Range{sk, maxKey, store.RangeClose}) + } + + var lastKey []byte = nil + var n int64 = 0 + + for ; rit.Valid(); rit.Next() { + n++ + + lastKey = rit.BufKey(lastKey) + } + + if _, m, _, err := db.zDecodeScoreKey(lastKey); err == nil && bytes.Equal(m, member) { + n-- + return n, nil + } + } + } + + return -1, nil +} + +func (db *DB) zIterator(key []byte, min int64, max int64, offset int, count int, reverse bool) *store.RangeLimitIterator { + minKey := db.zEncodeStartScoreKey(key, min) + maxKey := db.zEncodeStopScoreKey(key, max) + + if !reverse { + return db.bucket.RangeLimitIterator(minKey, maxKey, store.RangeClose, offset, count) + } else { + return db.bucket.RevRangeLimitIterator(minKey, maxKey, store.RangeClose, offset, count) + } +} + +func (db *DB) zRemRange(t *batch, key []byte, min int64, max int64, offset int, count int) (int64, error) { + if len(key) > MaxKeySize { + return 0, errKeySize + } + + it := db.zIterator(key, min, max, offset, count, false) + var num int64 = 0 + for ; it.Valid(); it.Next() { + sk := it.RawKey() + _, m, _, err := db.zDecodeScoreKey(sk) + if err != nil { + continue + } + + if n, err := db.zDelItem(t, key, m, true); err != nil { + return 0, err + } else if n == 1 { + num++ + } + + t.Delete(sk) + } + it.Close() + + if _, err := db.zIncrSize(t, key, -num); err != nil { + return 0, err + } + + return num, nil +} + +func (db *DB) zRange(key []byte, min int64, max int64, offset int, count int, reverse bool) ([]ScorePair, error) { + if len(key) > MaxKeySize { + return nil, errKeySize + } + + if offset < 0 { + return []ScorePair{}, nil + } + + nv := 64 + if count > 0 { + nv = count + } + + v := make([]ScorePair, 0, nv) + + var it *store.RangeLimitIterator + + //if reverse and offset is 0, count < 0, we may use forward iterator then reverse + //because store iterator prev is slower than next + if !reverse || (offset == 0 && count < 0) { + it = db.zIterator(key, min, max, offset, count, false) + } else { + it = db.zIterator(key, min, max, offset, count, true) + } + + for ; it.Valid(); it.Next() { + _, m, s, err := db.zDecodeScoreKey(it.Key()) + //may be we will check key equal? 
+ if err != nil { + continue + } + + v = append(v, ScorePair{Member: m, Score: s}) + } + it.Close() + + if reverse && (offset == 0 && count < 0) { + for i, j := 0, len(v)-1; i < j; i, j = i+1, j-1 { + v[i], v[j] = v[j], v[i] + } + } + + return v, nil +} + +func (db *DB) zParseLimit(key []byte, start int, stop int) (offset int, count int, err error) { + if start < 0 || stop < 0 { + //refer redis implementation + var size int64 + size, err = db.ZCard(key) + if err != nil { + return + } + + llen := int(size) + + if start < 0 { + start = llen + start + } + if stop < 0 { + stop = llen + stop + } + + if start < 0 { + start = 0 + } + + if start >= llen { + offset = -1 + return + } + } + + if start > stop { + offset = -1 + return + } + + offset = start + count = (stop - start) + 1 + return +} + +func (db *DB) ZClear(key []byte) (int64, error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + rmCnt, err := db.zRemRange(t, key, MinScore, MaxScore, 0, -1) + if err == nil { + err = t.Commit() + } + + return rmCnt, err +} + +func (db *DB) ZMclear(keys ...[]byte) (int64, error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + for _, key := range keys { + if _, err := db.zRemRange(t, key, MinScore, MaxScore, 0, -1); err != nil { + return 0, err + } + } + + err := t.Commit() + + return int64(len(keys)), err +} + +func (db *DB) ZRange(key []byte, start int, stop int) ([]ScorePair, error) { + return db.ZRangeGeneric(key, start, stop, false) +} + +//min and max must be inclusive +//if no limit, set offset = 0 and count = -1 +func (db *DB) ZRangeByScore(key []byte, min int64, max int64, + offset int, count int) ([]ScorePair, error) { + return db.ZRangeByScoreGeneric(key, min, max, offset, count, false) +} + +func (db *DB) ZRank(key []byte, member []byte) (int64, error) { + return db.zrank(key, member, false) +} + +func (db *DB) ZRemRangeByRank(key []byte, start int, stop int) (int64, error) { + offset, count, err := db.zParseLimit(key, start, stop) + if err != nil { + return 0, err + } + + var rmCnt int64 + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + rmCnt, err = db.zRemRange(t, key, MinScore, MaxScore, offset, count) + if err == nil { + err = t.Commit() + } + + return rmCnt, err +} + +//min and max must be inclusive +func (db *DB) ZRemRangeByScore(key []byte, min int64, max int64) (int64, error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + rmCnt, err := db.zRemRange(t, key, min, max, 0, -1) + if err == nil { + err = t.Commit() + } + + return rmCnt, err +} + +func (db *DB) ZRevRange(key []byte, start int, stop int) ([]ScorePair, error) { + return db.ZRangeGeneric(key, start, stop, true) +} + +func (db *DB) ZRevRank(key []byte, member []byte) (int64, error) { + return db.zrank(key, member, true) +} + +//min and max must be inclusive +//if no limit, set offset = 0 and count = -1 +func (db *DB) ZRevRangeByScore(key []byte, min int64, max int64, offset int, count int) ([]ScorePair, error) { + return db.ZRangeByScoreGeneric(key, min, max, offset, count, true) +} + +func (db *DB) ZRangeGeneric(key []byte, start int, stop int, reverse bool) ([]ScorePair, error) { + offset, count, err := db.zParseLimit(key, start, stop) + if err != nil { + return nil, err + } + + return db.zRange(key, MinScore, MaxScore, offset, count, reverse) +} + +//min and max must be inclusive +//if no limit, set offset = 0 and count = -1 +func (db *DB) ZRangeByScoreGeneric(key []byte, min int64, max int64, + offset int, count int, reverse bool) ([]ScorePair, error) { + + return db.zRange(key, min, max, offset, 
count, reverse) +} + +func (db *DB) zFlush() (drop int64, err error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + return db.flushType(t, ZSetType) +} + +func (db *DB) ZExpire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.zExpireAt(key, time.Now().Unix()+duration) +} + +func (db *DB) ZExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.zExpireAt(key, when) +} + +func (db *DB) ZTTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(ZSetType, key) +} + +func (db *DB) ZPersist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + n, err := db.rmExpire(t, ZSetType, key) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} + +func getAggregateFunc(aggregate byte) func(int64, int64) int64 { + switch aggregate { + case AggregateSum: + return func(a int64, b int64) int64 { + return a + b + } + case AggregateMax: + return func(a int64, b int64) int64 { + if a > b { + return a + } + return b + } + case AggregateMin: + return func(a int64, b int64) int64 { + if a > b { + return b + } + return a + } + } + return nil +} + +func (db *DB) ZUnionStore(destKey []byte, srcKeys [][]byte, weights []int64, aggregate byte) (int64, error) { + + var destMap = map[string]int64{} + aggregateFunc := getAggregateFunc(aggregate) + if aggregateFunc == nil { + return 0, errInvalidAggregate + } + if len(srcKeys) < 1 { + return 0, errInvalidSrcKeyNum + } + if weights != nil { + if len(srcKeys) != len(weights) { + return 0, errInvalidWeightNum + } + } else { + weights = make([]int64, len(srcKeys)) + for i := 0; i < len(weights); i++ { + weights[i] = 1 + } + } + + for i, key := range srcKeys { + scorePairs, err := db.ZRange(key, 0, -1) + if err != nil { + return 0, err + } + for _, pair := range scorePairs { + if score, ok := destMap[String(pair.Member)]; !ok { + destMap[String(pair.Member)] = pair.Score * weights[i] + } else { + destMap[String(pair.Member)] = aggregateFunc(score, pair.Score*weights[i]) + } + } + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + db.zDelete(t, destKey) + + for member, score := range destMap { + if err := checkZSetKMSize(destKey, []byte(member)); err != nil { + return 0, err + } + + if _, err := db.zSetItem(t, destKey, score, []byte(member)); err != nil { + return 0, err + } + } + + var num = int64(len(destMap)) + sk := db.zEncodeSizeKey(destKey) + t.Put(sk, PutInt64(num)) + + //todo add binlog + if err := t.Commit(); err != nil { + return 0, err + } + return num, nil +} + +func (db *DB) ZInterStore(destKey []byte, srcKeys [][]byte, weights []int64, aggregate byte) (int64, error) { + + aggregateFunc := getAggregateFunc(aggregate) + if aggregateFunc == nil { + return 0, errInvalidAggregate + } + if len(srcKeys) < 1 { + return 0, errInvalidSrcKeyNum + } + if weights != nil { + if len(srcKeys) != len(weights) { + return 0, errInvalidWeightNum + } + } else { + weights = make([]int64, len(srcKeys)) + for i := 0; i < len(weights); i++ { + weights[i] = 1 + } + } + + var destMap = map[string]int64{} + scorePairs, err := db.ZRange(srcKeys[0], 0, -1) + if err != nil { + return 0, err + } + for _, pair := range scorePairs { + destMap[String(pair.Member)] = pair.Score * weights[0] + } + + for i, key := range srcKeys[1:] { + scorePairs, err := db.ZRange(key, 0, -1) + if err 
!= nil { + return 0, err + } + tmpMap := map[string]int64{} + for _, pair := range scorePairs { + if score, ok := destMap[String(pair.Member)]; ok { + tmpMap[String(pair.Member)] = aggregateFunc(score, pair.Score*weights[i+1]) + } + } + destMap = tmpMap + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + db.zDelete(t, destKey) + + for member, score := range destMap { + if err := checkZSetKMSize(destKey, []byte(member)); err != nil { + return 0, err + } + if _, err := db.zSetItem(t, destKey, score, []byte(member)); err != nil { + return 0, err + } + } + + var num int64 = int64(len(destMap)) + sk := db.zEncodeSizeKey(destKey) + t.Put(sk, PutInt64(num)) + //todo add binlog + if err := t.Commit(); err != nil { + return 0, err + } + return num, nil +} + +func (db *DB) ZScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(ZSizeType, key, count, inclusive, match) +} diff --git a/vendor/github.com/lunny/nodb/tx.go b/vendor/github.com/lunny/nodb/tx.go new file mode 100644 index 000000000000..5ce99db57ab7 --- /dev/null +++ b/vendor/github.com/lunny/nodb/tx.go @@ -0,0 +1,113 @@ +package nodb + +import ( + "errors" + "fmt" + + "github.com/lunny/nodb/store" +) + +var ( + ErrNestTx = errors.New("nest transaction not supported") + ErrTxDone = errors.New("Transaction has already been committed or rolled back") +) + +type Tx struct { + *DB + + tx *store.Tx + + logs [][]byte +} + +func (db *DB) IsTransaction() bool { + return db.status == DBInTransaction +} + +// Begin a transaction, it will block all other write operations before calling Commit or Rollback. +// You must be very careful to prevent long-time transaction. +func (db *DB) Begin() (*Tx, error) { + if db.IsTransaction() { + return nil, ErrNestTx + } + + tx := new(Tx) + + tx.DB = new(DB) + tx.DB.l = db.l + + tx.l.wLock.Lock() + + tx.DB.sdb = db.sdb + + var err error + tx.tx, err = db.sdb.Begin() + if err != nil { + tx.l.wLock.Unlock() + return nil, err + } + + tx.DB.bucket = tx.tx + + tx.DB.status = DBInTransaction + + tx.DB.index = db.index + + tx.DB.kvBatch = tx.newBatch() + tx.DB.listBatch = tx.newBatch() + tx.DB.hashBatch = tx.newBatch() + tx.DB.zsetBatch = tx.newBatch() + tx.DB.binBatch = tx.newBatch() + tx.DB.setBatch = tx.newBatch() + + return tx, nil +} + +func (tx *Tx) Commit() error { + if tx.tx == nil { + return ErrTxDone + } + + tx.l.commitLock.Lock() + err := tx.tx.Commit() + tx.tx = nil + + if len(tx.logs) > 0 { + tx.l.binlog.Log(tx.logs...) 
+ } + + tx.l.commitLock.Unlock() + + tx.l.wLock.Unlock() + + tx.DB.bucket = nil + + return err +} + +func (tx *Tx) Rollback() error { + if tx.tx == nil { + return ErrTxDone + } + + err := tx.tx.Rollback() + tx.tx = nil + + tx.l.wLock.Unlock() + tx.DB.bucket = nil + + return err +} + +func (tx *Tx) newBatch() *batch { + return tx.l.newBatch(tx.tx.NewWriteBatch(), &txBatchLocker{}, tx) +} + +func (tx *Tx) Select(index int) error { + if index < 0 || index >= int(MaxDBNumber) { + return fmt.Errorf("invalid db index %d", index) + } + + tx.DB.index = uint8(index) + return nil +} diff --git a/vendor/github.com/lunny/nodb/util.go b/vendor/github.com/lunny/nodb/util.go new file mode 100644 index 000000000000..d5949a96e6dd --- /dev/null +++ b/vendor/github.com/lunny/nodb/util.go @@ -0,0 +1,113 @@ +package nodb + +import ( + "encoding/binary" + "errors" + "reflect" + "strconv" + "unsafe" +) + +var errIntNumber = errors.New("invalid integer") + +// no copy to change slice to string +// use your own risk +func String(b []byte) (s string) { + pbytes := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + pstring := (*reflect.StringHeader)(unsafe.Pointer(&s)) + pstring.Data = pbytes.Data + pstring.Len = pbytes.Len + return +} + +// no copy to change string to slice +// use your own risk +func Slice(s string) (b []byte) { + pbytes := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + pstring := (*reflect.StringHeader)(unsafe.Pointer(&s)) + pbytes.Data = pstring.Data + pbytes.Len = pstring.Len + pbytes.Cap = pstring.Len + return +} + +func Int64(v []byte, err error) (int64, error) { + if err != nil { + return 0, err + } else if v == nil || len(v) == 0 { + return 0, nil + } else if len(v) != 8 { + return 0, errIntNumber + } + + return int64(binary.LittleEndian.Uint64(v)), nil +} + +func PutInt64(v int64) []byte { + var b []byte + pbytes := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + pbytes.Data = uintptr(unsafe.Pointer(&v)) + pbytes.Len = 8 + pbytes.Cap = 8 + return b +} + +func StrInt64(v []byte, err error) (int64, error) { + if err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else { + return strconv.ParseInt(String(v), 10, 64) + } +} + +func StrInt32(v []byte, err error) (int32, error) { + if err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else { + res, err := strconv.ParseInt(String(v), 10, 32) + return int32(res), err + } +} + +func StrInt8(v []byte, err error) (int8, error) { + if err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else { + res, err := strconv.ParseInt(String(v), 10, 8) + return int8(res), err + } +} + +func StrPutInt64(v int64) []byte { + return strconv.AppendInt(nil, v, 10) +} + +func MinUInt32(a uint32, b uint32) uint32 { + if a > b { + return b + } else { + return a + } +} + +func MaxUInt32(a uint32, b uint32) uint32 { + if a > b { + return a + } else { + return b + } +} + +func MaxInt32(a int32, b int32) int32 { + if a > b { + return a + } else { + return b + } +} diff --git a/vendor/github.com/pkg/errors/LICENSE b/vendor/github.com/pkg/errors/LICENSE new file mode 100644 index 000000000000..835ba3e755ce --- /dev/null +++ b/vendor/github.com/pkg/errors/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) 2015, Dave Cheney +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/pkg/errors/errors.go b/vendor/github.com/pkg/errors/errors.go new file mode 100644 index 000000000000..7421f326ffe8 --- /dev/null +++ b/vendor/github.com/pkg/errors/errors.go @@ -0,0 +1,282 @@ +// Package errors provides simple error handling primitives. +// +// The traditional error handling idiom in Go is roughly akin to +// +// if err != nil { +// return err +// } +// +// which when applied recursively up the call stack results in error reports +// without context or debugging information. The errors package allows +// programmers to add context to the failure path in their code in a way +// that does not destroy the original value of the error. +// +// Adding context to an error +// +// The errors.Wrap function returns a new error that adds context to the +// original error by recording a stack trace at the point Wrap is called, +// together with the supplied message. For example +// +// _, err := ioutil.ReadAll(r) +// if err != nil { +// return errors.Wrap(err, "read failed") +// } +// +// If additional control is required, the errors.WithStack and +// errors.WithMessage functions destructure errors.Wrap into its component +// operations: annotating an error with a stack trace and with a message, +// respectively. +// +// Retrieving the cause of an error +// +// Using errors.Wrap constructs a stack of errors, adding context to the +// preceding error. Depending on the nature of the error it may be necessary +// to reverse the operation of errors.Wrap to retrieve the original error +// for inspection. Any error value which implements this interface +// +// type causer interface { +// Cause() error +// } +// +// can be inspected by errors.Cause. errors.Cause will recursively retrieve +// the topmost error that does not implement causer, which is assumed to be +// the original cause. For example: +// +// switch err := errors.Cause(err).(type) { +// case *MyError: +// // handle specifically +// default: +// // unknown error +// } +// +// Although the causer interface is not exported by this package, it is +// considered a part of its stable public interface. +// +// Formatted printing of errors +// +// All error values returned from this package implement fmt.Formatter and can +// be formatted by the fmt package. The following verbs are supported: +// +// %s print the error. If the error has a Cause it will be +// printed recursively. +// %v see %s +// %+v extended format. Each Frame of the error's StackTrace will +// be printed in detail. 
+// +// Retrieving the stack trace of an error or wrapper +// +// New, Errorf, Wrap, and Wrapf record a stack trace at the point they are +// invoked. This information can be retrieved with the following interface: +// +// type stackTracer interface { +// StackTrace() errors.StackTrace +// } +// +// The returned errors.StackTrace type is defined as +// +// type StackTrace []Frame +// +// The Frame type represents a call site in the stack trace. Frame supports +// the fmt.Formatter interface that can be used for printing information about +// the stack trace of this error. For example: +// +// if err, ok := err.(stackTracer); ok { +// for _, f := range err.StackTrace() { +// fmt.Printf("%+s:%d", f) +// } +// } +// +// Although the stackTracer interface is not exported by this package, it is +// considered a part of its stable public interface. +// +// See the documentation for Frame.Format for more details. +package errors + +import ( + "fmt" + "io" +) + +// New returns an error with the supplied message. +// New also records the stack trace at the point it was called. +func New(message string) error { + return &fundamental{ + msg: message, + stack: callers(), + } +} + +// Errorf formats according to a format specifier and returns the string +// as a value that satisfies error. +// Errorf also records the stack trace at the point it was called. +func Errorf(format string, args ...interface{}) error { + return &fundamental{ + msg: fmt.Sprintf(format, args...), + stack: callers(), + } +} + +// fundamental is an error that has a message and a stack, but no caller. +type fundamental struct { + msg string + *stack +} + +func (f *fundamental) Error() string { return f.msg } + +func (f *fundamental) Format(s fmt.State, verb rune) { + switch verb { + case 'v': + if s.Flag('+') { + io.WriteString(s, f.msg) + f.stack.Format(s, verb) + return + } + fallthrough + case 's': + io.WriteString(s, f.msg) + case 'q': + fmt.Fprintf(s, "%q", f.msg) + } +} + +// WithStack annotates err with a stack trace at the point WithStack was called. +// If err is nil, WithStack returns nil. +func WithStack(err error) error { + if err == nil { + return nil + } + return &withStack{ + err, + callers(), + } +} + +type withStack struct { + error + *stack +} + +func (w *withStack) Cause() error { return w.error } + +func (w *withStack) Format(s fmt.State, verb rune) { + switch verb { + case 'v': + if s.Flag('+') { + fmt.Fprintf(s, "%+v", w.Cause()) + w.stack.Format(s, verb) + return + } + fallthrough + case 's': + io.WriteString(s, w.Error()) + case 'q': + fmt.Fprintf(s, "%q", w.Error()) + } +} + +// Wrap returns an error annotating err with a stack trace +// at the point Wrap is called, and the supplied message. +// If err is nil, Wrap returns nil. +func Wrap(err error, message string) error { + if err == nil { + return nil + } + err = &withMessage{ + cause: err, + msg: message, + } + return &withStack{ + err, + callers(), + } +} + +// Wrapf returns an error annotating err with a stack trace +// at the point Wrapf is called, and the format specifier. +// If err is nil, Wrapf returns nil. +func Wrapf(err error, format string, args ...interface{}) error { + if err == nil { + return nil + } + err = &withMessage{ + cause: err, + msg: fmt.Sprintf(format, args...), + } + return &withStack{ + err, + callers(), + } +} + +// WithMessage annotates err with a new message. +// If err is nil, WithMessage returns nil. 
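+//
+// A minimal usage sketch (hypothetical caller code; readConfig and path are
+// placeholders, not part of this package):
+//
+//	cfg, err := readConfig(path)
+//	if err != nil {
+//		return errors.WithMessage(err, "reading config")
+//	}
+//	use(cfg)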
+func WithMessage(err error, message string) error { + if err == nil { + return nil + } + return &withMessage{ + cause: err, + msg: message, + } +} + +// WithMessagef annotates err with the format specifier. +// If err is nil, WithMessagef returns nil. +func WithMessagef(err error, format string, args ...interface{}) error { + if err == nil { + return nil + } + return &withMessage{ + cause: err, + msg: fmt.Sprintf(format, args...), + } +} + +type withMessage struct { + cause error + msg string +} + +func (w *withMessage) Error() string { return w.msg + ": " + w.cause.Error() } +func (w *withMessage) Cause() error { return w.cause } + +func (w *withMessage) Format(s fmt.State, verb rune) { + switch verb { + case 'v': + if s.Flag('+') { + fmt.Fprintf(s, "%+v\n", w.Cause()) + io.WriteString(s, w.msg) + return + } + fallthrough + case 's', 'q': + io.WriteString(s, w.Error()) + } +} + +// Cause returns the underlying cause of the error, if possible. +// An error value has a cause if it implements the following +// interface: +// +// type causer interface { +// Cause() error +// } +// +// If the error does not implement Cause, the original error will +// be returned. If the error is nil, nil will be returned without further +// investigation. +func Cause(err error) error { + type causer interface { + Cause() error + } + + for err != nil { + cause, ok := err.(causer) + if !ok { + break + } + err = cause.Cause() + } + return err +} diff --git a/vendor/github.com/pkg/errors/stack.go b/vendor/github.com/pkg/errors/stack.go new file mode 100644 index 000000000000..2874a048cf3e --- /dev/null +++ b/vendor/github.com/pkg/errors/stack.go @@ -0,0 +1,147 @@ +package errors + +import ( + "fmt" + "io" + "path" + "runtime" + "strings" +) + +// Frame represents a program counter inside a stack frame. +type Frame uintptr + +// pc returns the program counter for this frame; +// multiple frames may have the same PC value. +func (f Frame) pc() uintptr { return uintptr(f) - 1 } + +// file returns the full path to the file that contains the +// function for this Frame's pc. +func (f Frame) file() string { + fn := runtime.FuncForPC(f.pc()) + if fn == nil { + return "unknown" + } + file, _ := fn.FileLine(f.pc()) + return file +} + +// line returns the line number of source code of the +// function for this Frame's pc. +func (f Frame) line() int { + fn := runtime.FuncForPC(f.pc()) + if fn == nil { + return 0 + } + _, line := fn.FileLine(f.pc()) + return line +} + +// Format formats the frame according to the fmt.Formatter interface. +// +// %s source file +// %d source line +// %n function name +// %v equivalent to %s:%d +// +// Format accepts flags that alter the printing of some verbs, as follows: +// +// %+s function name and path of source file relative to the compile time +// GOPATH separated by \n\t (\n\t) +// %+v equivalent to %+s:%d +func (f Frame) Format(s fmt.State, verb rune) { + switch verb { + case 's': + switch { + case s.Flag('+'): + pc := f.pc() + fn := runtime.FuncForPC(pc) + if fn == nil { + io.WriteString(s, "unknown") + } else { + file, _ := fn.FileLine(pc) + fmt.Fprintf(s, "%s\n\t%s", fn.Name(), file) + } + default: + io.WriteString(s, path.Base(f.file())) + } + case 'd': + fmt.Fprintf(s, "%d", f.line()) + case 'n': + name := runtime.FuncForPC(f.pc()).Name() + io.WriteString(s, funcname(name)) + case 'v': + f.Format(s, 's') + io.WriteString(s, ":") + f.Format(s, 'd') + } +} + +// StackTrace is stack of Frames from innermost (newest) to outermost (oldest). 
+type StackTrace []Frame + +// Format formats the stack of Frames according to the fmt.Formatter interface. +// +// %s lists source files for each Frame in the stack +// %v lists the source file and line number for each Frame in the stack +// +// Format accepts flags that alter the printing of some verbs, as follows: +// +// %+v Prints filename, function, and line number for each Frame in the stack. +func (st StackTrace) Format(s fmt.State, verb rune) { + switch verb { + case 'v': + switch { + case s.Flag('+'): + for _, f := range st { + fmt.Fprintf(s, "\n%+v", f) + } + case s.Flag('#'): + fmt.Fprintf(s, "%#v", []Frame(st)) + default: + fmt.Fprintf(s, "%v", []Frame(st)) + } + case 's': + fmt.Fprintf(s, "%s", []Frame(st)) + } +} + +// stack represents a stack of program counters. +type stack []uintptr + +func (s *stack) Format(st fmt.State, verb rune) { + switch verb { + case 'v': + switch { + case st.Flag('+'): + for _, pc := range *s { + f := Frame(pc) + fmt.Fprintf(st, "\n%+v", f) + } + } + } +} + +func (s *stack) StackTrace() StackTrace { + f := make([]Frame, len(*s)) + for i := 0; i < len(f); i++ { + f[i] = Frame((*s)[i]) + } + return f +} + +func callers() *stack { + const depth = 32 + var pcs [depth]uintptr + n := runtime.Callers(3, pcs[:]) + var st stack = pcs[0:n] + return &st +} + +// funcname removes the path prefix component of a function's name reported by func.Name(). +func funcname(name string) string { + i := strings.LastIndex(name, "/") + name = name[i+1:] + i = strings.Index(name, ".") + return name[i+1:] +} diff --git a/vendor/github.com/siddontang/go-snappy/AUTHORS b/vendor/github.com/siddontang/go-snappy/AUTHORS new file mode 100644 index 000000000000..8ddb5b7a2b9c --- /dev/null +++ b/vendor/github.com/siddontang/go-snappy/AUTHORS @@ -0,0 +1,12 @@ +# This is the official list of Snappy-Go authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. + +# Names should be added to this file as +# Name or Organization +# The email address is not required for organizations. + +# Please keep the list sorted. + +Google Inc. +Jan Mercl <0xjnml@gmail.com> diff --git a/vendor/github.com/siddontang/go-snappy/CONTRIBUTORS b/vendor/github.com/siddontang/go-snappy/CONTRIBUTORS new file mode 100644 index 000000000000..50b69c80ea9b --- /dev/null +++ b/vendor/github.com/siddontang/go-snappy/CONTRIBUTORS @@ -0,0 +1,34 @@ +# This is the official list of people who can contribute +# (and typically have contributed) code to the Snappy-Go repository. +# The AUTHORS file lists the copyright holders; this file +# lists people. For example, Google employees are listed here +# but not in AUTHORS, because Google holds the copyright. +# +# The submission process automatically checks to make sure +# that people submitting code are listed in this file (by email address). +# +# Names should be added to this file only after verifying that +# the individual or the individual's organization has agreed to +# the appropriate Contributor License Agreement, found here: +# +# http://code.google.com/legal/individual-cla-v1.0.html +# http://code.google.com/legal/corporate-cla-v1.0.html +# +# The agreement for individuals can be filled out on the web. +# +# When adding J Random Contributor's name to this file, +# either J's name or J's organization's name should be +# added to the AUTHORS file, depending on whether the +# individual or corporate CLA was used. + +# Names should be added to this file like so: +# Name + +# Please keep the list sorted. 
+ +Jan Mercl <0xjnml@gmail.com> +Kai Backman +Marc-Antoine Ruel +Nigel Tao +Rob Pike +Russ Cox diff --git a/vendor/github.com/siddontang/go-snappy/LICENSE b/vendor/github.com/siddontang/go-snappy/LICENSE new file mode 100644 index 000000000000..6050c10f4c8b --- /dev/null +++ b/vendor/github.com/siddontang/go-snappy/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/siddontang/go-snappy/snappy/decode.go b/vendor/github.com/siddontang/go-snappy/snappy/decode.go new file mode 100644 index 000000000000..d93c1b9dbfd7 --- /dev/null +++ b/vendor/github.com/siddontang/go-snappy/snappy/decode.go @@ -0,0 +1,124 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "encoding/binary" + "errors" +) + +// ErrCorrupt reports that the input is invalid. +var ErrCorrupt = errors.New("snappy: corrupt input") + +// DecodedLen returns the length of the decoded block. +func DecodedLen(src []byte) (int, error) { + v, _, err := decodedLen(src) + return v, err +} + +// decodedLen returns the length of the decoded block and the number of bytes +// that the length header occupied. +func decodedLen(src []byte) (blockLen, headerLen int, err error) { + v, n := binary.Uvarint(src) + if n == 0 { + return 0, 0, ErrCorrupt + } + if uint64(int(v)) != v { + return 0, 0, errors.New("snappy: decoded block is too large") + } + return int(v), n, nil +} + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// It is valid to pass a nil dst. 
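+//
+// A small round-trip sketch (hypothetical input; Encode is defined in
+// encode.go of this package):
+//
+//	compressed, _ := snappy.Encode(nil, []byte("hello hello hello"))
+//	plain, err := snappy.Decode(nil, compressed)
+//	if err != nil {
+//		// handle ErrCorrupt or a length error
+//	}
+//	_ = plain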
+func Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if len(dst) < dLen { + dst = make([]byte, dLen) + } + + var d, offset, length int + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint(src[s] >> 2) + switch { + case x < 60: + s += 1 + case x == 60: + s += 2 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-1]) + case x == 61: + s += 3 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-2]) | uint(src[s-1])<<8 + case x == 62: + s += 4 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-3]) | uint(src[s-2])<<8 | uint(src[s-1])<<16 + case x == 63: + s += 5 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-4]) | uint(src[s-3])<<8 | uint(src[s-2])<<16 | uint(src[s-1])<<24 + } + length = int(x + 1) + if length <= 0 { + return nil, errors.New("snappy: unsupported literal length") + } + if length > len(dst)-d || length > len(src)-s { + return nil, ErrCorrupt + } + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if s > len(src) { + return nil, ErrCorrupt + } + length = 4 + int(src[s-2])>>2&0x7 + offset = int(src[s-2])&0xe0<<3 | int(src[s-1]) + + case tagCopy2: + s += 3 + if s > len(src) { + return nil, ErrCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(src[s-2]) | int(src[s-1])<<8 + + case tagCopy4: + return nil, errors.New("snappy: unsupported COPY_4 tag") + } + + end := d + length + if offset > d || end > len(dst) { + return nil, ErrCorrupt + } + for ; d < end; d++ { + dst[d] = dst[d-offset] + } + } + if d != dLen { + return nil, ErrCorrupt + } + return dst[:d], nil +} diff --git a/vendor/github.com/siddontang/go-snappy/snappy/encode.go b/vendor/github.com/siddontang/go-snappy/snappy/encode.go new file mode 100644 index 000000000000..b2371db11c8f --- /dev/null +++ b/vendor/github.com/siddontang/go-snappy/snappy/encode.go @@ -0,0 +1,174 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "encoding/binary" +) + +// We limit how far copy back-references can go, the same as the C++ code. +const maxOffset = 1 << 15 + +// emitLiteral writes a literal chunk and returns the number of bytes written. +func emitLiteral(dst, lit []byte) int { + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[0] = 60<<2 | tagLiteral + dst[1] = uint8(n) + i = 2 + case n < 1<<16: + dst[0] = 61<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + i = 3 + case n < 1<<24: + dst[0] = 62<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + dst[3] = uint8(n >> 16) + i = 4 + case int64(n) < 1<<32: + dst[0] = 63<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + dst[3] = uint8(n >> 16) + dst[4] = uint8(n >> 24) + i = 5 + default: + panic("snappy: source buffer is too long") + } + if copy(dst[i:], lit) != len(lit) { + panic("snappy: destination buffer is too short") + } + return i + len(lit) +} + +// emitCopy writes a copy chunk and returns the number of bytes written. 
+func emitCopy(dst []byte, offset, length int) int { + i := 0 + for length > 0 { + x := length - 4 + if 0 <= x && x < 1<<3 && offset < 1<<11 { + dst[i+0] = uint8(offset>>8)&0x07<<5 | uint8(x)<<2 | tagCopy1 + dst[i+1] = uint8(offset) + i += 2 + break + } + + x = length + if x > 1<<6 { + x = 1 << 6 + } + dst[i+0] = uint8(x-1)<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + i += 3 + length -= x + } + return i +} + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// It is valid to pass a nil dst. +func Encode(dst, src []byte) ([]byte, error) { + if n := MaxEncodedLen(len(src)); len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + // Return early if src is short. + if len(src) <= 4 { + if len(src) != 0 { + d += emitLiteral(dst[d:], src) + } + return dst[:d], nil + } + + // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. + const maxTableSize = 1 << 14 + shift, tableSize := uint(32-8), 1<<8 + for tableSize < maxTableSize && tableSize < len(src) { + shift-- + tableSize *= 2 + } + var table [maxTableSize]int + + // Iterate over the source bytes. + var ( + s int // The iterator position. + t int // The last position with the same hash as s. + lit int // The start position of any pending literal bytes. + ) + for s+3 < len(src) { + // Update the hash table. + b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3] + h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24 + p := &table[(h*0x1e35a7bd)>>shift] + // We need to to store values in [-1, inf) in table. To save + // some initialization time, (re)use the table's zero value + // and shift the values against this zero: add 1 on writes, + // subtract 1 on reads. + t, *p = *p-1, s+1 + // If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte. + if t < 0 || s-t >= maxOffset || b0 != src[t] || b1 != src[t+1] || b2 != src[t+2] || b3 != src[t+3] { + s++ + continue + } + // Otherwise, we have a match. First, emit any pending literal bytes. + if lit != s { + d += emitLiteral(dst[d:], src[lit:s]) + } + // Extend the match to be as long as possible. + s0 := s + s, t = s+4, t+4 + for s < len(src) && src[s] == src[t] { + s++ + t++ + } + // Emit the copied bytes. + d += emitCopy(dst[d:], s-t, s-s0) + lit = s + } + + // Emit any final pending literal bytes and return. + if lit != len(src) { + d += emitLiteral(dst[d:], src[lit:]) + } + return dst[:d], nil +} + +// MaxEncodedLen returns the maximum length of a snappy block, given its +// uncompressed length. +func MaxEncodedLen(srcLen int) int { + // Compressed data can be defined as: + // compressed := item* literal* + // item := literal* copy + // + // The trailing literal sequence has a space blowup of at most 62/60 + // since a literal of length 60 needs one tag byte + one extra byte + // for length information. + // + // Item blowup is trickier to measure. Suppose the "copy" op copies + // 4 bytes of data. Because of a special check in the encoding code, + // we produce a 4-byte copy only if the offset is < 65536. Therefore + // the copy op takes 3 bytes to encode, and this type of item leads + // to at most the 62/60 blowup for representing literals. + // + // Suppose the "copy" op copies 5 bytes of data. 
If the offset is big + // enough, it will take 5 bytes to encode the copy op. Therefore the + // worst case here is a one-byte literal followed by a five-byte copy. + // That is, 6 bytes of input turn into 7 bytes of "compressed" data. + // + // This last factor dominates the blowup, so the final estimate is: + return 32 + srcLen + srcLen/6 +} diff --git a/vendor/github.com/siddontang/go-snappy/snappy/snappy.go b/vendor/github.com/siddontang/go-snappy/snappy/snappy.go new file mode 100644 index 000000000000..2f1b790d0b71 --- /dev/null +++ b/vendor/github.com/siddontang/go-snappy/snappy/snappy.go @@ -0,0 +1,38 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package snappy implements the snappy block-based compression format. +// It aims for very high speeds and reasonable compression. +// +// The C++ snappy implementation is at http://code.google.com/p/snappy/ +package snappy + +/* +Each encoded block begins with the varint-encoded length of the decoded data, +followed by a sequence of chunks. Chunks begin and end on byte boundaries. The +first byte of each chunk is broken into its 2 least and 6 most significant bits +called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. +Zero means a literal tag. All other values mean a copy tag. + +For literal tags: + - If m < 60, the next 1 + m bytes are literal bytes. + - Otherwise, let n be the little-endian unsigned integer denoted by the next + m - 59 bytes. The next 1 + n bytes after that are literal bytes. + +For copy tags, length bytes are copied from offset bytes ago, in the style of +Lempel-Ziv compression algorithms. In particular: + - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). + The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 + of the offset. The next byte is bits 0-7 of the offset. + - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). + The length is 1 + m. The offset is the little-endian unsigned integer + denoted by the next 2 bytes. + - For l == 3, this tag is a legacy format that is no longer supported. +*/ +const ( + tagLiteral = 0x00 + tagCopy1 = 0x01 + tagCopy2 = 0x02 + tagCopy4 = 0x03 +) diff --git a/vendor/github.com/siddontang/go/LICENSE b/vendor/github.com/siddontang/go/LICENSE new file mode 100644 index 000000000000..80511a0a784d --- /dev/null +++ b/vendor/github.com/siddontang/go/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2014 siddontang + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/siddontang/go/bson/LICENSE b/vendor/github.com/siddontang/go/bson/LICENSE new file mode 100644 index 000000000000..890326017b85 --- /dev/null +++ b/vendor/github.com/siddontang/go/bson/LICENSE @@ -0,0 +1,25 @@ +BSON library for Go + +Copyright (c) 2010-2012 - Gustavo Niemeyer + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/siddontang/go/filelock/LICENSE b/vendor/github.com/siddontang/go/filelock/LICENSE new file mode 100644 index 000000000000..fec05ce12959 --- /dev/null +++ b/vendor/github.com/siddontang/go/filelock/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The LevelDB-Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/siddontang/go/filelock/file_lock_generic.go b/vendor/github.com/siddontang/go/filelock/file_lock_generic.go new file mode 100644 index 000000000000..53c292acbdff --- /dev/null +++ b/vendor/github.com/siddontang/go/filelock/file_lock_generic.go @@ -0,0 +1,17 @@ +// Copyright 2012 The LevelDB-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!windows + +package filelock + +import ( + "fmt" + "io" + "runtime" +) + +func Lock(name string) (io.Closer, error) { + return nil, fmt.Errorf("leveldb/db: file locking is not implemented on %s/%s", runtime.GOOS, runtime.GOARCH) +} diff --git a/vendor/github.com/siddontang/go/filelock/file_lock_solaris.go b/vendor/github.com/siddontang/go/filelock/file_lock_solaris.go new file mode 100644 index 000000000000..56ff3e2ceef2 --- /dev/null +++ b/vendor/github.com/siddontang/go/filelock/file_lock_solaris.go @@ -0,0 +1,43 @@ +// Copyright 2014 The LevelDB-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build solaris + +package filelock + +import ( + "io" + "os" + "syscall" +) + +// lockCloser hides all of an os.File's methods, except for Close. +type lockCloser struct { + f *os.File +} + +func (l lockCloser) Close() error { + return l.f.Close() +} + +func Lock(name string) (io.Closer, error) { + f, err := os.Create(name) + if err != nil { + return nil, err + } + + spec := syscall.Flock_t{ + Type: syscall.F_WRLCK, + Whence: int16(os.SEEK_SET), + Start: 0, + Len: 0, // 0 means to lock the entire file. + Pid: int32(os.Getpid()), + } + if err := syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &spec); err != nil { + f.Close() + return nil, err + } + + return lockCloser{f}, nil +} diff --git a/vendor/github.com/siddontang/go/filelock/file_lock_unix.go b/vendor/github.com/siddontang/go/filelock/file_lock_unix.go new file mode 100644 index 000000000000..f70ae6192c59 --- /dev/null +++ b/vendor/github.com/siddontang/go/filelock/file_lock_unix.go @@ -0,0 +1,51 @@ +// Copyright 2014 The LevelDB-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd linux netbsd openbsd + +package filelock + +import ( + "io" + "os" + "syscall" +) + +// lockCloser hides all of an os.File's methods, except for Close. 
+type lockCloser struct { + f *os.File +} + +func (l lockCloser) Close() error { + return l.f.Close() +} + +func Lock(name string) (io.Closer, error) { + f, err := os.Create(name) + if err != nil { + return nil, err + } + + /* + Some people tell me FcntlFlock does not exist, so use flock here + */ + if err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil { + f.Close() + return nil, err + } + + // spec := syscall.Flock_t{ + // Type: syscall.F_WRLCK, + // Whence: int16(os.SEEK_SET), + // Start: 0, + // Len: 0, // 0 means to lock the entire file. + // Pid: int32(os.Getpid()), + // } + // if err := syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &spec); err != nil { + // f.Close() + // return nil, err + // } + + return lockCloser{f}, nil +} diff --git a/vendor/github.com/siddontang/go/filelock/file_lock_windows.go b/vendor/github.com/siddontang/go/filelock/file_lock_windows.go new file mode 100644 index 000000000000..5d3e4ba2029a --- /dev/null +++ b/vendor/github.com/siddontang/go/filelock/file_lock_windows.go @@ -0,0 +1,36 @@ +// Copyright 2013 The LevelDB-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package filelock + +import ( + "io" + "syscall" +) + +// lockCloser hides all of an syscall.Handle's methods, except for Close. +type lockCloser struct { + fd syscall.Handle +} + +func (l lockCloser) Close() error { + return syscall.Close(l.fd) +} + +func Lock(name string) (io.Closer, error) { + p, err := syscall.UTF16PtrFromString(name) + if err != nil { + return nil, err + } + fd, err := syscall.CreateFile(p, + syscall.GENERIC_READ|syscall.GENERIC_WRITE, + 0, nil, syscall.CREATE_ALWAYS, + syscall.FILE_ATTRIBUTE_NORMAL, + 0, + ) + if err != nil { + return nil, err + } + return lockCloser{fd: fd}, nil +} diff --git a/vendor/github.com/siddontang/go/hack/hack.go b/vendor/github.com/siddontang/go/hack/hack.go new file mode 100644 index 000000000000..74ee83cbf5d5 --- /dev/null +++ b/vendor/github.com/siddontang/go/hack/hack.go @@ -0,0 +1,27 @@ +package hack + +import ( + "reflect" + "unsafe" +) + +// no copy to change slice to string +// use your own risk +func String(b []byte) (s string) { + pbytes := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + pstring := (*reflect.StringHeader)(unsafe.Pointer(&s)) + pstring.Data = pbytes.Data + pstring.Len = pbytes.Len + return +} + +// no copy to change string to slice +// use your own risk +func Slice(s string) (b []byte) { + pbytes := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + pstring := (*reflect.StringHeader)(unsafe.Pointer(&s)) + pbytes.Data = pstring.Data + pbytes.Len = pstring.Len + pbytes.Cap = pstring.Len + return +} diff --git a/vendor/github.com/siddontang/go/ioutil2/ioutil.go b/vendor/github.com/siddontang/go/ioutil2/ioutil.go new file mode 100644 index 000000000000..35c0ad3cad51 --- /dev/null +++ b/vendor/github.com/siddontang/go/ioutil2/ioutil.go @@ -0,0 +1,39 @@ +// Copyright 2012, Google Inc. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ioutil2 + +import ( + "io" + "io/ioutil" + "os" + "path" +) + +// Write file to temp and atomically move when everything else succeeds. 
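+//
+// A minimal usage sketch (hypothetical path and payload):
+//
+//	data := []byte(`{"ok": true}`)
+//	if err := ioutil2.WriteFileAtomic("/tmp/state.json", data, 0644); err != nil {
+//		// the temp file has already been removed; just report the error
+//		return err
+//	}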
+func WriteFileAtomic(filename string, data []byte, perm os.FileMode) error { + dir, name := path.Dir(filename), path.Base(filename) + f, err := ioutil.TempFile(dir, name) + if err != nil { + return err + } + n, err := f.Write(data) + f.Close() + if err == nil && n < len(data) { + err = io.ErrShortWrite + } else { + err = os.Chmod(f.Name(), perm) + } + if err != nil { + os.Remove(f.Name()) + return err + } + return os.Rename(f.Name(), filename) +} + +// Check file exists or not +func FileExists(name string) bool { + _, err := os.Stat(name) + return !os.IsNotExist(err) +} diff --git a/vendor/github.com/siddontang/go/ioutil2/sectionwriter.go b/vendor/github.com/siddontang/go/ioutil2/sectionwriter.go new file mode 100644 index 000000000000..c02ab0d5fd1f --- /dev/null +++ b/vendor/github.com/siddontang/go/ioutil2/sectionwriter.go @@ -0,0 +1,69 @@ +package ioutil2 + +import ( + "errors" + "io" +) + +var ErrExceedLimit = errors.New("write exceed limit") + +func NewSectionWriter(w io.WriterAt, off int64, n int64) *SectionWriter { + return &SectionWriter{w, off, off, off + n} +} + +type SectionWriter struct { + w io.WriterAt + base int64 + off int64 + limit int64 +} + +func (s *SectionWriter) Write(p []byte) (n int, err error) { + if s.off >= s.limit { + return 0, ErrExceedLimit + } + + if max := s.limit - s.off; int64(len(p)) > max { + return 0, ErrExceedLimit + } + + n, err = s.w.WriteAt(p, s.off) + s.off += int64(n) + return +} + +var errWhence = errors.New("Seek: invalid whence") +var errOffset = errors.New("Seek: invalid offset") + +func (s *SectionWriter) Seek(offset int64, whence int) (int64, error) { + switch whence { + default: + return 0, errWhence + case 0: + offset += s.base + case 1: + offset += s.off + case 2: + offset += s.limit + } + if offset < s.base { + return 0, errOffset + } + s.off = offset + return offset - s.base, nil +} + +func (s *SectionWriter) WriteAt(p []byte, off int64) (n int, err error) { + if off < 0 || off >= s.limit-s.base { + return 0, errOffset + } + off += s.base + if max := s.limit - off; int64(len(p)) > max { + return 0, ErrExceedLimit + } + + return s.w.WriteAt(p, off) +} + +// Size returns the size of the section in bytes. +func (s *SectionWriter) Size() int64 { return s.limit - s.base } diff --git a/vendor/github.com/siddontang/go/log/doc.go b/vendor/github.com/siddontang/go/log/doc.go new file mode 100644 index 000000000000..81a60ee853bc --- /dev/null +++ b/vendor/github.com/siddontang/go/log/doc.go @@ -0,0 +1,21 @@ +// log package supplies more advanced features than go orign log package. +// +// It supports log different level: trace, debug, info, warn, error, fatal. +// +// It also supports different log handlers which you can log to stdout, file, socket, etc... +// +// Use +// +// import "github.com/siddontang/go/log" +// +// //log with different level +// log.Info("hello world") +// log.Error("hello world") +// +// //create a logger with specified handler +// h := NewStreamHandler(os.Stdout) +// l := log.NewDefault(h) +// l.Info("hello world") +// l.Infof("%s %d", "hello", 123) +// +package log diff --git a/vendor/github.com/siddontang/go/log/filehandler.go b/vendor/github.com/siddontang/go/log/filehandler.go new file mode 100644 index 000000000000..2c158e2cf569 --- /dev/null +++ b/vendor/github.com/siddontang/go/log/filehandler.go @@ -0,0 +1,221 @@ +package log + +import ( + "fmt" + "os" + "path" + "time" +) + +//FileHandler writes log to a file. 
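+//
+//A short usage sketch (hypothetical log path; NewDefault is declared in log.go
+//of this package):
+//
+//	h, err := NewFileHandler("/var/log/app.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND)
+//	if err != nil {
+//		return err
+//	}
+//	defer h.Close()
+//	l := NewDefault(h)
+//	l.Info("file handler ready")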
+type FileHandler struct { + fd *os.File +} + +func NewFileHandler(fileName string, flag int) (*FileHandler, error) { + dir := path.Dir(fileName) + os.Mkdir(dir, 0777) + + f, err := os.OpenFile(fileName, flag, 0) + if err != nil { + return nil, err + } + + h := new(FileHandler) + + h.fd = f + + return h, nil +} + +func (h *FileHandler) Write(b []byte) (n int, err error) { + return h.fd.Write(b) +} + +func (h *FileHandler) Close() error { + return h.fd.Close() +} + +//RotatingFileHandler writes log a file, if file size exceeds maxBytes, +//it will backup current file and open a new one. +// +//max backup file number is set by backupCount, it will delete oldest if backups too many. +type RotatingFileHandler struct { + fd *os.File + + fileName string + maxBytes int + curBytes int + backupCount int +} + +func NewRotatingFileHandler(fileName string, maxBytes int, backupCount int) (*RotatingFileHandler, error) { + dir := path.Dir(fileName) + os.MkdirAll(dir, 0777) + + h := new(RotatingFileHandler) + + if maxBytes <= 0 { + return nil, fmt.Errorf("invalid max bytes") + } + + h.fileName = fileName + h.maxBytes = maxBytes + h.backupCount = backupCount + + var err error + h.fd, err = os.OpenFile(fileName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + return nil, err + } + + f, err := h.fd.Stat() + if err != nil { + return nil, err + } + h.curBytes = int(f.Size()) + + return h, nil +} + +func (h *RotatingFileHandler) Write(p []byte) (n int, err error) { + h.doRollover() + n, err = h.fd.Write(p) + h.curBytes += n + return +} + +func (h *RotatingFileHandler) Close() error { + if h.fd != nil { + return h.fd.Close() + } + return nil +} + +func (h *RotatingFileHandler) doRollover() { + + if h.curBytes < h.maxBytes { + return + } + + f, err := h.fd.Stat() + if err != nil { + return + } + + if h.maxBytes <= 0 { + return + } else if f.Size() < int64(h.maxBytes) { + h.curBytes = int(f.Size()) + return + } + + if h.backupCount > 0 { + h.fd.Close() + + for i := h.backupCount - 1; i > 0; i-- { + sfn := fmt.Sprintf("%s.%d", h.fileName, i) + dfn := fmt.Sprintf("%s.%d", h.fileName, i+1) + + os.Rename(sfn, dfn) + } + + dfn := fmt.Sprintf("%s.1", h.fileName) + os.Rename(h.fileName, dfn) + + h.fd, _ = os.OpenFile(h.fileName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + h.curBytes = 0 + f, err := h.fd.Stat() + if err != nil { + return + } + h.curBytes = int(f.Size()) + } +} + +//TimeRotatingFileHandler writes log to a file, +//it will backup current and open a new one, with a period time you sepecified. +// +//refer: http://docs.python.org/2/library/logging.handlers.html. +//same like python TimedRotatingFileHandler. 
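+//
+//Illustrative usage (a sketch; rotates once per day, appending a 2006-01-02
+//style date suffix to the rotated file):
+//
+//	h, err := NewTimeRotatingFileHandler("/tmp/app.log", WhenDay, 1)
+//	if err != nil {
+//		return err
+//	}
+//	l := NewDefault(h)
+//	l.Info("rotated daily")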
+type TimeRotatingFileHandler struct { + fd *os.File + + baseName string + interval int64 + suffix string + rolloverAt int64 +} + +const ( + WhenSecond = iota + WhenMinute + WhenHour + WhenDay +) + +func NewTimeRotatingFileHandler(baseName string, when int8, interval int) (*TimeRotatingFileHandler, error) { + dir := path.Dir(baseName) + os.Mkdir(dir, 0777) + + h := new(TimeRotatingFileHandler) + + h.baseName = baseName + + switch when { + case WhenSecond: + h.interval = 1 + h.suffix = "2006-01-02_15-04-05" + case WhenMinute: + h.interval = 60 + h.suffix = "2006-01-02_15-04" + case WhenHour: + h.interval = 3600 + h.suffix = "2006-01-02_15" + case WhenDay: + h.interval = 3600 * 24 + h.suffix = "2006-01-02" + default: + return nil, fmt.Errorf("invalid when_rotate: %d", when) + } + + h.interval = h.interval * int64(interval) + + var err error + h.fd, err = os.OpenFile(h.baseName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + return nil, err + } + + fInfo, _ := h.fd.Stat() + h.rolloverAt = fInfo.ModTime().Unix() + h.interval + + return h, nil +} + +func (h *TimeRotatingFileHandler) doRollover() { + //refer http://hg.python.org/cpython/file/2.7/Lib/logging/handlers.py + now := time.Now() + + if h.rolloverAt <= now.Unix() { + fName := h.baseName + now.Format(h.suffix) + h.fd.Close() + e := os.Rename(h.baseName, fName) + if e != nil { + panic(e) + } + + h.fd, _ = os.OpenFile(h.baseName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + + h.rolloverAt = time.Now().Unix() + h.interval + } +} + +func (h *TimeRotatingFileHandler) Write(b []byte) (n int, err error) { + h.doRollover() + return h.fd.Write(b) +} + +func (h *TimeRotatingFileHandler) Close() error { + return h.fd.Close() +} diff --git a/vendor/github.com/siddontang/go/log/handler.go b/vendor/github.com/siddontang/go/log/handler.go new file mode 100644 index 000000000000..4dc086f45c0a --- /dev/null +++ b/vendor/github.com/siddontang/go/log/handler.go @@ -0,0 +1,48 @@ +package log + +import ( + "io" +) + +//Handler writes logs to somewhere +type Handler interface { + Write(p []byte) (n int, err error) + Close() error +} + +//StreamHandler writes logs to a specified io Writer, maybe stdout, stderr, etc... +type StreamHandler struct { + w io.Writer +} + +func NewStreamHandler(w io.Writer) (*StreamHandler, error) { + h := new(StreamHandler) + + h.w = w + + return h, nil +} + +func (h *StreamHandler) Write(b []byte) (n int, err error) { + return h.w.Write(b) +} + +func (h *StreamHandler) Close() error { + return nil +} + +//NullHandler does nothing, it discards anything. +type NullHandler struct { +} + +func NewNullHandler() (*NullHandler, error) { + return new(NullHandler), nil +} + +func (h *NullHandler) Write(b []byte) (n int, err error) { + return len(b), nil +} + +func (h *NullHandler) Close() { + +} diff --git a/vendor/github.com/siddontang/go/log/log.go b/vendor/github.com/siddontang/go/log/log.go new file mode 100644 index 000000000000..371f6016871e --- /dev/null +++ b/vendor/github.com/siddontang/go/log/log.go @@ -0,0 +1,343 @@ +package log + +import ( + "fmt" + "os" + "runtime" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" +) + +//log level, from low to high, more high means more serious +const ( + LevelTrace = iota + LevelDebug + LevelInfo + LevelWarn + LevelError + LevelFatal +) + +const ( + Ltime = 1 << iota //time format "2006/01/02 15:04:05" + Lfile //file.go:123 + Llevel //[Trace|Debug|Info...] 
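+	//NewDefault below enables all three: Ltime|Lfile|Llevel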
+) + +var LevelName [6]string = [6]string{"Trace", "Debug", "Info", "Warn", "Error", "Fatal"} + +const TimeFormat = "2006/01/02 15:04:05" + +const maxBufPoolSize = 16 + +type atomicInt32 int32 + +func (i *atomicInt32) Set(n int) { + atomic.StoreInt32((*int32)(i), int32(n)) +} + +func (i *atomicInt32) Get() int { + return int(atomic.LoadInt32((*int32)(i))) +} + +type Logger struct { + level atomicInt32 + flag int + + hMutex sync.Mutex + handler Handler + + bufMutex sync.Mutex + bufs [][]byte + + closed atomicInt32 +} + +//new a logger with specified handler and flag +func New(handler Handler, flag int) *Logger { + var l = new(Logger) + + l.level.Set(LevelInfo) + l.handler = handler + + l.flag = flag + + l.closed.Set(0) + + l.bufs = make([][]byte, 0, 16) + + return l +} + +//new a default logger with specified handler and flag: Ltime|Lfile|Llevel +func NewDefault(handler Handler) *Logger { + return New(handler, Ltime|Lfile|Llevel) +} + +func newStdHandler() *StreamHandler { + h, _ := NewStreamHandler(os.Stdout) + return h +} + +var std = NewDefault(newStdHandler()) + +func (l *Logger) popBuf() []byte { + l.bufMutex.Lock() + var buf []byte + if len(l.bufs) == 0 { + buf = make([]byte, 0, 1024) + } else { + buf = l.bufs[len(l.bufs)-1] + l.bufs = l.bufs[0 : len(l.bufs)-1] + } + l.bufMutex.Unlock() + + return buf +} + +func (l *Logger) putBuf(buf []byte) { + l.bufMutex.Lock() + if len(l.bufs) < maxBufPoolSize { + buf = buf[0:0] + l.bufs = append(l.bufs, buf) + } + l.bufMutex.Unlock() +} + +func (l *Logger) Close() { + if l.closed.Get() == 1 { + return + } + l.closed.Set(1) + + l.handler.Close() +} + +//set log level, any log level less than it will not log +func (l *Logger) SetLevel(level int) { + l.level.Set(level) +} + +// name can be in ["trace", "debug", "info", "warn", "error", "fatal"] +func (l *Logger) SetLevelByName(name string) { + name = strings.ToLower(name) + switch name { + case "trace": + l.SetLevel(LevelTrace) + case "debug": + l.SetLevel(LevelDebug) + case "info": + l.SetLevel(LevelInfo) + case "warn": + l.SetLevel(LevelWarn) + case "error": + l.SetLevel(LevelError) + case "fatal": + l.SetLevel(LevelFatal) + } +} + +func (l *Logger) SetHandler(h Handler) { + if l.closed.Get() == 1 { + return + } + + l.hMutex.Lock() + if l.handler != nil { + l.handler.Close() + } + l.handler = h + l.hMutex.Unlock() +} + +func (l *Logger) Output(callDepth int, level int, format string, v ...interface{}) { + if l.closed.Get() == 1 { + // closed + return + } + + if l.level.Get() > level { + // higher level can be logged + return + } + + var s string + if format == "" { + s = fmt.Sprint(v...) + } else { + s = fmt.Sprintf(format, v...) + } + + buf := l.popBuf() + + if l.flag&Ltime > 0 { + now := time.Now().Format(TimeFormat) + buf = append(buf, '[') + buf = append(buf, now...) + buf = append(buf, "] "...) + } + + if l.flag&Lfile > 0 { + _, file, line, ok := runtime.Caller(callDepth) + if !ok { + file = "???" + line = 0 + } else { + for i := len(file) - 1; i > 0; i-- { + if file[i] == '/' { + file = file[i+1:] + break + } + } + } + + buf = append(buf, file...) + buf = append(buf, ':') + + buf = strconv.AppendInt(buf, int64(line), 10) + buf = append(buf, ' ') + } + + if l.flag&Llevel > 0 { + buf = append(buf, '[') + buf = append(buf, LevelName[level]...) + buf = append(buf, "] "...) + } + + buf = append(buf, s...) 
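+	// make sure the line ends with a newline before it reaches the handler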
+ + if s[len(s)-1] != '\n' { + buf = append(buf, '\n') + } + + // l.msg <- buf + + l.hMutex.Lock() + l.handler.Write(buf) + l.hMutex.Unlock() + l.putBuf(buf) +} + +//log with Trace level +func (l *Logger) Trace(v ...interface{}) { + l.Output(2, LevelTrace, "", v...) +} + +//log with Debug level +func (l *Logger) Debug(v ...interface{}) { + l.Output(2, LevelDebug, "", v...) +} + +//log with info level +func (l *Logger) Info(v ...interface{}) { + l.Output(2, LevelInfo, "", v...) +} + +//log with warn level +func (l *Logger) Warn(v ...interface{}) { + l.Output(2, LevelWarn, "", v...) +} + +//log with error level +func (l *Logger) Error(v ...interface{}) { + l.Output(2, LevelError, "", v...) +} + +//log with fatal level +func (l *Logger) Fatal(v ...interface{}) { + l.Output(2, LevelFatal, "", v...) +} + +//log with Trace level +func (l *Logger) Tracef(format string, v ...interface{}) { + l.Output(2, LevelTrace, format, v...) +} + +//log with Debug level +func (l *Logger) Debugf(format string, v ...interface{}) { + l.Output(2, LevelDebug, format, v...) +} + +//log with info level +func (l *Logger) Infof(format string, v ...interface{}) { + l.Output(2, LevelInfo, format, v...) +} + +//log with warn level +func (l *Logger) Warnf(format string, v ...interface{}) { + l.Output(2, LevelWarn, format, v...) +} + +//log with error level +func (l *Logger) Errorf(format string, v ...interface{}) { + l.Output(2, LevelError, format, v...) +} + +//log with fatal level +func (l *Logger) Fatalf(format string, v ...interface{}) { + l.Output(2, LevelFatal, format, v...) +} + +func SetLevel(level int) { + std.SetLevel(level) +} + +// name can be in ["trace", "debug", "info", "warn", "error", "fatal"] +func SetLevelByName(name string) { + std.SetLevelByName(name) +} + +func SetHandler(h Handler) { + std.SetHandler(h) +} + +func Trace(v ...interface{}) { + std.Output(2, LevelTrace, "", v...) +} + +func Debug(v ...interface{}) { + std.Output(2, LevelDebug, "", v...) +} + +func Info(v ...interface{}) { + std.Output(2, LevelInfo, "", v...) +} + +func Warn(v ...interface{}) { + std.Output(2, LevelWarn, "", v...) +} + +func Error(v ...interface{}) { + std.Output(2, LevelError, "", v...) +} + +func Fatal(v ...interface{}) { + std.Output(2, LevelFatal, "", v...) +} + +func Tracef(format string, v ...interface{}) { + std.Output(2, LevelTrace, format, v...) +} + +func Debugf(format string, v ...interface{}) { + std.Output(2, LevelDebug, format, v...) +} + +func Infof(format string, v ...interface{}) { + std.Output(2, LevelInfo, format, v...) +} + +func Warnf(format string, v ...interface{}) { + std.Output(2, LevelWarn, format, v...) +} + +func Errorf(format string, v ...interface{}) { + std.Output(2, LevelError, format, v...) +} + +func Fatalf(format string, v ...interface{}) { + std.Output(2, LevelFatal, format, v...) +} diff --git a/vendor/github.com/siddontang/go/log/sockethandler.go b/vendor/github.com/siddontang/go/log/sockethandler.go new file mode 100644 index 000000000000..3e7494d9501b --- /dev/null +++ b/vendor/github.com/siddontang/go/log/sockethandler.go @@ -0,0 +1,65 @@ +package log + +import ( + "encoding/binary" + "net" + "time" +) + +//SocketHandler writes log to a connectionl. +//Network protocol is simple: log length + log | log length + log. log length is uint32, bigendian. +//you must implement your own log server, maybe you can use logd instead simply. 
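+//
+//A receiving server can decode the frames like this (illustrative sketch only;
+//conn is a net.Conn accepted by the server, not part of this package):
+//
+//	var size uint32
+//	if err := binary.Read(conn, binary.BigEndian, &size); err != nil {
+//		return err
+//	}
+//	msg := make([]byte, size)
+//	if _, err := io.ReadFull(conn, msg); err != nil {
+//		return err
+//	}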
+type SocketHandler struct { + c net.Conn + protocol string + addr string +} + +func NewSocketHandler(protocol string, addr string) (*SocketHandler, error) { + s := new(SocketHandler) + + s.protocol = protocol + s.addr = addr + + return s, nil +} + +func (h *SocketHandler) Write(p []byte) (n int, err error) { + if err = h.connect(); err != nil { + return + } + + buf := make([]byte, len(p)+4) + + binary.BigEndian.PutUint32(buf, uint32(len(p))) + + copy(buf[4:], p) + + n, err = h.c.Write(buf) + if err != nil { + h.c.Close() + h.c = nil + } + return +} + +func (h *SocketHandler) Close() error { + if h.c != nil { + h.c.Close() + } + return nil +} + +func (h *SocketHandler) connect() error { + if h.c != nil { + return nil + } + + var err error + h.c, err = net.DialTimeout(h.protocol, h.addr, 20*time.Second) + if err != nil { + return err + } + + return nil +} diff --git a/vendor/github.com/siddontang/go/num/bytes.go b/vendor/github.com/siddontang/go/num/bytes.go new file mode 100644 index 000000000000..1f3def74ac73 --- /dev/null +++ b/vendor/github.com/siddontang/go/num/bytes.go @@ -0,0 +1,67 @@ +package num + +import ( + "encoding/binary" +) + +//all are bigendian format + +func BytesToUint16(b []byte) uint16 { + return binary.BigEndian.Uint16(b) +} + +func Uint16ToBytes(u uint16) []byte { + buf := make([]byte, 2) + binary.BigEndian.PutUint16(buf, u) + return buf +} + +func BytesToUint32(b []byte) uint32 { + return binary.BigEndian.Uint32(b) +} + +func Uint32ToBytes(u uint32) []byte { + buf := make([]byte, 4) + binary.BigEndian.PutUint32(buf, u) + return buf +} + +func BytesToUint64(b []byte) uint64 { + return binary.BigEndian.Uint64(b) +} + +func Uint64ToBytes(u uint64) []byte { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, u) + return buf +} + +func BytesToInt16(b []byte) int16 { + return int16(binary.BigEndian.Uint16(b)) +} + +func Int16ToBytes(u int16) []byte { + buf := make([]byte, 2) + binary.BigEndian.PutUint16(buf, uint16(u)) + return buf +} + +func BytesToInt32(b []byte) int32 { + return int32(binary.BigEndian.Uint32(b)) +} + +func Int32ToBytes(u int32) []byte { + buf := make([]byte, 4) + binary.BigEndian.PutUint32(buf, uint32(u)) + return buf +} + +func BytesToInt64(b []byte) int64 { + return int64(binary.BigEndian.Uint64(b)) +} + +func Int64ToBytes(u int64) []byte { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, uint64(u)) + return buf +} diff --git a/vendor/github.com/siddontang/go/num/cmp.go b/vendor/github.com/siddontang/go/num/cmp.go new file mode 100644 index 000000000000..78f8d4f14af3 --- /dev/null +++ b/vendor/github.com/siddontang/go/num/cmp.go @@ -0,0 +1,161 @@ +package num + +func MinUint(a uint, b uint) uint { + if a > b { + return b + } else { + return a + } +} + +func MaxUint(a uint, b uint) uint { + if a > b { + return a + } else { + return b + } +} + +func MinInt(a int, b int) int { + if a > b { + return b + } else { + return a + } +} + +func MaxInt(a int, b int) int { + if a > b { + return a + } else { + return b + } +} + +func MinUint8(a uint8, b uint8) uint8 { + if a > b { + return b + } else { + return a + } +} + +func MaxUint8(a uint8, b uint8) uint8 { + if a > b { + return a + } else { + return b + } +} + +func MinInt8(a int8, b int8) int8 { + if a > b { + return b + } else { + return a + } +} + +func MaxInt8(a int8, b int8) int8 { + if a > b { + return a + } else { + return b + } +} + +func MinUint16(a uint16, b uint16) uint16 { + if a > b { + return b + } else { + return a + } +} + +func MaxUint16(a uint16, b uint16) uint16 { + if a > 
b { + return a + } else { + return b + } +} + +func MinInt16(a int16, b int16) int16 { + if a > b { + return b + } else { + return a + } +} + +func MaxInt16(a int16, b int16) int16 { + if a > b { + return a + } else { + return b + } +} + +func MinUint32(a uint32, b uint32) uint32 { + if a > b { + return b + } else { + return a + } +} + +func MaxUint32(a uint32, b uint32) uint32 { + if a > b { + return a + } else { + return b + } +} + +func MinInt32(a int32, b int32) int32 { + if a > b { + return b + } else { + return a + } +} + +func MaxInt32(a int32, b int32) int32 { + if a > b { + return a + } else { + return b + } +} + +func MinUint64(a uint64, b uint64) uint64 { + if a > b { + return b + } else { + return a + } +} + +func MaxUint64(a uint64, b uint64) uint64 { + if a > b { + return a + } else { + return b + } +} + +func MinInt64(a int64, b int64) int64 { + if a > b { + return b + } else { + return a + } +} + +func MaxInt64(a int64, b int64) int64 { + if a > b { + return a + } else { + return b + } +} diff --git a/vendor/github.com/siddontang/go/num/str.go b/vendor/github.com/siddontang/go/num/str.go new file mode 100644 index 000000000000..4b304817b86d --- /dev/null +++ b/vendor/github.com/siddontang/go/num/str.go @@ -0,0 +1,157 @@ +package num + +import ( + "strconv" +) + +func ParseUint(s string) (uint, error) { + if v, err := strconv.ParseUint(s, 10, 0); err != nil { + return 0, err + } else { + return uint(v), nil + } +} + +func ParseUint8(s string) (uint8, error) { + if v, err := strconv.ParseUint(s, 10, 8); err != nil { + return 0, err + } else { + return uint8(v), nil + } +} + +func ParseUint16(s string) (uint16, error) { + if v, err := strconv.ParseUint(s, 10, 16); err != nil { + return 0, err + } else { + return uint16(v), nil + } +} + +func ParseUint32(s string) (uint32, error) { + if v, err := strconv.ParseUint(s, 10, 32); err != nil { + return 0, err + } else { + return uint32(v), nil + } +} + +func ParseUint64(s string) (uint64, error) { + return strconv.ParseUint(s, 10, 64) +} + +func ParseInt(s string) (int, error) { + if v, err := strconv.ParseInt(s, 10, 0); err != nil { + return 0, err + } else { + return int(v), nil + } +} + +func ParseInt8(s string) (int8, error) { + if v, err := strconv.ParseInt(s, 10, 8); err != nil { + return 0, err + } else { + return int8(v), nil + } +} + +func ParseInt16(s string) (int16, error) { + if v, err := strconv.ParseInt(s, 10, 16); err != nil { + return 0, err + } else { + return int16(v), nil + } +} + +func ParseInt32(s string) (int32, error) { + if v, err := strconv.ParseInt(s, 10, 32); err != nil { + return 0, err + } else { + return int32(v), nil + } +} + +func ParseInt64(s string) (int64, error) { + return strconv.ParseInt(s, 10, 64) +} + +func FormatInt(v int) string { + return strconv.FormatInt(int64(v), 10) +} + +func FormatInt8(v int8) string { + return strconv.FormatInt(int64(v), 10) +} + +func FormatInt16(v int16) string { + return strconv.FormatInt(int64(v), 10) +} + +func FormatInt32(v int32) string { + return strconv.FormatInt(int64(v), 10) +} + +func FormatInt64(v int64) string { + return strconv.FormatInt(int64(v), 10) +} + +func FormatUint(v uint) string { + return strconv.FormatUint(uint64(v), 10) +} + +func FormatUint8(v uint8) string { + return strconv.FormatUint(uint64(v), 10) +} + +func FormatUint16(v uint16) string { + return strconv.FormatUint(uint64(v), 10) +} + +func FormatUint32(v uint32) string { + return strconv.FormatUint(uint64(v), 10) +} + +func FormatUint64(v uint64) string { + return 
strconv.FormatUint(uint64(v), 10) +} + +func FormatIntToSlice(v int) []byte { + return strconv.AppendInt(nil, int64(v), 10) +} + +func FormatInt8ToSlice(v int8) []byte { + return strconv.AppendInt(nil, int64(v), 10) +} + +func FormatInt16ToSlice(v int16) []byte { + return strconv.AppendInt(nil, int64(v), 10) +} + +func FormatInt32ToSlice(v int32) []byte { + return strconv.AppendInt(nil, int64(v), 10) +} + +func FormatInt64ToSlice(v int64) []byte { + return strconv.AppendInt(nil, int64(v), 10) +} + +func FormatUintToSlice(v uint) []byte { + return strconv.AppendUint(nil, uint64(v), 10) +} + +func FormatUint8ToSlice(v uint8) []byte { + return strconv.AppendUint(nil, uint64(v), 10) +} + +func FormatUint16ToSlice(v uint16) []byte { + return strconv.AppendUint(nil, uint64(v), 10) +} + +func FormatUint32ToSlice(v uint32) []byte { + return strconv.AppendUint(nil, uint64(v), 10) +} + +func FormatUint64ToSlice(v uint64) []byte { + return strconv.AppendUint(nil, uint64(v), 10) +} diff --git a/vendor/github.com/siddontang/go/snappy/LICENSE b/vendor/github.com/siddontang/go/snappy/LICENSE new file mode 100644 index 000000000000..6050c10f4c8b --- /dev/null +++ b/vendor/github.com/siddontang/go/snappy/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/siddontang/go/snappy/decode.go b/vendor/github.com/siddontang/go/snappy/decode.go new file mode 100644 index 000000000000..d93c1b9dbfd7 --- /dev/null +++ b/vendor/github.com/siddontang/go/snappy/decode.go @@ -0,0 +1,124 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "encoding/binary" + "errors" +) + +// ErrCorrupt reports that the input is invalid. +var ErrCorrupt = errors.New("snappy: corrupt input") + +// DecodedLen returns the length of the decoded block. 
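+// Callers can use it to size the destination buffer before calling Decode
+// (illustrative sketch):
+//
+//	n, err := snappy.DecodedLen(compressed)
+//	if err != nil {
+//		return nil, err
+//	}
+//	plain, err := snappy.Decode(make([]byte, n), compressed)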
+func DecodedLen(src []byte) (int, error) { + v, _, err := decodedLen(src) + return v, err +} + +// decodedLen returns the length of the decoded block and the number of bytes +// that the length header occupied. +func decodedLen(src []byte) (blockLen, headerLen int, err error) { + v, n := binary.Uvarint(src) + if n == 0 { + return 0, 0, ErrCorrupt + } + if uint64(int(v)) != v { + return 0, 0, errors.New("snappy: decoded block is too large") + } + return int(v), n, nil +} + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// It is valid to pass a nil dst. +func Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if len(dst) < dLen { + dst = make([]byte, dLen) + } + + var d, offset, length int + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint(src[s] >> 2) + switch { + case x < 60: + s += 1 + case x == 60: + s += 2 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-1]) + case x == 61: + s += 3 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-2]) | uint(src[s-1])<<8 + case x == 62: + s += 4 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-3]) | uint(src[s-2])<<8 | uint(src[s-1])<<16 + case x == 63: + s += 5 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-4]) | uint(src[s-3])<<8 | uint(src[s-2])<<16 | uint(src[s-1])<<24 + } + length = int(x + 1) + if length <= 0 { + return nil, errors.New("snappy: unsupported literal length") + } + if length > len(dst)-d || length > len(src)-s { + return nil, ErrCorrupt + } + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if s > len(src) { + return nil, ErrCorrupt + } + length = 4 + int(src[s-2])>>2&0x7 + offset = int(src[s-2])&0xe0<<3 | int(src[s-1]) + + case tagCopy2: + s += 3 + if s > len(src) { + return nil, ErrCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(src[s-2]) | int(src[s-1])<<8 + + case tagCopy4: + return nil, errors.New("snappy: unsupported COPY_4 tag") + } + + end := d + length + if offset > d || end > len(dst) { + return nil, ErrCorrupt + } + for ; d < end; d++ { + dst[d] = dst[d-offset] + } + } + if d != dLen { + return nil, ErrCorrupt + } + return dst[:d], nil +} diff --git a/vendor/github.com/siddontang/go/snappy/encode.go b/vendor/github.com/siddontang/go/snappy/encode.go new file mode 100644 index 000000000000..b2371db11c8f --- /dev/null +++ b/vendor/github.com/siddontang/go/snappy/encode.go @@ -0,0 +1,174 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "encoding/binary" +) + +// We limit how far copy back-references can go, the same as the C++ code. +const maxOffset = 1 << 15 + +// emitLiteral writes a literal chunk and returns the number of bytes written. 
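+// For example, a 100-byte literal has n = 99, so the chunk header is the two
+// bytes 0xF0 (60<<2|tagLiteral) and 0x63 (99), followed by the 100 literal
+// bytes themselves (a worked instance of the second case below).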
+func emitLiteral(dst, lit []byte) int { + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[0] = 60<<2 | tagLiteral + dst[1] = uint8(n) + i = 2 + case n < 1<<16: + dst[0] = 61<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + i = 3 + case n < 1<<24: + dst[0] = 62<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + dst[3] = uint8(n >> 16) + i = 4 + case int64(n) < 1<<32: + dst[0] = 63<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + dst[3] = uint8(n >> 16) + dst[4] = uint8(n >> 24) + i = 5 + default: + panic("snappy: source buffer is too long") + } + if copy(dst[i:], lit) != len(lit) { + panic("snappy: destination buffer is too short") + } + return i + len(lit) +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +func emitCopy(dst []byte, offset, length int) int { + i := 0 + for length > 0 { + x := length - 4 + if 0 <= x && x < 1<<3 && offset < 1<<11 { + dst[i+0] = uint8(offset>>8)&0x07<<5 | uint8(x)<<2 | tagCopy1 + dst[i+1] = uint8(offset) + i += 2 + break + } + + x = length + if x > 1<<6 { + x = 1 << 6 + } + dst[i+0] = uint8(x-1)<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + i += 3 + length -= x + } + return i +} + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// It is valid to pass a nil dst. +func Encode(dst, src []byte) ([]byte, error) { + if n := MaxEncodedLen(len(src)); len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + // Return early if src is short. + if len(src) <= 4 { + if len(src) != 0 { + d += emitLiteral(dst[d:], src) + } + return dst[:d], nil + } + + // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. + const maxTableSize = 1 << 14 + shift, tableSize := uint(32-8), 1<<8 + for tableSize < maxTableSize && tableSize < len(src) { + shift-- + tableSize *= 2 + } + var table [maxTableSize]int + + // Iterate over the source bytes. + var ( + s int // The iterator position. + t int // The last position with the same hash as s. + lit int // The start position of any pending literal bytes. + ) + for s+3 < len(src) { + // Update the hash table. + b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3] + h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24 + p := &table[(h*0x1e35a7bd)>>shift] + // We need to to store values in [-1, inf) in table. To save + // some initialization time, (re)use the table's zero value + // and shift the values against this zero: add 1 on writes, + // subtract 1 on reads. + t, *p = *p-1, s+1 + // If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte. + if t < 0 || s-t >= maxOffset || b0 != src[t] || b1 != src[t+1] || b2 != src[t+2] || b3 != src[t+3] { + s++ + continue + } + // Otherwise, we have a match. First, emit any pending literal bytes. + if lit != s { + d += emitLiteral(dst[d:], src[lit:s]) + } + // Extend the match to be as long as possible. + s0 := s + s, t = s+4, t+4 + for s < len(src) && src[s] == src[t] { + s++ + t++ + } + // Emit the copied bytes. + d += emitCopy(dst[d:], s-t, s-s0) + lit = s + } + + // Emit any final pending literal bytes and return. 
+ if lit != len(src) { + d += emitLiteral(dst[d:], src[lit:]) + } + return dst[:d], nil +} + +// MaxEncodedLen returns the maximum length of a snappy block, given its +// uncompressed length. +func MaxEncodedLen(srcLen int) int { + // Compressed data can be defined as: + // compressed := item* literal* + // item := literal* copy + // + // The trailing literal sequence has a space blowup of at most 62/60 + // since a literal of length 60 needs one tag byte + one extra byte + // for length information. + // + // Item blowup is trickier to measure. Suppose the "copy" op copies + // 4 bytes of data. Because of a special check in the encoding code, + // we produce a 4-byte copy only if the offset is < 65536. Therefore + // the copy op takes 3 bytes to encode, and this type of item leads + // to at most the 62/60 blowup for representing literals. + // + // Suppose the "copy" op copies 5 bytes of data. If the offset is big + // enough, it will take 5 bytes to encode the copy op. Therefore the + // worst case here is a one-byte literal followed by a five-byte copy. + // That is, 6 bytes of input turn into 7 bytes of "compressed" data. + // + // This last factor dominates the blowup, so the final estimate is: + return 32 + srcLen + srcLen/6 +} diff --git a/vendor/github.com/siddontang/go/snappy/snappy.go b/vendor/github.com/siddontang/go/snappy/snappy.go new file mode 100644 index 000000000000..2f1b790d0b71 --- /dev/null +++ b/vendor/github.com/siddontang/go/snappy/snappy.go @@ -0,0 +1,38 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package snappy implements the snappy block-based compression format. +// It aims for very high speeds and reasonable compression. +// +// The C++ snappy implementation is at http://code.google.com/p/snappy/ +package snappy + +/* +Each encoded block begins with the varint-encoded length of the decoded data, +followed by a sequence of chunks. Chunks begin and end on byte boundaries. The +first byte of each chunk is broken into its 2 least and 6 most significant bits +called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. +Zero means a literal tag. All other values mean a copy tag. + +For literal tags: + - If m < 60, the next 1 + m bytes are literal bytes. + - Otherwise, let n be the little-endian unsigned integer denoted by the next + m - 59 bytes. The next 1 + n bytes after that are literal bytes. + +For copy tags, length bytes are copied from offset bytes ago, in the style of +Lempel-Ziv compression algorithms. In particular: + - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). + The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 + of the offset. The next byte is bits 0-7 of the offset. + - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). + The length is 1 + m. The offset is the little-endian unsigned integer + denoted by the next 2 bytes. + - For l == 3, this tag is a legacy format that is no longer supported. +*/ +const ( + tagLiteral = 0x00 + tagCopy1 = 0x01 + tagCopy2 = 0x02 + tagCopy4 = 0x03 +) diff --git a/vendor/github.com/siddontang/go/sync2/atomic.go b/vendor/github.com/siddontang/go/sync2/atomic.go new file mode 100644 index 000000000000..382fc20dfec7 --- /dev/null +++ b/vendor/github.com/siddontang/go/sync2/atomic.go @@ -0,0 +1,146 @@ +// Copyright 2013, Google Inc. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sync2 + +import ( + "sync" + "sync/atomic" + "time" +) + +type AtomicInt32 int32 + +func (i *AtomicInt32) Add(n int32) int32 { + return atomic.AddInt32((*int32)(i), n) +} + +func (i *AtomicInt32) Set(n int32) { + atomic.StoreInt32((*int32)(i), n) +} + +func (i *AtomicInt32) Get() int32 { + return atomic.LoadInt32((*int32)(i)) +} + +func (i *AtomicInt32) CompareAndSwap(oldval, newval int32) (swapped bool) { + return atomic.CompareAndSwapInt32((*int32)(i), oldval, newval) +} + +type AtomicUint32 uint32 + +func (i *AtomicUint32) Add(n uint32) uint32 { + return atomic.AddUint32((*uint32)(i), n) +} + +func (i *AtomicUint32) Set(n uint32) { + atomic.StoreUint32((*uint32)(i), n) +} + +func (i *AtomicUint32) Get() uint32 { + return atomic.LoadUint32((*uint32)(i)) +} + +func (i *AtomicUint32) CompareAndSwap(oldval, newval uint32) (swapped bool) { + return atomic.CompareAndSwapUint32((*uint32)(i), oldval, newval) +} + +type AtomicInt64 int64 + +func (i *AtomicInt64) Add(n int64) int64 { + return atomic.AddInt64((*int64)(i), n) +} + +func (i *AtomicInt64) Set(n int64) { + atomic.StoreInt64((*int64)(i), n) +} + +func (i *AtomicInt64) Get() int64 { + return atomic.LoadInt64((*int64)(i)) +} + +func (i *AtomicInt64) CompareAndSwap(oldval, newval int64) (swapped bool) { + return atomic.CompareAndSwapInt64((*int64)(i), oldval, newval) +} + +type AtomicUint64 uint64 + +func (i *AtomicUint64) Add(n uint64) uint64 { + return atomic.AddUint64((*uint64)(i), n) +} + +func (i *AtomicUint64) Set(n uint64) { + atomic.StoreUint64((*uint64)(i), n) +} + +func (i *AtomicUint64) Get() uint64 { + return atomic.LoadUint64((*uint64)(i)) +} + +func (i *AtomicUint64) CompareAndSwap(oldval, newval uint64) (swapped bool) { + return atomic.CompareAndSwapUint64((*uint64)(i), oldval, newval) +} + +type AtomicDuration int64 + +func (d *AtomicDuration) Add(duration time.Duration) time.Duration { + return time.Duration(atomic.AddInt64((*int64)(d), int64(duration))) +} + +func (d *AtomicDuration) Set(duration time.Duration) { + atomic.StoreInt64((*int64)(d), int64(duration)) +} + +func (d *AtomicDuration) Get() time.Duration { + return time.Duration(atomic.LoadInt64((*int64)(d))) +} + +func (d *AtomicDuration) CompareAndSwap(oldval, newval time.Duration) (swapped bool) { + return atomic.CompareAndSwapInt64((*int64)(d), int64(oldval), int64(newval)) +} + +// AtomicString gives you atomic-style APIs for string, but +// it's only a convenience wrapper that uses a mutex. So, it's +// not as efficient as the rest of the atomic types. 
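+//
+// Illustrative usage (a sketch):
+//
+//	var role AtomicString
+//	role.Set("leader")
+//	if role.CompareAndSwap("leader", "follower") {
+//		// the stored value is now "follower"
+//	}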
+type AtomicString struct { + mu sync.Mutex + str string +} + +func (s *AtomicString) Set(str string) { + s.mu.Lock() + s.str = str + s.mu.Unlock() +} + +func (s *AtomicString) Get() string { + s.mu.Lock() + str := s.str + s.mu.Unlock() + return str +} + +func (s *AtomicString) CompareAndSwap(oldval, newval string) (swapped bool) { + s.mu.Lock() + defer s.mu.Unlock() + if s.str == oldval { + s.str = newval + return true + } + return false +} + +type AtomicBool int32 + +func (b *AtomicBool) Set(v bool) { + if v { + atomic.StoreInt32((*int32)(b), 1) + } else { + atomic.StoreInt32((*int32)(b), 0) + } +} + +func (b *AtomicBool) Get() bool { + return atomic.LoadInt32((*int32)(b)) == 1 +} diff --git a/vendor/github.com/siddontang/go/sync2/semaphore.go b/vendor/github.com/siddontang/go/sync2/semaphore.go new file mode 100644 index 000000000000..d310da7294c7 --- /dev/null +++ b/vendor/github.com/siddontang/go/sync2/semaphore.go @@ -0,0 +1,65 @@ +package sync2 + +import ( + "sync" + "sync/atomic" + "time" +) + +func NewSemaphore(initialCount int) *Semaphore { + res := &Semaphore{ + counter: int64(initialCount), + } + res.cond.L = &res.lock + return res +} + +type Semaphore struct { + lock sync.Mutex + cond sync.Cond + counter int64 +} + +func (s *Semaphore) Release() { + s.lock.Lock() + s.counter += 1 + if s.counter >= 0 { + s.cond.Signal() + } + s.lock.Unlock() +} + +func (s *Semaphore) Acquire() { + s.lock.Lock() + for s.counter < 1 { + s.cond.Wait() + } + s.counter -= 1 + s.lock.Unlock() +} + +func (s *Semaphore) AcquireTimeout(timeout time.Duration) bool { + done := make(chan bool, 1) + // Gate used to communicate between the threads and decide what the result + // is. If the main thread decides, we have timed out, otherwise we succeed. + decided := new(int32) + go func() { + s.Acquire() + if atomic.SwapInt32(decided, 1) == 0 { + done <- true + } else { + // If we already decided the result, and this thread did not win + s.Release() + } + }() + select { + case <-done: + return true + case <-time.NewTimer(timeout).C: + if atomic.SwapInt32(decided, 1) == 1 { + // The other thread already decided the result + return true + } + return false + } +} diff --git a/vendor/github.com/siddontang/ledisdb/LICENSE b/vendor/github.com/siddontang/ledisdb/LICENSE new file mode 100644 index 000000000000..7ece9fdf5a64 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 siddontang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/client/go/LICENSE b/vendor/github.com/siddontang/ledisdb/client/go/LICENSE new file mode 100644 index 000000000000..7ece9fdf5a64 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/client/go/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 siddontang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/config/config.go b/vendor/github.com/siddontang/ledisdb/config/config.go new file mode 100644 index 000000000000..92ea190957ea --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/config/config.go @@ -0,0 +1,279 @@ +package config + +import ( + "bytes" + "errors" + "github.com/BurntSushi/toml" + "github.com/siddontang/go/ioutil2" + "io" + "io/ioutil" + "sync" +) + +var ( + ErrNoConfigFile = errors.New("Running without a config file") +) + +const ( + DefaultAddr string = "127.0.0.1:6380" + + DefaultDBName string = "goleveldb" + + DefaultDataDir string = "./var" + + KB int = 1024 + MB int = KB * 1024 + GB int = MB * 1024 +) + +type LevelDBConfig struct { + Compression bool `toml:"compression"` + BlockSize int `toml:"block_size"` + WriteBufferSize int `toml:"write_buffer_size"` + CacheSize int `toml:"cache_size"` + MaxOpenFiles int `toml:"max_open_files"` +} + +type RocksDBConfig struct { + Compression int `toml:"compression"` + BlockSize int `toml:"block_size"` + WriteBufferSize int `toml:"write_buffer_size"` + CacheSize int `toml:"cache_size"` + MaxOpenFiles int `toml:"max_open_files"` + MaxWriteBufferNum int `toml:"max_write_buffer_num"` + MinWriteBufferNumberToMerge int `toml:"min_write_buffer_number_to_merge"` + NumLevels int `toml:"num_levels"` + Level0FileNumCompactionTrigger int `toml:"level0_file_num_compaction_trigger"` + Level0SlowdownWritesTrigger int `toml:"level0_slowdown_writes_trigger"` + Level0StopWritesTrigger int `toml:"level0_stop_writes_trigger"` + TargetFileSizeBase int `toml:"target_file_size_base"` + TargetFileSizeMultiplier int `toml:"target_file_size_multiplier"` + MaxBytesForLevelBase int `toml:"max_bytes_for_level_base"` + MaxBytesForLevelMultiplier int `toml:"max_bytes_for_level_multiplier"` + DisableAutoCompactions bool `toml:"disable_auto_compactions"` + DisableDataSync bool `toml:"disable_data_sync"` + UseFsync bool `toml:"use_fsync"` + MaxBackgroundCompactions int `toml:"max_background_compactions"` + MaxBackgroundFlushes int `toml:"max_background_flushes"` + AllowOsBuffer bool `toml:"allow_os_buffer"` + 
EnableStatistics bool `toml:"enable_statistics"` + StatsDumpPeriodSec int `toml:"stats_dump_period_sec"` + BackgroundThreads int `toml:"background_theads"` + HighPriorityBackgroundThreads int `toml:"high_priority_background_threads"` + DisableWAL bool `toml:"disable_wal"` +} + +type LMDBConfig struct { + MapSize int `toml:"map_size"` + NoSync bool `toml:"nosync"` +} + +type ReplicationConfig struct { + Path string `toml:"path"` + Sync bool `toml:"sync"` + WaitSyncTime int `toml:"wait_sync_time"` + WaitMaxSlaveAcks int `toml:"wait_max_slave_acks"` + ExpiredLogDays int `toml:"expired_log_days"` + StoreName string `toml:"store_name"` + MaxLogFileSize int64 `toml:"max_log_file_size"` + MaxLogFileNum int `toml:"max_log_file_num"` + SyncLog int `toml:"sync_log"` + Compression bool `toml:"compression"` + UseMmap bool `toml:"use_mmap"` +} + +type SnapshotConfig struct { + Path string `toml:"path"` + MaxNum int `toml:"max_num"` +} + +type Config struct { + m sync.RWMutex `toml:"-"` + + FileName string `toml:"-"` + + Addr string `toml:"addr"` + + HttpAddr string `toml:"http_addr"` + + SlaveOf string `toml:"slaveof"` + + Readonly bool `toml:readonly` + + DataDir string `toml:"data_dir"` + + DBName string `toml:"db_name"` + DBPath string `toml:"db_path"` + DBSyncCommit int `toml:"db_sync_commit"` + + LevelDB LevelDBConfig `toml:"leveldb"` + RocksDB RocksDBConfig `toml:"rocksdb"` + + LMDB LMDBConfig `toml:"lmdb"` + + AccessLog string `toml:"access_log"` + + UseReplication bool `toml:"use_replication"` + Replication ReplicationConfig `toml:"replication"` + + Snapshot SnapshotConfig `toml:"snapshot"` + + ConnReadBufferSize int `toml:"conn_read_buffer_size"` + ConnWriteBufferSize int `toml:"conn_write_buffer_size"` + ConnKeepaliveInterval int `toml:"conn_keepalive_interval"` + + TTLCheckInterval int `toml:"ttl_check_interval"` +} + +func NewConfigWithFile(fileName string) (*Config, error) { + data, err := ioutil.ReadFile(fileName) + if err != nil { + return nil, err + } + + if cfg, err := NewConfigWithData(data); err != nil { + return nil, err + } else { + cfg.FileName = fileName + return cfg, nil + } +} + +func NewConfigWithData(data []byte) (*Config, error) { + cfg := NewConfigDefault() + + _, err := toml.Decode(string(data), cfg) + if err != nil { + return nil, err + } + + cfg.adjust() + + return cfg, nil +} + +func NewConfigDefault() *Config { + cfg := new(Config) + + cfg.Addr = DefaultAddr + cfg.HttpAddr = "" + + cfg.DataDir = DefaultDataDir + + cfg.DBName = DefaultDBName + + cfg.SlaveOf = "" + cfg.Readonly = false + + // disable access log + cfg.AccessLog = "" + + cfg.LMDB.MapSize = 20 * MB + cfg.LMDB.NoSync = true + + cfg.UseReplication = false + cfg.Replication.WaitSyncTime = 500 + cfg.Replication.Compression = true + cfg.Replication.WaitMaxSlaveAcks = 2 + cfg.Replication.SyncLog = 0 + cfg.Replication.UseMmap = true + cfg.Snapshot.MaxNum = 1 + + cfg.RocksDB.AllowOsBuffer = true + cfg.RocksDB.EnableStatistics = false + cfg.RocksDB.UseFsync = false + cfg.RocksDB.DisableAutoCompactions = false + cfg.RocksDB.AllowOsBuffer = true + cfg.RocksDB.DisableWAL = false + + cfg.adjust() + + return cfg +} + +func getDefault(d int, s int) int { + if s <= 0 { + return d + } else { + return s + } +} + +func (cfg *Config) adjust() { + cfg.LevelDB.adjust() + + cfg.RocksDB.adjust() + + cfg.Replication.ExpiredLogDays = getDefault(7, cfg.Replication.ExpiredLogDays) + cfg.Replication.MaxLogFileNum = getDefault(50, cfg.Replication.MaxLogFileNum) + cfg.ConnReadBufferSize = getDefault(4*KB, cfg.ConnReadBufferSize) + 
cfg.ConnWriteBufferSize = getDefault(4*KB, cfg.ConnWriteBufferSize) + cfg.TTLCheckInterval = getDefault(1, cfg.TTLCheckInterval) + +} + +func (cfg *LevelDBConfig) adjust() { + cfg.CacheSize = getDefault(4*MB, cfg.CacheSize) + cfg.BlockSize = getDefault(4*KB, cfg.BlockSize) + cfg.WriteBufferSize = getDefault(4*MB, cfg.WriteBufferSize) + cfg.MaxOpenFiles = getDefault(1024, cfg.MaxOpenFiles) +} + +func (cfg *RocksDBConfig) adjust() { + cfg.CacheSize = getDefault(4*MB, cfg.CacheSize) + cfg.BlockSize = getDefault(4*KB, cfg.BlockSize) + cfg.WriteBufferSize = getDefault(4*MB, cfg.WriteBufferSize) + cfg.MaxOpenFiles = getDefault(1024, cfg.MaxOpenFiles) + cfg.MaxWriteBufferNum = getDefault(2, cfg.MaxWriteBufferNum) + cfg.MinWriteBufferNumberToMerge = getDefault(1, cfg.MinWriteBufferNumberToMerge) + cfg.NumLevels = getDefault(7, cfg.NumLevels) + cfg.Level0FileNumCompactionTrigger = getDefault(4, cfg.Level0FileNumCompactionTrigger) + cfg.Level0SlowdownWritesTrigger = getDefault(16, cfg.Level0SlowdownWritesTrigger) + cfg.Level0StopWritesTrigger = getDefault(64, cfg.Level0StopWritesTrigger) + cfg.TargetFileSizeBase = getDefault(32*MB, cfg.TargetFileSizeBase) + cfg.TargetFileSizeMultiplier = getDefault(1, cfg.TargetFileSizeMultiplier) + cfg.MaxBytesForLevelBase = getDefault(32*MB, cfg.MaxBytesForLevelBase) + cfg.MaxBytesForLevelMultiplier = getDefault(1, cfg.MaxBytesForLevelMultiplier) + cfg.MaxBackgroundCompactions = getDefault(1, cfg.MaxBackgroundCompactions) + cfg.MaxBackgroundFlushes = getDefault(1, cfg.MaxBackgroundFlushes) + cfg.StatsDumpPeriodSec = getDefault(3600, cfg.StatsDumpPeriodSec) + cfg.BackgroundThreads = getDefault(2, cfg.BackgroundThreads) + cfg.HighPriorityBackgroundThreads = getDefault(1, cfg.HighPriorityBackgroundThreads) +} + +func (cfg *Config) Dump(w io.Writer) error { + e := toml.NewEncoder(w) + e.Indent = "" + return e.Encode(cfg) +} + +func (cfg *Config) DumpFile(fileName string) error { + var b bytes.Buffer + + if err := cfg.Dump(&b); err != nil { + return err + } + + return ioutil2.WriteFileAtomic(fileName, b.Bytes(), 0644) +} + +func (cfg *Config) Rewrite() error { + if len(cfg.FileName) == 0 { + return ErrNoConfigFile + } + + return cfg.DumpFile(cfg.FileName) +} + +func (cfg *Config) GetReadonly() bool { + cfg.m.RLock() + b := cfg.Readonly + cfg.m.RUnlock() + return b +} + +func (cfg *Config) SetReadonly(b bool) { + cfg.m.Lock() + cfg.Readonly = b + cfg.m.Unlock() +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/batch.go b/vendor/github.com/siddontang/ledisdb/ledis/batch.go new file mode 100644 index 000000000000..c9064df4927a --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/batch.go @@ -0,0 +1,138 @@ +package ledis + +import ( + "github.com/siddontang/go/log" + "github.com/siddontang/ledisdb/rpl" + "github.com/siddontang/ledisdb/store" + "sync" +) + +type batch struct { + l *Ledis + + *store.WriteBatch + + sync.Locker + + tx *Tx +} + +func (b *batch) Commit() error { + if b.l.cfg.GetReadonly() { + return ErrWriteInROnly + } + + if b.tx == nil { + return b.l.handleCommit(b.WriteBatch, b.WriteBatch) + } else { + if b.l.r != nil { + if err := b.tx.data.Append(b.WriteBatch.BatchData()); err != nil { + return err + } + } + return b.WriteBatch.Commit() + } +} + +func (b *batch) Lock() { + b.Locker.Lock() +} + +func (b *batch) Unlock() { + b.WriteBatch.Rollback() + b.Locker.Unlock() +} + +func (b *batch) Put(key []byte, value []byte) { + b.WriteBatch.Put(key, value) +} + +func (b *batch) Delete(key []byte) { + b.WriteBatch.Delete(key) +} + +type 
dbBatchLocker struct { + l *sync.Mutex + wrLock *sync.RWMutex +} + +func (l *dbBatchLocker) Lock() { + l.wrLock.RLock() + l.l.Lock() +} + +func (l *dbBatchLocker) Unlock() { + l.l.Unlock() + l.wrLock.RUnlock() +} + +type txBatchLocker struct { +} + +func (l *txBatchLocker) Lock() {} +func (l *txBatchLocker) Unlock() {} + +type multiBatchLocker struct { +} + +func (l *multiBatchLocker) Lock() {} +func (l *multiBatchLocker) Unlock() {} + +func (l *Ledis) newBatch(wb *store.WriteBatch, locker sync.Locker, tx *Tx) *batch { + b := new(batch) + b.l = l + b.WriteBatch = wb + + b.Locker = locker + + b.tx = tx + + return b +} + +type commiter interface { + Commit() error +} + +type commitDataGetter interface { + Data() []byte +} + +func (l *Ledis) handleCommit(g commitDataGetter, c commiter) error { + l.commitLock.Lock() + + var err error + if l.r != nil { + var rl *rpl.Log + if rl, err = l.r.Log(g.Data()); err != nil { + l.commitLock.Unlock() + + log.Fatal("write wal error %s", err.Error()) + return err + } + + l.propagate(rl) + + if err = c.Commit(); err != nil { + l.commitLock.Unlock() + + log.Fatal("commit error %s", err.Error()) + l.noticeReplication() + return err + } + + if err = l.r.UpdateCommitID(rl.ID); err != nil { + l.commitLock.Unlock() + + log.Fatal("update commit id error %s", err.Error()) + l.noticeReplication() + return err + } + } else { + err = c.Commit() + } + + l.commitLock.Unlock() + + return err +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/const.go b/vendor/github.com/siddontang/ledisdb/ledis/const.go new file mode 100644 index 000000000000..a61cb901bb27 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/const.go @@ -0,0 +1,100 @@ +package ledis + +import ( + "errors" +) + +const Version = "0.4" + +const ( + NoneType byte = 0 + KVType byte = 1 + HashType byte = 2 + HSizeType byte = 3 + ListType byte = 4 + LMetaType byte = 5 + ZSetType byte = 6 + ZSizeType byte = 7 + ZScoreType byte = 8 + BitType byte = 9 + BitMetaType byte = 10 + SetType byte = 11 + SSizeType byte = 12 + + maxDataType byte = 100 + + /* + I make a big mistake about TTL time key format and have to use a new one (change 101 to 103). + You must run the ledis-upgrade-ttl to upgrade db. 
+ */ + ObsoleteExpTimeType byte = 101 + ExpMetaType byte = 102 + ExpTimeType byte = 103 + + MetaType byte = 201 +) + +var ( + TypeName = map[byte]string{ + KVType: "kv", + HashType: "hash", + HSizeType: "hsize", + ListType: "list", + LMetaType: "lmeta", + ZSetType: "zset", + ZSizeType: "zsize", + ZScoreType: "zscore", + BitType: "bit", + BitMetaType: "bitmeta", + SetType: "set", + SSizeType: "ssize", + ExpTimeType: "exptime", + ExpMetaType: "expmeta", + } +) + +const ( + defaultScanCount int = 10 +) + +var ( + errKeySize = errors.New("invalid key size") + errValueSize = errors.New("invalid value size") + errHashFieldSize = errors.New("invalid hash field size") + errSetMemberSize = errors.New("invalid set member size") + errZSetMemberSize = errors.New("invalid zset member size") + errExpireValue = errors.New("invalid expire value") +) + +const ( + //we don't support too many databases + MaxDBNumber uint8 = 16 + + //max key size + MaxKeySize int = 1024 + + //max hash field size + MaxHashFieldSize int = 1024 + + //max zset member size + MaxZSetMemberSize int = 1024 + + //max set member size + MaxSetMemberSize int = 1024 + + //max value size + MaxValueSize int = 1024 * 1024 * 1024 +) + +var ( + ErrScoreMiss = errors.New("zset score miss") + ErrWriteInROnly = errors.New("write not support in readonly mode") + ErrRplInRDWR = errors.New("replication not support in read write mode") + ErrRplNotSupport = errors.New("replication not support") +) + +const ( + DBAutoCommit uint8 = 0x0 + DBInTransaction uint8 = 0x1 + DBInMulti uint8 = 0x2 +) diff --git a/vendor/github.com/siddontang/ledisdb/ledis/doc.go b/vendor/github.com/siddontang/ledisdb/ledis/doc.go new file mode 100644 index 000000000000..c6bfe7807bef --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/doc.go @@ -0,0 +1,58 @@ +// Package ledis is a high performance embedded NoSQL. +// +// Ledis supports various data structure like kv, list, hash and zset like redis. +// +// Other features include replication, data with a limited time-to-live. +// +// Usage +// +// First create a ledis instance before use: +// +// l := ledis.Open(cfg) +// +// cfg is a Config instance which contains configuration for ledis use, +// like DataDir (root directory for ledis working to store data). +// +// After you create a ledis instance, you can select a DB to store you data: +// +// db, _ := l.Select(0) +// +// DB must be selected by a index, ledis supports only 16 databases, so the index range is [0-15]. +// +// KV +// +// KV is the most basic ledis type like any other key-value database. +// +// err := db.Set(key, value) +// value, err := db.Get(key) +// +// List +// +// List is simply lists of values, sorted by insertion order. +// You can push or pop value on the list head (left) or tail (right). +// +// err := db.LPush(key, value1) +// err := db.RPush(key, value2) +// value1, err := db.LPop(key) +// value2, err := db.RPop(key) +// +// Hash +// +// Hash is a map between fields and values. +// +// n, err := db.HSet(key, field1, value1) +// n, err := db.HSet(key, field2, value2) +// value1, err := db.HGet(key, field1) +// value2, err := db.HGet(key, field2) +// +// ZSet +// +// ZSet is a sorted collections of values. +// Every member of zset is associated with score, a int64 value which used to sort, from smallest to greatest score. +// Members are unique, but score may be same. 
+// +// n, err := db.ZAdd(key, ScorePair{score1, member1}, ScorePair{score2, member2}) +// ay, err := db.ZRangeByScore(key, minScore, maxScore, 0, -1) +// +// +package ledis diff --git a/vendor/github.com/siddontang/ledisdb/ledis/dump.go b/vendor/github.com/siddontang/ledisdb/ledis/dump.go new file mode 100644 index 000000000000..863255c1220d --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/dump.go @@ -0,0 +1,220 @@ +package ledis + +import ( + "bufio" + "bytes" + "encoding/binary" + "github.com/siddontang/go/snappy" + "github.com/siddontang/ledisdb/store" + "io" + "os" +) + +type DumpHead struct { + CommitID uint64 +} + +func (h *DumpHead) Read(r io.Reader) error { + if err := binary.Read(r, binary.BigEndian, &h.CommitID); err != nil { + return err + } + + return nil +} + +func (h *DumpHead) Write(w io.Writer) error { + if err := binary.Write(w, binary.BigEndian, h.CommitID); err != nil { + return err + } + + return nil +} + +func (l *Ledis) DumpFile(path string) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + + return l.Dump(f) +} + +func (l *Ledis) Dump(w io.Writer) error { + var err error + + var commitID uint64 + var snap *store.Snapshot + + l.wLock.Lock() + + if l.r != nil { + if commitID, err = l.r.LastCommitID(); err != nil { + l.wLock.Unlock() + return err + } + } + + if snap, err = l.ldb.NewSnapshot(); err != nil { + l.wLock.Unlock() + return err + } + + l.wLock.Unlock() + + wb := bufio.NewWriterSize(w, 4096) + + h := &DumpHead{commitID} + + if err = h.Write(wb); err != nil { + return err + } + + it := snap.NewIterator() + it.SeekToFirst() + + compressBuf := make([]byte, 4096) + + var key []byte + var value []byte + for ; it.Valid(); it.Next() { + key = it.RawKey() + value = it.RawValue() + + if key, err = snappy.Encode(compressBuf, key); err != nil { + return err + } + + if err = binary.Write(wb, binary.BigEndian, uint16(len(key))); err != nil { + return err + } + + if _, err = wb.Write(key); err != nil { + return err + } + + if value, err = snappy.Encode(compressBuf, value); err != nil { + return err + } + + if err = binary.Write(wb, binary.BigEndian, uint32(len(value))); err != nil { + return err + } + + if _, err = wb.Write(value); err != nil { + return err + } + } + + if err = wb.Flush(); err != nil { + return err + } + + compressBuf = nil + + return nil +} + +// clear all data and load dump file to db +func (l *Ledis) LoadDumpFile(path string) (*DumpHead, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + return l.LoadDump(f) +} + +// clear all data and load dump file to db +func (l *Ledis) LoadDump(r io.Reader) (*DumpHead, error) { + l.wLock.Lock() + defer l.wLock.Unlock() + + var err error + if err = l.flushAll(); err != nil { + return nil, err + } + + rb := bufio.NewReaderSize(r, 4096) + + h := new(DumpHead) + + if err = h.Read(rb); err != nil { + return nil, err + } + + var keyLen uint16 + var valueLen uint32 + + var keyBuf bytes.Buffer + var valueBuf bytes.Buffer + + deKeyBuf := make([]byte, 4096) + deValueBuf := make([]byte, 4096) + + var key, value []byte + + wb := l.ldb.NewWriteBatch() + defer wb.Close() + + n := 0 + + for { + if err = binary.Read(rb, binary.BigEndian, &keyLen); err != nil && err != io.EOF { + return nil, err + } else if err == io.EOF { + break + } + + if _, err = io.CopyN(&keyBuf, rb, int64(keyLen)); err != nil { + return nil, err + } + + if key, err = snappy.Decode(deKeyBuf, keyBuf.Bytes()); err != nil { + return nil, err + } + + if err = 
binary.Read(rb, binary.BigEndian, &valueLen); err != nil { + return nil, err + } + + if _, err = io.CopyN(&valueBuf, rb, int64(valueLen)); err != nil { + return nil, err + } + + if value, err = snappy.Decode(deValueBuf, valueBuf.Bytes()); err != nil { + return nil, err + } + + wb.Put(key, value) + n++ + if n%1024 == 0 { + if err = wb.Commit(); err != nil { + return nil, err + } + } + + // if err = l.ldb.Put(key, value); err != nil { + // return nil, err + // } + + keyBuf.Reset() + valueBuf.Reset() + } + + if err = wb.Commit(); err != nil { + return nil, err + } + + deKeyBuf = nil + deValueBuf = nil + + if l.r != nil { + if err := l.r.UpdateCommitID(h.CommitID); err != nil { + return nil, err + } + } + + return h, nil +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/event.go b/vendor/github.com/siddontang/ledisdb/ledis/event.go new file mode 100644 index 000000000000..2a3b54a07e2a --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/event.go @@ -0,0 +1,135 @@ +package ledis + +import ( + "errors" + "fmt" + "github.com/siddontang/go/hack" + "strconv" +) + +var errInvalidEvent = errors.New("invalid event") + +func formatEventKey(buf []byte, k []byte) ([]byte, error) { + if len(k) < 2 { + return nil, errInvalidEvent + } + + buf = append(buf, fmt.Sprintf("DB:%2d ", k[0])...) + buf = append(buf, fmt.Sprintf("%s ", TypeName[k[1]])...) + + db := new(DB) + db.index = k[0] + + //to do format at respective place + + switch k[1] { + case KVType: + if key, err := db.decodeKVKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + } + case HashType: + if key, field, err := db.hDecodeHashKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, hack.String(field)) + } + case HSizeType: + if key, err := db.hDecodeSizeKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + } + case ListType: + if key, seq, err := db.lDecodeListKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, int64(seq), 10) + } + case LMetaType: + if key, err := db.lDecodeMetaKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + } + case ZSetType: + if key, m, err := db.zDecodeSetKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, hack.String(m)) + } + case ZSizeType: + if key, err := db.zDecodeSizeKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + } + case ZScoreType: + if key, m, score, err := db.zDecodeScoreKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, hack.String(m)) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, score, 10) + } + case BitType: + if key, seq, err := db.bDecodeBinKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + buf = append(buf, ' ') + buf = strconv.AppendUint(buf, uint64(seq), 10) + } + case BitMetaType: + if key, err := db.bDecodeMetaKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + } + case SetType: + if key, member, err := db.sDecodeSetKey(k); err != nil { + return nil, 
err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, hack.String(member)) + } + case SSizeType: + if key, err := db.sDecodeSizeKey(k); err != nil { + return nil, err + } else { + buf = strconv.AppendQuote(buf, hack.String(key)) + } + case ExpTimeType: + if tp, key, t, err := db.expDecodeTimeKey(k); err != nil { + return nil, err + } else { + buf = append(buf, TypeName[tp]...) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, hack.String(key)) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, t, 10) + } + case ExpMetaType: + if tp, key, err := db.expDecodeMetaKey(k); err != nil { + return nil, err + } else { + buf = append(buf, TypeName[tp]...) + buf = append(buf, ' ') + buf = strconv.AppendQuote(buf, hack.String(key)) + } + default: + return nil, errInvalidEvent + } + + return buf, nil +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/info.go b/vendor/github.com/siddontang/ledisdb/ledis/info.go new file mode 100644 index 000000000000..df3ee72ce280 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/info.go @@ -0,0 +1,26 @@ +package ledis + +import () + +// todo, add info + +// type Keyspace struct { +// Kvs int `json:"kvs"` +// KvExpires int `json:"kv_expires"` + +// Lists int `json:"lists"` +// ListExpires int `json:"list_expires"` + +// Bitmaps int `json:"bitmaps"` +// BitmapExpires int `json:"bitmap_expires"` + +// ZSets int `json:"zsets"` +// ZSetExpires int `json:"zset_expires"` + +// Hashes int `json:"hashes"` +// HashExpires int `json:"hahsh_expires"` +// } + +// type Info struct { +// KeySpaces [MaxDBNumber]Keyspace +// } diff --git a/vendor/github.com/siddontang/ledisdb/ledis/ledis.go b/vendor/github.com/siddontang/ledisdb/ledis/ledis.go new file mode 100644 index 000000000000..6e390c5404ea --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/ledis.go @@ -0,0 +1,215 @@ +package ledis + +import ( + "fmt" + "github.com/siddontang/go/filelock" + "github.com/siddontang/go/log" + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/rpl" + "github.com/siddontang/ledisdb/store" + "io" + "os" + "path" + "sync" + "time" +) + +type Ledis struct { + cfg *config.Config + + ldb *store.DB + dbs [MaxDBNumber]*DB + + quit chan struct{} + wg sync.WaitGroup + + //for replication + r *rpl.Replication + rc chan struct{} + rbatch *store.WriteBatch + rwg sync.WaitGroup + rhs []NewLogEventHandler + + wLock sync.RWMutex //allow one write at same time + commitLock sync.Mutex //allow one write commit at same time + + lock io.Closer + + tcs [MaxDBNumber]*ttlChecker +} + +func Open(cfg *config.Config) (*Ledis, error) { + if len(cfg.DataDir) == 0 { + cfg.DataDir = config.DefaultDataDir + } + + os.MkdirAll(cfg.DataDir, 0755) + + var err error + + l := new(Ledis) + l.cfg = cfg + + if l.lock, err = filelock.Lock(path.Join(cfg.DataDir, "LOCK")); err != nil { + return nil, err + } + + l.quit = make(chan struct{}) + + if l.ldb, err = store.Open(cfg); err != nil { + return nil, err + } + + if cfg.UseReplication { + if l.r, err = rpl.NewReplication(cfg); err != nil { + return nil, err + } + + l.rc = make(chan struct{}, 1) + l.rbatch = l.ldb.NewWriteBatch() + + l.wg.Add(1) + go l.onReplication() + + //first we must try wait all replication ok + //maybe some logs are not committed + l.WaitReplication() + } else { + l.r = nil + } + + for i := uint8(0); i < MaxDBNumber; i++ { + l.dbs[i] = l.newDB(i) + } + + l.checkTTL() + + return l, nil +} + +func (l *Ledis) Close() { + close(l.quit) + 
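As an aside on the dump.go hunk above, a minimal sketch of how the DumpFile/LoadDumpFile pair might be driven for a point-in-time backup; the package and function names here are illustrative, and note that LoadDumpFile clears all existing data before replaying the dump.

    package example

    import "github.com/siddontang/ledisdb/ledis"

    // backupAndRestore writes a full snapshot of l to path and then loads it
    // back. LoadDumpFile clears the current data before replaying the dump,
    // so a real restore would normally target a separate instance.
    func backupAndRestore(l *ledis.Ledis, path string) (uint64, error) {
        if err := l.DumpFile(path); err != nil {
            return 0, err
        }

        head, err := l.LoadDumpFile(path)
        if err != nil {
            return 0, err
        }

        // CommitID is the replication commit point captured in the dump head.
        return head.CommitID, nil
    }
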
l.wg.Wait() + + l.ldb.Close() + + if l.r != nil { + l.r.Close() + //l.r = nil + } + + if l.lock != nil { + l.lock.Close() + //l.lock = nil + } +} + +func (l *Ledis) Select(index int) (*DB, error) { + if index < 0 || index >= int(MaxDBNumber) { + return nil, fmt.Errorf("invalid db index %d", index) + } + + return l.dbs[index], nil +} + +// Flush All will clear all data and replication logs +func (l *Ledis) FlushAll() error { + l.wLock.Lock() + defer l.wLock.Unlock() + + return l.flushAll() +} + +func (l *Ledis) flushAll() error { + it := l.ldb.NewIterator() + defer it.Close() + + w := l.ldb.NewWriteBatch() + defer w.Rollback() + + n := 0 + for ; it.Valid(); it.Next() { + n++ + if n == 10000 { + if err := w.Commit(); err != nil { + log.Fatal("flush all commit error: %s", err.Error()) + return err + } + n = 0 + } + w.Delete(it.RawKey()) + } + + if err := w.Commit(); err != nil { + log.Fatal("flush all commit error: %s", err.Error()) + return err + } + + if l.r != nil { + if err := l.r.Clear(); err != nil { + log.Fatal("flush all replication clear error: %s", err.Error()) + return err + } + } + + return nil +} + +func (l *Ledis) IsReadOnly() bool { + if l.cfg.GetReadonly() { + return true + } else if l.r != nil { + if b, _ := l.r.CommitIDBehind(); b { + return true + } + } + return false +} + +func (l *Ledis) checkTTL() { + for i, db := range l.dbs { + c := newTTLChecker(db) + + c.register(KVType, db.kvBatch, db.delete) + c.register(ListType, db.listBatch, db.lDelete) + c.register(HashType, db.hashBatch, db.hDelete) + c.register(ZSetType, db.zsetBatch, db.zDelete) + c.register(BitType, db.binBatch, db.bDelete) + c.register(SetType, db.setBatch, db.sDelete) + + l.tcs[i] = c + } + + if l.cfg.TTLCheckInterval == 0 { + l.cfg.TTLCheckInterval = 1 + } + + l.wg.Add(1) + go func() { + defer l.wg.Done() + + tick := time.NewTicker(time.Duration(l.cfg.TTLCheckInterval) * time.Second) + defer tick.Stop() + + for { + select { + case <-tick.C: + if l.IsReadOnly() { + break + } + + for _, c := range l.tcs { + c.check() + } + case <-l.quit: + return + } + } + + }() + +} + +func (l *Ledis) StoreStat() *store.Stat { + return l.ldb.Stat() +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/ledis_db.go b/vendor/github.com/siddontang/ledisdb/ledis/ledis_db.go new file mode 100644 index 000000000000..ebde98e87bd9 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/ledis_db.go @@ -0,0 +1,147 @@ +package ledis + +import ( + "fmt" + "github.com/siddontang/ledisdb/store" + "sync" +) + +type ibucket interface { + Get(key []byte) ([]byte, error) + GetSlice(key []byte) (store.Slice, error) + + Put(key []byte, value []byte) error + Delete(key []byte) error + + NewIterator() *store.Iterator + + NewWriteBatch() *store.WriteBatch + + RangeIterator(min []byte, max []byte, rangeType uint8) *store.RangeLimitIterator + RevRangeIterator(min []byte, max []byte, rangeType uint8) *store.RangeLimitIterator + RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *store.RangeLimitIterator + RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *store.RangeLimitIterator +} + +type DB struct { + l *Ledis + + sdb *store.DB + + bucket ibucket + + index uint8 + + kvBatch *batch + listBatch *batch + hashBatch *batch + zsetBatch *batch + binBatch *batch + setBatch *batch + + status uint8 + + lbkeys *lBlockKeys +} + +func (l *Ledis) newDB(index uint8) *DB { + d := new(DB) + + d.l = l + + d.sdb = l.ldb + + d.bucket = d.sdb + + d.status = DBAutoCommit + d.index = index 
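The TTL checker registered in checkTTL above sweeps expired keys roughly every cfg.TTLCheckInterval seconds (a zero value is treated as 1). A minimal sketch of setting and inspecting a key lifetime through the KV commands defined later in this patch; the key name and package name are illustrative.

    package example

    import "github.com/siddontang/ledisdb/ledis"

    // expireSketch stores a key with a 30 second lifetime and inspects it.
    // Expired keys are removed by the background TTL checker goroutine.
    func expireSketch(db *ledis.DB) error {
        key := []byte("session:abc") // illustrative key

        if err := db.SetEX(key, 30, []byte("payload")); err != nil {
            return err
        }

        if _, err := db.TTL(key); err != nil { // remaining lifetime in seconds
            return err
        }

        // Persist drops the pending expiration again.
        _, err := db.Persist(key)
        return err
    }
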
+ + d.kvBatch = d.newBatch() + d.listBatch = d.newBatch() + d.hashBatch = d.newBatch() + d.zsetBatch = d.newBatch() + d.binBatch = d.newBatch() + d.setBatch = d.newBatch() + + d.lbkeys = newLBlockKeys() + + return d +} + +func (db *DB) newBatch() *batch { + return db.l.newBatch(db.bucket.NewWriteBatch(), &dbBatchLocker{l: &sync.Mutex{}, wrLock: &db.l.wLock}, nil) +} + +func (db *DB) Index() int { + return int(db.index) +} + +func (db *DB) IsAutoCommit() bool { + return db.status == DBAutoCommit +} + +func (db *DB) FlushAll() (drop int64, err error) { + all := [...](func() (int64, error)){ + db.flush, + db.lFlush, + db.hFlush, + db.zFlush, + db.bFlush, + db.sFlush} + + for _, flush := range all { + if n, e := flush(); e != nil { + err = e + return + } else { + drop += n + } + } + + return +} + +func (db *DB) flushType(t *batch, dataType byte) (drop int64, err error) { + var deleteFunc func(t *batch, key []byte) int64 + var metaDataType byte + switch dataType { + case KVType: + deleteFunc = db.delete + metaDataType = KVType + case ListType: + deleteFunc = db.lDelete + metaDataType = LMetaType + case HashType: + deleteFunc = db.hDelete + metaDataType = HSizeType + case ZSetType: + deleteFunc = db.zDelete + metaDataType = ZSizeType + case BitType: + deleteFunc = db.bDelete + metaDataType = BitMetaType + case SetType: + deleteFunc = db.sDelete + metaDataType = SSizeType + default: + return 0, fmt.Errorf("invalid data type: %s", TypeName[dataType]) + } + + var keys [][]byte + keys, err = db.scan(metaDataType, nil, 1024, false, "") + for len(keys) != 0 || err != nil { + for _, key := range keys { + deleteFunc(t, key) + db.rmExpire(t, dataType, key) + + } + + if err = t.Commit(); err != nil { + return + } else { + drop += int64(len(keys)) + } + keys, err = db.scan(metaDataType, nil, 1024, false, "") + } + return +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/multi.go b/vendor/github.com/siddontang/ledisdb/ledis/multi.go new file mode 100644 index 000000000000..29abe34aa3c6 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/multi.go @@ -0,0 +1,75 @@ +package ledis + +import ( + "errors" + "fmt" +) + +var ( + ErrNestMulti = errors.New("nest multi not supported") + ErrMultiDone = errors.New("multi has been closed") +) + +type Multi struct { + *DB +} + +func (db *DB) IsInMulti() bool { + return db.status == DBInMulti +} + +// begin a mutli to execute commands, +// it will block any other write operations before you close the multi, unlike transaction, mutli can not rollback +func (db *DB) Multi() (*Multi, error) { + if db.IsInMulti() { + return nil, ErrNestMulti + } + + m := new(Multi) + + m.DB = new(DB) + m.DB.status = DBInMulti + + m.DB.l = db.l + + m.l.wLock.Lock() + + m.DB.sdb = db.sdb + + m.DB.bucket = db.sdb + + m.DB.index = db.index + + m.DB.kvBatch = m.newBatch() + m.DB.listBatch = m.newBatch() + m.DB.hashBatch = m.newBatch() + m.DB.zsetBatch = m.newBatch() + m.DB.binBatch = m.newBatch() + m.DB.setBatch = m.newBatch() + + m.DB.lbkeys = db.lbkeys + + return m, nil +} + +func (m *Multi) newBatch() *batch { + return m.l.newBatch(m.bucket.NewWriteBatch(), &multiBatchLocker{}, nil) +} + +func (m *Multi) Close() error { + if m.bucket == nil { + return ErrMultiDone + } + m.l.wLock.Unlock() + m.bucket = nil + return nil +} + +func (m *Multi) Select(index int) error { + if index < 0 || index >= int(MaxDBNumber) { + return fmt.Errorf("invalid db index %d", index) + } + + m.DB.index = uint8(index) + return nil +} diff --git 
a/vendor/github.com/siddontang/ledisdb/ledis/replication.go b/vendor/github.com/siddontang/ledisdb/ledis/replication.go new file mode 100644 index 000000000000..4f02259898fd --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/replication.go @@ -0,0 +1,246 @@ +package ledis + +import ( + "bytes" + "errors" + "github.com/siddontang/go/log" + "github.com/siddontang/go/snappy" + "github.com/siddontang/ledisdb/rpl" + "github.com/siddontang/ledisdb/store" + "io" + "time" +) + +const ( + maxReplLogSize = 1 * 1024 * 1024 +) + +var ( + ErrLogMissed = errors.New("log is pured in server") +) + +func (l *Ledis) ReplicationUsed() bool { + return l.r != nil +} + +func (l *Ledis) handleReplication() error { + l.wLock.Lock() + defer l.wLock.Unlock() + + l.rwg.Add(1) + rl := &rpl.Log{} + var err error + for { + if err = l.r.NextNeedCommitLog(rl); err != nil { + if err != rpl.ErrNoBehindLog { + log.Error("get next commit log err, %s", err.Error) + return err + } else { + l.rwg.Done() + return nil + } + } else { + l.rbatch.Rollback() + + if rl.Compression == 1 { + //todo optimize + if rl.Data, err = snappy.Decode(nil, rl.Data); err != nil { + log.Error("decode log error %s", err.Error()) + return err + } + } + + if bd, err := store.NewBatchData(rl.Data); err != nil { + log.Error("decode batch log error %s", err.Error()) + return err + } else if err = bd.Replay(l.rbatch); err != nil { + log.Error("replay batch log error %s", err.Error()) + } + + l.commitLock.Lock() + if err = l.rbatch.Commit(); err != nil { + log.Error("commit log error %s", err.Error()) + } else if err = l.r.UpdateCommitID(rl.ID); err != nil { + log.Error("update commit id error %s", err.Error()) + } + + l.commitLock.Unlock() + if err != nil { + return err + } + } + + } +} + +func (l *Ledis) onReplication() { + defer l.wg.Done() + + l.noticeReplication() + + for { + select { + case <-l.rc: + l.handleReplication() + case <-l.quit: + return + } + } +} + +func (l *Ledis) WaitReplication() error { + if !l.ReplicationUsed() { + return ErrRplNotSupport + + } + + l.noticeReplication() + l.rwg.Wait() + + for i := 0; i < 100; i++ { + b, err := l.r.CommitIDBehind() + if err != nil { + return err + } else if b { + l.noticeReplication() + l.rwg.Wait() + time.Sleep(100 * time.Millisecond) + } else { + return nil + } + } + + return errors.New("wait replication too many times") +} + +func (l *Ledis) StoreLogsFromReader(rb io.Reader) error { + if !l.ReplicationUsed() { + return ErrRplNotSupport + } else if !l.cfg.Readonly { + return ErrRplInRDWR + } + + log := &rpl.Log{} + + for { + if err := log.Decode(rb); err != nil { + if err == io.EOF { + break + } else { + return err + } + } + + if err := l.r.StoreLog(log); err != nil { + return err + } + + } + + l.noticeReplication() + + return nil +} + +func (l *Ledis) noticeReplication() { + AsyncNotify(l.rc) +} + +func (l *Ledis) StoreLogsFromData(data []byte) error { + rb := bytes.NewReader(data) + + return l.StoreLogsFromReader(rb) +} + +func (l *Ledis) ReadLogsTo(startLogID uint64, w io.Writer) (n int, nextLogID uint64, err error) { + if !l.ReplicationUsed() { + // no replication log + nextLogID = 0 + err = ErrRplNotSupport + return + } + + var firtID, lastID uint64 + + firtID, err = l.r.FirstLogID() + if err != nil { + return + } + + if startLogID < firtID { + err = ErrLogMissed + return + } + + lastID, err = l.r.LastLogID() + if err != nil { + return + } + + nextLogID = startLogID + + log := &rpl.Log{} + for i := startLogID; i <= lastID; i++ { + if err = l.r.GetLog(i, log); err != nil { + return + 
} + + if err = log.Encode(w); err != nil { + return + } + + nextLogID = i + 1 + + n += log.Size() + + if n > maxReplLogSize { + break + } + } + + return +} + +// try to read events, if no events read, try to wait the new event singal until timeout seconds +func (l *Ledis) ReadLogsToTimeout(startLogID uint64, w io.Writer, timeout int, quitCh chan struct{}) (n int, nextLogID uint64, err error) { + n, nextLogID, err = l.ReadLogsTo(startLogID, w) + if err != nil { + return + } else if n != 0 { + return + } + //no events read + select { + case <-l.r.WaitLog(): + case <-time.After(time.Duration(timeout) * time.Second): + case <-quitCh: + return + } + return l.ReadLogsTo(startLogID, w) +} + +func (l *Ledis) propagate(rl *rpl.Log) { + for _, h := range l.rhs { + h(rl) + } +} + +type NewLogEventHandler func(rl *rpl.Log) + +func (l *Ledis) AddNewLogEventHandler(h NewLogEventHandler) error { + if !l.ReplicationUsed() { + return ErrRplNotSupport + } + + l.rhs = append(l.rhs, h) + + return nil +} + +func (l *Ledis) ReplicationStat() (*rpl.Stat, error) { + if !l.ReplicationUsed() { + return nil, ErrRplNotSupport + } + + return l.r.Stat() +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/scan.go b/vendor/github.com/siddontang/ledisdb/ledis/scan.go new file mode 100644 index 000000000000..9e8e235a16a5 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/scan.go @@ -0,0 +1,136 @@ +package ledis + +import ( + "errors" + "github.com/siddontang/ledisdb/store" + "regexp" +) + +var errDataType = errors.New("error data type") +var errMetaKey = errors.New("error meta key") + +func (db *DB) scan(dataType byte, key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scanGeneric(dataType, key, count, inclusive, match, false) +} + +func (db *DB) revscan(dataType byte, key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scanGeneric(dataType, key, count, inclusive, match, true) +} + +func (db *DB) scanGeneric(dataType byte, key []byte, count int, + inclusive bool, match string, reverse bool) ([][]byte, error) { + var minKey, maxKey []byte + var err error + var r *regexp.Regexp + + if len(match) > 0 { + if r, err = regexp.Compile(match); err != nil { + return nil, err + } + } + + tp := store.RangeOpen + + if !reverse { + if minKey, err = db.encodeScanMinKey(dataType, key); err != nil { + return nil, err + } + if maxKey, err = db.encodeScanMaxKey(dataType, nil); err != nil { + return nil, err + } + + if inclusive { + tp = store.RangeROpen + } + } else { + if minKey, err = db.encodeScanMinKey(dataType, nil); err != nil { + return nil, err + } + if maxKey, err = db.encodeScanMaxKey(dataType, key); err != nil { + return nil, err + } + + if inclusive { + tp = store.RangeLOpen + } + } + + if count <= 0 { + count = defaultScanCount + } + + var it *store.RangeLimitIterator + if !reverse { + it = db.bucket.RangeIterator(minKey, maxKey, tp) + } else { + it = db.bucket.RevRangeIterator(minKey, maxKey, tp) + } + + v := make([][]byte, 0, count) + + for i := 0; it.Valid() && i < count; it.Next() { + if k, err := db.decodeScanKey(dataType, it.Key()); err != nil { + continue + } else if r != nil && !r.Match(k) { + continue + } else { + v = append(v, k) + i++ + } + } + it.Close() + return v, nil +} + +func (db *DB) encodeScanMinKey(dataType byte, key []byte) ([]byte, error) { + if len(key) == 0 { + return db.encodeScanKey(dataType, nil) + } else { + if err := checkKeySize(key); err != nil { + return nil, err + } + return db.encodeScanKey(dataType, key) + } 
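The replication hooks defined in replication.go above can be observed from application code; a minimal sketch, assuming the instance was opened with UseReplication enabled (package and function names are illustrative).

    package example

    import (
        "log"

        "github.com/siddontang/ledisdb/ledis"
        "github.com/siddontang/ledisdb/rpl"
    )

    // watchReplication registers a handler that is invoked for every new
    // replication log and then reads the current replication statistics.
    // Without UseReplication the calls would return ErrRplNotSupport.
    func watchReplication(l *ledis.Ledis) error {
        if !l.ReplicationUsed() {
            return nil
        }

        if err := l.AddNewLogEventHandler(func(rl *rpl.Log) {
            log.Printf("new replication log id=%d", rl.ID)
        }); err != nil {
            return err
        }

        stat, err := l.ReplicationStat()
        if err != nil {
            return err
        }
        _ = stat // inspect as needed
        return nil
    }
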
+} + +func (db *DB) encodeScanMaxKey(dataType byte, key []byte) ([]byte, error) { + if len(key) > 0 { + if err := checkKeySize(key); err != nil { + return nil, err + } + + return db.encodeScanKey(dataType, key) + } + + k, err := db.encodeScanKey(dataType, nil) + if err != nil { + return nil, err + } + k[len(k)-1] = dataType + 1 + return k, nil +} + +func (db *DB) encodeScanKey(dataType byte, key []byte) ([]byte, error) { + switch dataType { + case KVType: + return db.encodeKVKey(key), nil + case LMetaType: + return db.lEncodeMetaKey(key), nil + case HSizeType: + return db.hEncodeSizeKey(key), nil + case ZSizeType: + return db.zEncodeSizeKey(key), nil + case BitMetaType: + return db.bEncodeMetaKey(key), nil + case SSizeType: + return db.sEncodeSizeKey(key), nil + default: + return nil, errDataType + } +} +func (db *DB) decodeScanKey(dataType byte, ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != dataType { + return nil, errMetaKey + } + return ek[2:], nil +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_bit.go b/vendor/github.com/siddontang/ledisdb/ledis/t_bit.go new file mode 100644 index 000000000000..771ef9016e71 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/t_bit.go @@ -0,0 +1,926 @@ +package ledis + +import ( + "encoding/binary" + "errors" + "github.com/siddontang/go/num" + "github.com/siddontang/ledisdb/store" + "sort" + "time" +) + +const ( + OPand uint8 = iota + 1 + OPor + OPxor + OPnot +) + +type BitPair struct { + Pos int32 + Val uint8 +} + +type segBitInfo struct { + Seq uint32 + Off uint32 + Val uint8 +} + +type segBitInfoArray []segBitInfo + +const ( + // byte + segByteWidth uint32 = 9 + segByteSize uint32 = 1 << segByteWidth + + // bit + segBitWidth uint32 = segByteWidth + 3 + segBitSize uint32 = segByteSize << 3 + + maxByteSize uint32 = 8 << 20 + maxSegCount uint32 = maxByteSize / segByteSize + + minSeq uint32 = 0 + maxSeq uint32 = uint32((maxByteSize << 3) - 1) +) + +var bitsInByte = [256]int32{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, + 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, + 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, + 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, + 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, + 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, + 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, + 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, + 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, + 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8} + +var fillBits = [...]uint8{1, 3, 7, 15, 31, 63, 127, 255} + +var emptySegment []byte = make([]byte, segByteSize, segByteSize) + +var fillSegment []byte = func() []byte { + data := make([]byte, segByteSize, segByteSize) + for i := uint32(0); i < segByteSize; i++ { + data[i] = 0xff + } + return data +}() + +var errBinKey = errors.New("invalid bin key") +var errOffset = errors.New("invalid offset") +var errDuplicatePos = errors.New("duplicate bit pos") + +func getBit(sz []byte, offset uint32) uint8 { + index := offset >> 3 + if index >= uint32(len(sz)) { + return 0 // error("overflow") + } + + offset -= index << 3 + return sz[index] >> offset & 1 +} + +func setBit(sz []byte, offset uint32, val uint8) bool { + if val != 1 && val != 0 { + return false // 
error("invalid val") + } + + index := offset >> 3 + if index >= uint32(len(sz)) { + return false // error("overflow") + } + + offset -= index << 3 + if sz[index]>>offset&1 != val { + sz[index] ^= (1 << offset) + } + return true +} + +func (datas segBitInfoArray) Len() int { + return len(datas) +} + +func (datas segBitInfoArray) Less(i, j int) bool { + res := (datas)[i].Seq < (datas)[j].Seq + if !res && (datas)[i].Seq == (datas)[j].Seq { + res = (datas)[i].Off < (datas)[j].Off + } + return res +} + +func (datas segBitInfoArray) Swap(i, j int) { + datas[i], datas[j] = datas[j], datas[i] +} + +func (db *DB) bEncodeMetaKey(key []byte) []byte { + mk := make([]byte, len(key)+2) + mk[0] = db.index + mk[1] = BitMetaType + + copy(mk[2:], key) + return mk +} + +func (db *DB) bDecodeMetaKey(bkey []byte) ([]byte, error) { + if len(bkey) < 2 || bkey[0] != db.index || bkey[1] != BitMetaType { + return nil, errBinKey + } + + return bkey[2:], nil +} + +func (db *DB) bEncodeBinKey(key []byte, seq uint32) []byte { + bk := make([]byte, len(key)+8) + + pos := 0 + bk[pos] = db.index + pos++ + bk[pos] = BitType + pos++ + + binary.BigEndian.PutUint16(bk[pos:], uint16(len(key))) + pos += 2 + + copy(bk[pos:], key) + pos += len(key) + + binary.BigEndian.PutUint32(bk[pos:], seq) + + return bk +} + +func (db *DB) bDecodeBinKey(bkey []byte) (key []byte, seq uint32, err error) { + if len(bkey) < 8 || bkey[0] != db.index { + err = errBinKey + return + } + + keyLen := binary.BigEndian.Uint16(bkey[2:4]) + if int(keyLen+8) != len(bkey) { + err = errBinKey + return + } + + key = bkey[4 : 4+keyLen] + seq = uint32(binary.BigEndian.Uint32(bkey[4+keyLen:])) + return +} + +func (db *DB) bCapByteSize(seq uint32, off uint32) uint32 { + var offByteSize uint32 = (off >> 3) + 1 + if offByteSize > segByteSize { + offByteSize = segByteSize + } + + return seq<= 0 { + offset += int32((uint32(tailSeq)<> segBitWidth + off &= (segBitSize - 1) + return +} + +func (db *DB) bGetMeta(key []byte) (tailSeq int32, tailOff int32, err error) { + var v []byte + + mk := db.bEncodeMetaKey(key) + v, err = db.bucket.Get(mk) + if err != nil { + return + } + + if v != nil { + tailSeq = int32(binary.LittleEndian.Uint32(v[0:4])) + tailOff = int32(binary.LittleEndian.Uint32(v[4:8])) + } else { + tailSeq = -1 + tailOff = -1 + } + return +} + +func (db *DB) bSetMeta(t *batch, key []byte, tailSeq uint32, tailOff uint32) { + ek := db.bEncodeMetaKey(key) + + buf := make([]byte, 8) + binary.LittleEndian.PutUint32(buf[0:4], tailSeq) + binary.LittleEndian.PutUint32(buf[4:8], tailOff) + + t.Put(ek, buf) + return +} + +func (db *DB) bUpdateMeta(t *batch, key []byte, seq uint32, off uint32) (tailSeq uint32, tailOff uint32, err error) { + var tseq, toff int32 + var update bool = false + + if tseq, toff, err = db.bGetMeta(key); err != nil { + return + } else if tseq < 0 { + update = true + } else { + tailSeq = uint32(num.MaxInt32(tseq, 0)) + tailOff = uint32(num.MaxInt32(toff, 0)) + update = (seq > tailSeq || (seq == tailSeq && off > tailOff)) + } + + if update { + db.bSetMeta(t, key, seq, off) + tailSeq = seq + tailOff = off + } + return +} + +func (db *DB) bDelete(t *batch, key []byte) (drop int64) { + mk := db.bEncodeMetaKey(key) + t.Delete(mk) + + minKey := db.bEncodeBinKey(key, minSeq) + maxKey := db.bEncodeBinKey(key, maxSeq) + it := db.bucket.RangeIterator(minKey, maxKey, store.RangeClose) + for ; it.Valid(); it.Next() { + t.Delete(it.RawKey()) + drop++ + } + it.Close() + + return drop +} + +func (db *DB) bGetSegment(key []byte, seq uint32) ([]byte, []byte, error) 
{ + bk := db.bEncodeBinKey(key, seq) + segment, err := db.bucket.Get(bk) + if err != nil { + return bk, nil, err + } + return bk, segment, nil +} + +func (db *DB) bAllocateSegment(key []byte, seq uint32) ([]byte, []byte, error) { + bk, segment, err := db.bGetSegment(key, seq) + if err == nil && segment == nil { + segment = make([]byte, segByteSize, segByteSize) + } + return bk, segment, err +} + +func (db *DB) bIterator(key []byte) *store.RangeLimitIterator { + sk := db.bEncodeBinKey(key, minSeq) + ek := db.bEncodeBinKey(key, maxSeq) + return db.bucket.RangeIterator(sk, ek, store.RangeClose) +} + +func (db *DB) bSegAnd(a []byte, b []byte, res *[]byte) { + if a == nil || b == nil { + *res = nil + return + } + + data := *res + if data == nil { + data = make([]byte, segByteSize, segByteSize) + *res = data + } + + for i := uint32(0); i < segByteSize; i++ { + data[i] = a[i] & b[i] + } + return +} + +func (db *DB) bSegOr(a []byte, b []byte, res *[]byte) { + if a == nil || b == nil { + if a == nil && b == nil { + *res = nil + } else if a == nil { + *res = b + } else { + *res = a + } + return + } + + data := *res + if data == nil { + data = make([]byte, segByteSize, segByteSize) + *res = data + } + + for i := uint32(0); i < segByteSize; i++ { + data[i] = a[i] | b[i] + } + return +} + +func (db *DB) bSegXor(a []byte, b []byte, res *[]byte) { + if a == nil && b == nil { + *res = fillSegment + return + } + + if a == nil { + a = emptySegment + } + + if b == nil { + b = emptySegment + } + + data := *res + if data == nil { + data = make([]byte, segByteSize, segByteSize) + *res = data + } + + for i := uint32(0); i < segByteSize; i++ { + data[i] = a[i] ^ b[i] + } + + return +} + +func (db *DB) bExpireAt(key []byte, when int64) (int64, error) { + t := db.binBatch + t.Lock() + defer t.Unlock() + + if seq, _, err := db.bGetMeta(key); err != nil || seq < 0 { + return 0, err + } else { + db.expireAt(t, BitType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) bCountByte(val byte, soff uint32, eoff uint32) int32 { + if soff > eoff { + soff, eoff = eoff, soff + } + + mask := uint8(0) + if soff > 0 { + mask |= fillBits[soff-1] + } + if eoff < 7 { + mask |= (fillBits[7] ^ fillBits[eoff]) + } + mask = fillBits[7] ^ mask + + return bitsInByte[val&mask] +} + +func (db *DB) bCountSeg(key []byte, seq uint32, soff uint32, eoff uint32) (cnt int32, err error) { + if soff >= segBitSize || soff < 0 || + eoff >= segBitSize || eoff < 0 { + return + } + + var segment []byte + if _, segment, err = db.bGetSegment(key, seq); err != nil { + return + } + + if segment == nil { + return + } + + if soff > eoff { + soff, eoff = eoff, soff + } + + headIdx := int(soff >> 3) + endIdx := int(eoff >> 3) + sByteOff := soff - ((soff >> 3) << 3) + eByteOff := eoff - ((eoff >> 3) << 3) + + if headIdx == endIdx { + cnt = db.bCountByte(segment[headIdx], sByteOff, eByteOff) + } else { + cnt = db.bCountByte(segment[headIdx], sByteOff, 7) + + db.bCountByte(segment[endIdx], 0, eByteOff) + } + + // sum up following bytes + for idx, end := headIdx+1, endIdx-1; idx <= end; idx += 1 { + cnt += bitsInByte[segment[idx]] + if idx == end { + break + } + } + + return +} + +func (db *DB) BGet(key []byte) (data []byte, err error) { + if err = checkKeySize(key); err != nil { + return + } + + var ts, to int32 + if ts, to, err = db.bGetMeta(key); err != nil || ts < 0 { + return + } + + var tailSeq, tailOff = uint32(ts), uint32(to) + var capByteSize uint32 = db.bCapByteSize(tailSeq, tailOff) + data = 
make([]byte, capByteSize, capByteSize) + + minKey := db.bEncodeBinKey(key, minSeq) + maxKey := db.bEncodeBinKey(key, tailSeq) + it := db.bucket.RangeIterator(minKey, maxKey, store.RangeClose) + + var seq, s, e uint32 + for ; it.Valid(); it.Next() { + if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { + data = nil + break + } + + s = seq << segByteWidth + e = num.MinUint32(s+segByteSize, capByteSize) + copy(data[s:e], it.RawValue()) + } + it.Close() + + return +} + +func (db *DB) BDelete(key []byte) (drop int64, err error) { + if err = checkKeySize(key); err != nil { + return + } + + t := db.binBatch + t.Lock() + defer t.Unlock() + + drop = db.bDelete(t, key) + db.rmExpire(t, BitType, key) + + err = t.Commit() + return +} + +func (db *DB) BSetBit(key []byte, offset int32, val uint8) (ori uint8, err error) { + if err = checkKeySize(key); err != nil { + return + } + + // todo : check offset + var seq, off uint32 + if seq, off, err = db.bParseOffset(key, offset); err != nil { + return 0, err + } + + var bk, segment []byte + if bk, segment, err = db.bAllocateSegment(key, seq); err != nil { + return 0, err + } + + if segment != nil { + ori = getBit(segment, off) + if setBit(segment, off, val) { + t := db.binBatch + t.Lock() + defer t.Unlock() + + t.Put(bk, segment) + if _, _, e := db.bUpdateMeta(t, key, seq, off); e != nil { + err = e + return + } + + err = t.Commit() + } + } + + return +} + +func (db *DB) BMSetBit(key []byte, args ...BitPair) (place int64, err error) { + if err = checkKeySize(key); err != nil { + return + } + + // (ps : so as to aviod wasting memory copy while calling db.Get() and batch.Put(), + // here we sequence the params by pos, so that we can merge the execution of + // diff pos setting which targets on the same segment respectively. 
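A sketch of driving BMSetBit, together with BGetBit and BCount which appear later in this file. BMSetBit sorts the pairs by position internally so writes hitting the same segment are merged, and duplicate positions are rejected; the key and positions below are illustrative.

    package example

    import "github.com/siddontang/ledisdb/ledis"

    // bitmapSketch sets several bit positions in one call and reads them back.
    func bitmapSketch(db *ledis.DB) error {
        key := []byte("bits") // illustrative key

        if _, err := db.BMSetBit(key,
            ledis.BitPair{Pos: 7, Val: 1},
            ledis.BitPair{Pos: 100, Val: 1},
            ledis.BitPair{Pos: 5000, Val: 1},
        ); err != nil {
            return err
        }

        if _, err := db.BGetBit(key, 100); err != nil {
            return err
        }

        // A negative end offset is resolved against the current tail in
        // bParseOffset, so (0, -1) counts set bits over the whole value.
        _, err := db.BCount(key, 0, -1)
        return err
    }
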
) + + // #1 : sequence request data + var argCnt = len(args) + var bitInfos segBitInfoArray = make(segBitInfoArray, argCnt) + var seq, off uint32 + + for i, info := range args { + if seq, off, err = db.bParseOffset(key, info.Pos); err != nil { + return + } + + bitInfos[i].Seq = seq + bitInfos[i].Off = off + bitInfos[i].Val = info.Val + } + + sort.Sort(bitInfos) + + for i := 1; i < argCnt; i++ { + if bitInfos[i].Seq == bitInfos[i-1].Seq && bitInfos[i].Off == bitInfos[i-1].Off { + return 0, errDuplicatePos + } + } + + // #2 : execute bit set in order + t := db.binBatch + t.Lock() + defer t.Unlock() + + var curBinKey, curSeg []byte + var curSeq, maxSeq, maxOff uint32 + + for _, info := range bitInfos { + if curSeg != nil && info.Seq != curSeq { + t.Put(curBinKey, curSeg) + curSeg = nil + } + + if curSeg == nil { + curSeq = info.Seq + if curBinKey, curSeg, err = db.bAllocateSegment(key, info.Seq); err != nil { + return + } + + if curSeg == nil { + continue + } + } + + if setBit(curSeg, info.Off, info.Val) { + maxSeq = info.Seq + maxOff = info.Off + place++ + } + } + + if curSeg != nil { + t.Put(curBinKey, curSeg) + } + + // finally, update meta + if place > 0 { + if _, _, err = db.bUpdateMeta(t, key, maxSeq, maxOff); err != nil { + return + } + + err = t.Commit() + } + + return +} + +func (db *DB) BGetBit(key []byte, offset int32) (uint8, error) { + if seq, off, err := db.bParseOffset(key, offset); err != nil { + return 0, err + } else { + _, segment, err := db.bGetSegment(key, seq) + if err != nil { + return 0, err + } + + if segment == nil { + return 0, nil + } else { + return getBit(segment, off), nil + } + } +} + +// func (db *DB) BGetRange(key []byte, start int32, end int32) ([]byte, error) { +// section := make([]byte) + +// return +// } + +func (db *DB) BCount(key []byte, start int32, end int32) (cnt int32, err error) { + var sseq, soff uint32 + if sseq, soff, err = db.bParseOffset(key, start); err != nil { + return + } + + var eseq, eoff uint32 + if eseq, eoff, err = db.bParseOffset(key, end); err != nil { + return + } + + if sseq > eseq || (sseq == eseq && soff > eoff) { + sseq, eseq = eseq, sseq + soff, eoff = eoff, soff + } + + var segCnt int32 + if eseq == sseq { + if segCnt, err = db.bCountSeg(key, sseq, soff, eoff); err != nil { + return 0, err + } + + cnt = segCnt + + } else { + if segCnt, err = db.bCountSeg(key, sseq, soff, segBitSize-1); err != nil { + return 0, err + } else { + cnt += segCnt + } + + if segCnt, err = db.bCountSeg(key, eseq, 0, eoff); err != nil { + return 0, err + } else { + cnt += segCnt + } + } + + // middle segs + var segment []byte + skey := db.bEncodeBinKey(key, sseq) + ekey := db.bEncodeBinKey(key, eseq) + + it := db.bucket.RangeIterator(skey, ekey, store.RangeOpen) + for ; it.Valid(); it.Next() { + segment = it.RawValue() + for _, bt := range segment { + cnt += bitsInByte[bt] + } + } + it.Close() + + return +} + +func (db *DB) BTail(key []byte) (int32, error) { + // effective length of data, the highest bit-pos set in history + tailSeq, tailOff, err := db.bGetMeta(key) + if err != nil { + return 0, err + } + + tail := int32(-1) + if tailSeq >= 0 { + tail = int32(uint32(tailSeq)< maxDstSeq || (seq == maxDstSeq && off > maxDstOff) { + maxDstSeq = seq + maxDstOff = off + } + } + + if (op == OPnot && validKeyNum != 1) || + (op != OPnot && validKeyNum < 2) { + return // with not enough existing source key + } + + var srcIdx int + for srcIdx = 0; srcIdx < keyNum; srcIdx++ { + if srckeys[srcIdx] != nil { + break + } + } + + // init - data + var segments = 
make([][]byte, maxDstSeq+1) + + if op == OPnot { + // ps : + // ( ~num == num ^ 0x11111111 ) + // we init the result segments with all bit set, + // then we can calculate through the way of 'xor'. + + // ahead segments bin format : 1111 ... 1111 + for i := uint32(0); i < maxDstSeq; i++ { + segments[i] = fillSegment + } + + // last segment bin format : 1111..1100..0000 + var tailSeg = make([]byte, segByteSize, segByteSize) + var fillByte = fillBits[7] + var tailSegLen = db.bCapByteSize(uint32(0), maxDstOff) + for i := uint32(0); i < tailSegLen-1; i++ { + tailSeg[i] = fillByte + } + tailSeg[tailSegLen-1] = fillBits[maxDstOff-(tailSegLen-1)<<3] + segments[maxDstSeq] = tailSeg + + } else { + // ps : init segments by data corresponding to the 1st valid source key + it := db.bIterator(srckeys[srcIdx]) + for ; it.Valid(); it.Next() { + if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { + // to do ... + it.Close() + return + } + segments[seq] = it.Value() + } + it.Close() + srcIdx++ + } + + // operation with following keys + var res []byte + for i := srcIdx; i < keyNum; i++ { + if srckeys[i] == nil { + continue + } + + it := db.bIterator(srckeys[i]) + for idx, end := uint32(0), false; !end; it.Next() { + end = !it.Valid() + if !end { + if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { + // to do ... + it.Close() + return + } + } else { + seq = maxDstSeq + 1 + } + + // todo : + // operation 'and' can be optimize here : + // if seq > max_segments_idx, this loop can be break, + // which can avoid cost from Key() and bDecodeBinKey() + + for ; idx < seq; idx++ { + res = nil + exeOp(segments[idx], nil, &res) + segments[idx] = res + } + + if !end { + res = it.Value() + exeOp(segments[seq], res, &res) + segments[seq] = res + idx++ + } + } + it.Close() + } + + // clear the old data in case + db.bDelete(t, dstkey) + db.rmExpire(t, BitType, dstkey) + + // set data + db.bSetMeta(t, dstkey, maxDstSeq, maxDstOff) + + var bk []byte + for seq, segt := range segments { + if segt != nil { + bk = db.bEncodeBinKey(dstkey, uint32(seq)) + t.Put(bk, segt) + } + } + + err = t.Commit() + if err == nil { + // blen = int32(db.bCapByteSize(maxDstOff, maxDstOff)) + blen = int32(maxDstSeq< MaxKeySize || len(key) == 0 { + return errKeySize + } else if len(field) > MaxHashFieldSize || len(field) == 0 { + return errHashFieldSize + } + return nil +} + +func (db *DB) hEncodeSizeKey(key []byte) []byte { + buf := make([]byte, len(key)+2) + + buf[0] = db.index + buf[1] = HSizeType + + copy(buf[2:], key) + return buf +} + +func (db *DB) hDecodeSizeKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != HSizeType { + return nil, errHSizeKey + } + + return ek[2:], nil +} + +func (db *DB) hEncodeHashKey(key []byte, field []byte) []byte { + buf := make([]byte, len(key)+len(field)+1+1+2+1) + + pos := 0 + buf[pos] = db.index + pos++ + buf[pos] = HashType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + buf[pos] = hashStartSep + pos++ + copy(buf[pos:], field) + + return buf +} + +func (db *DB) hDecodeHashKey(ek []byte) ([]byte, []byte, error) { + if len(ek) < 5 || ek[0] != db.index || ek[1] != HashType { + return nil, nil, errHashKey + } + + pos := 2 + keyLen := int(binary.BigEndian.Uint16(ek[pos:])) + pos += 2 + + if keyLen+5 > len(ek) { + return nil, nil, errHashKey + } + + key := ek[pos : pos+keyLen] + pos += keyLen + + if ek[pos] != hashStartSep { + return nil, nil, errHashKey + } + + pos++ + field := ek[pos:] + 
return key, field, nil +} + +func (db *DB) hEncodeStartKey(key []byte) []byte { + return db.hEncodeHashKey(key, nil) +} + +func (db *DB) hEncodeStopKey(key []byte) []byte { + k := db.hEncodeHashKey(key, nil) + + k[len(k)-1] = hashStopSep + + return k +} + +func (db *DB) hSetItem(key []byte, field []byte, value []byte) (int64, error) { + t := db.hashBatch + + ek := db.hEncodeHashKey(key, field) + + var n int64 = 1 + if v, _ := db.bucket.Get(ek); v != nil { + n = 0 + } else { + if _, err := db.hIncrSize(key, 1); err != nil { + return 0, err + } + } + + t.Put(ek, value) + return n, nil +} + +// ps : here just focus on deleting the hash data, +// any other likes expire is ignore. +func (db *DB) hDelete(t *batch, key []byte) int64 { + sk := db.hEncodeSizeKey(key) + start := db.hEncodeStartKey(key) + stop := db.hEncodeStopKey(key) + + var num int64 = 0 + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + t.Delete(it.Key()) + num++ + } + it.Close() + + t.Delete(sk) + return num +} + +func (db *DB) hExpireAt(key []byte, when int64) (int64, error) { + t := db.hashBatch + t.Lock() + defer t.Unlock() + + if hlen, err := db.HLen(key); err != nil || hlen == 0 { + return 0, err + } else { + db.expireAt(t, HashType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) HLen(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + return Int64(db.bucket.Get(db.hEncodeSizeKey(key))) +} + +func (db *DB) HSet(key []byte, field []byte, value []byte) (int64, error) { + if err := checkHashKFSize(key, field); err != nil { + return 0, err + } else if err := checkValueSize(value); err != nil { + return 0, err + } + + t := db.hashBatch + t.Lock() + defer t.Unlock() + + n, err := db.hSetItem(key, field, value) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} + +func (db *DB) HGet(key []byte, field []byte) ([]byte, error) { + if err := checkHashKFSize(key, field); err != nil { + return nil, err + } + + return db.bucket.Get(db.hEncodeHashKey(key, field)) +} + +func (db *DB) HMset(key []byte, args ...FVPair) error { + t := db.hashBatch + t.Lock() + defer t.Unlock() + + var err error + var ek []byte + var num int64 = 0 + for i := 0; i < len(args); i++ { + if err := checkHashKFSize(key, args[i].Field); err != nil { + return err + } else if err := checkValueSize(args[i].Value); err != nil { + return err + } + + ek = db.hEncodeHashKey(key, args[i].Field) + + if v, err := db.bucket.Get(ek); err != nil { + return err + } else if v == nil { + num++ + } + + t.Put(ek, args[i].Value) + } + + if _, err = db.hIncrSize(key, num); err != nil { + return err + } + + //todo add binglog + err = t.Commit() + return err +} + +func (db *DB) HMget(key []byte, args ...[]byte) ([][]byte, error) { + var ek []byte + + it := db.bucket.NewIterator() + defer it.Close() + + r := make([][]byte, len(args)) + for i := 0; i < len(args); i++ { + if err := checkHashKFSize(key, args[i]); err != nil { + return nil, err + } + + ek = db.hEncodeHashKey(key, args[i]) + + r[i] = it.Find(ek) + } + + return r, nil +} + +func (db *DB) HDel(key []byte, args ...[]byte) (int64, error) { + t := db.hashBatch + + var ek []byte + var v []byte + var err error + + t.Lock() + defer t.Unlock() + + it := db.bucket.NewIterator() + defer it.Close() + + var num int64 = 0 + for i := 0; i < len(args); i++ { + if err := checkHashKFSize(key, args[i]); err != nil { + return 0, err + } + + ek = 
db.hEncodeHashKey(key, args[i]) + + v = it.RawFind(ek) + if v == nil { + continue + } else { + num++ + t.Delete(ek) + } + } + + if _, err = db.hIncrSize(key, -num); err != nil { + return 0, err + } + + err = t.Commit() + + return num, err +} + +func (db *DB) hIncrSize(key []byte, delta int64) (int64, error) { + t := db.hashBatch + sk := db.hEncodeSizeKey(key) + + var err error + var size int64 = 0 + if size, err = Int64(db.bucket.Get(sk)); err != nil { + return 0, err + } else { + size += delta + if size <= 0 { + size = 0 + t.Delete(sk) + db.rmExpire(t, HashType, key) + } else { + t.Put(sk, PutInt64(size)) + } + } + + return size, nil +} + +func (db *DB) HIncrBy(key []byte, field []byte, delta int64) (int64, error) { + if err := checkHashKFSize(key, field); err != nil { + return 0, err + } + + t := db.hashBatch + var ek []byte + var err error + + t.Lock() + defer t.Unlock() + + ek = db.hEncodeHashKey(key, field) + + var n int64 = 0 + if n, err = StrInt64(db.bucket.Get(ek)); err != nil { + return 0, err + } + + n += delta + + _, err = db.hSetItem(key, field, num.FormatInt64ToSlice(n)) + if err != nil { + return 0, err + } + + err = t.Commit() + + return n, err +} + +func (db *DB) HGetAll(key []byte) ([]FVPair, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + start := db.hEncodeStartKey(key) + stop := db.hEncodeStopKey(key) + + v := make([]FVPair, 0, 16) + + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + _, f, err := db.hDecodeHashKey(it.Key()) + if err != nil { + return nil, err + } + + v = append(v, FVPair{Field: f, Value: it.Value()}) + } + + it.Close() + + return v, nil +} + +func (db *DB) HKeys(key []byte) ([][]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + start := db.hEncodeStartKey(key) + stop := db.hEncodeStopKey(key) + + v := make([][]byte, 0, 16) + + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + _, f, err := db.hDecodeHashKey(it.Key()) + if err != nil { + return nil, err + } + v = append(v, f) + } + + it.Close() + + return v, nil +} + +func (db *DB) HValues(key []byte) ([][]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + start := db.hEncodeStartKey(key) + stop := db.hEncodeStopKey(key) + + v := make([][]byte, 0, 16) + + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + _, _, err := db.hDecodeHashKey(it.Key()) + if err != nil { + return nil, err + } + + v = append(v, it.Value()) + } + + it.Close() + + return v, nil +} + +func (db *DB) HClear(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.hashBatch + t.Lock() + defer t.Unlock() + + num := db.hDelete(t, key) + db.rmExpire(t, HashType, key) + + err := t.Commit() + return num, err +} + +func (db *DB) HMclear(keys ...[]byte) (int64, error) { + t := db.hashBatch + t.Lock() + defer t.Unlock() + + for _, key := range keys { + if err := checkKeySize(key); err != nil { + return 0, err + } + + db.hDelete(t, key) + db.rmExpire(t, HashType, key) + } + + err := t.Commit() + return int64(len(keys)), err +} + +func (db *DB) hFlush() (drop int64, err error) { + t := db.hashBatch + + t.Lock() + defer t.Unlock() + + return db.flushType(t, HashType) +} + +func (db *DB) HScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(HSizeType, key, count, inclusive, match) +} + +func 
(db *DB) HRevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.revscan(HSizeType, key, count, inclusive, match) +} + +func (db *DB) HExpire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.hExpireAt(key, time.Now().Unix()+duration) +} + +func (db *DB) HExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.hExpireAt(key, when) +} + +func (db *DB) HTTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(HashType, key) +} + +func (db *DB) HPersist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.hashBatch + t.Lock() + defer t.Unlock() + + n, err := db.rmExpire(t, HashType, key) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_kv.go b/vendor/github.com/siddontang/ledisdb/ledis/t_kv.go new file mode 100644 index 000000000000..37315c18e452 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/t_kv.go @@ -0,0 +1,396 @@ +package ledis + +import ( + "errors" + "github.com/siddontang/go/num" + "github.com/siddontang/ledisdb/store" + "time" +) + +type KVPair struct { + Key []byte + Value []byte +} + +var errKVKey = errors.New("invalid encode kv key") + +func checkKeySize(key []byte) error { + if len(key) > MaxKeySize || len(key) == 0 { + return errKeySize + } + return nil +} + +func checkValueSize(value []byte) error { + if len(value) > MaxValueSize { + return errValueSize + } + + return nil +} + +func (db *DB) encodeKVKey(key []byte) []byte { + ek := make([]byte, len(key)+2) + ek[0] = db.index + ek[1] = KVType + copy(ek[2:], key) + return ek +} + +func (db *DB) decodeKVKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != KVType { + return nil, errKVKey + } + + return ek[2:], nil +} + +func (db *DB) encodeKVMinKey() []byte { + ek := db.encodeKVKey(nil) + return ek +} + +func (db *DB) encodeKVMaxKey() []byte { + ek := db.encodeKVKey(nil) + ek[len(ek)-1] = KVType + 1 + return ek +} + +func (db *DB) incr(key []byte, delta int64) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + var err error + key = db.encodeKVKey(key) + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + var n int64 + n, err = StrInt64(db.bucket.Get(key)) + if err != nil { + return 0, err + } + + n += delta + + t.Put(key, num.FormatInt64ToSlice(n)) + + err = t.Commit() + return n, err +} + +// ps : here just focus on deleting the key-value data, +// any other likes expire is ignore. 
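The hash commands above (HMset, HIncrBy, HGetAll, HDel) combine as in the following sketch, assuming an already-selected *ledis.DB; key and field names are illustrative.

    package example

    import "github.com/siddontang/ledisdb/ledis"

    // hashSketch exercises a multi-field set, a counter increment, a full
    // read-back and a single-field delete.
    func hashSketch(db *ledis.DB) error {
        key := []byte("profile:42")

        if err := db.HMset(key,
            ledis.FVPair{Field: []byte("name"), Value: []byte("alice")},
            ledis.FVPair{Field: []byte("visits"), Value: []byte("0")},
        ); err != nil {
            return err
        }

        if _, err := db.HIncrBy(key, []byte("visits"), 1); err != nil {
            return err
        }

        pairs, err := db.HGetAll(key)
        if err != nil {
            return err
        }
        _ = pairs // each element is an FVPair{Field, Value}

        _, err = db.HDel(key, []byte("name"))
        return err
    }
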
+func (db *DB) delete(t *batch, key []byte) int64 { + key = db.encodeKVKey(key) + t.Delete(key) + return 1 +} + +func (db *DB) setExpireAt(key []byte, when int64) (int64, error) { + t := db.kvBatch + t.Lock() + defer t.Unlock() + + if exist, err := db.Exists(key); err != nil || exist == 0 { + return 0, err + } else { + db.expireAt(t, KVType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) Decr(key []byte) (int64, error) { + return db.incr(key, -1) +} + +func (db *DB) DecrBy(key []byte, decrement int64) (int64, error) { + return db.incr(key, -decrement) +} + +func (db *DB) Del(keys ...[]byte) (int64, error) { + if len(keys) == 0 { + return 0, nil + } + + codedKeys := make([][]byte, len(keys)) + for i, k := range keys { + codedKeys[i] = db.encodeKVKey(k) + } + + t := db.kvBatch + t.Lock() + defer t.Unlock() + + for i, k := range keys { + t.Delete(codedKeys[i]) + db.rmExpire(t, KVType, k) + } + + err := t.Commit() + return int64(len(keys)), err +} + +func (db *DB) Exists(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + var err error + key = db.encodeKVKey(key) + + var v []byte + v, err = db.bucket.Get(key) + if v != nil && err == nil { + return 1, nil + } + + return 0, err +} + +func (db *DB) Get(key []byte) ([]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + key = db.encodeKVKey(key) + + return db.bucket.Get(key) +} + +func (db *DB) GetSlice(key []byte) (store.Slice, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + key = db.encodeKVKey(key) + + return db.bucket.GetSlice(key) +} + +func (db *DB) GetSet(key []byte, value []byte) ([]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } else if err := checkValueSize(value); err != nil { + return nil, err + } + + key = db.encodeKVKey(key) + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + oldValue, err := db.bucket.Get(key) + if err != nil { + return nil, err + } + + t.Put(key, value) + + err = t.Commit() + + return oldValue, err +} + +func (db *DB) Incr(key []byte) (int64, error) { + return db.incr(key, 1) +} + +func (db *DB) IncrBy(key []byte, increment int64) (int64, error) { + return db.incr(key, increment) +} + +func (db *DB) MGet(keys ...[]byte) ([][]byte, error) { + values := make([][]byte, len(keys)) + + it := db.bucket.NewIterator() + defer it.Close() + + for i := range keys { + if err := checkKeySize(keys[i]); err != nil { + return nil, err + } + + values[i] = it.Find(db.encodeKVKey(keys[i])) + } + + return values, nil +} + +func (db *DB) MSet(args ...KVPair) error { + if len(args) == 0 { + return nil + } + + t := db.kvBatch + + var err error + var key []byte + var value []byte + + t.Lock() + defer t.Unlock() + + for i := 0; i < len(args); i++ { + if err := checkKeySize(args[i].Key); err != nil { + return err + } else if err := checkValueSize(args[i].Value); err != nil { + return err + } + + key = db.encodeKVKey(args[i].Key) + + value = args[i].Value + + t.Put(key, value) + + } + + err = t.Commit() + return err +} + +func (db *DB) Set(key []byte, value []byte) error { + if err := checkKeySize(key); err != nil { + return err + } else if err := checkValueSize(value); err != nil { + return err + } + + var err error + key = db.encodeKVKey(key) + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + t.Put(key, value) + + err = t.Commit() + + return err +} + +func (db *DB) SetNX(key []byte, value []byte) (int64, error) { + if err := 
checkKeySize(key); err != nil { + return 0, err + } else if err := checkValueSize(value); err != nil { + return 0, err + } + + var err error + key = db.encodeKVKey(key) + + var n int64 = 1 + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + if v, err := db.bucket.Get(key); err != nil { + return 0, err + } else if v != nil { + n = 0 + } else { + t.Put(key, value) + + err = t.Commit() + } + + return n, err +} + +func (db *DB) SetEX(key []byte, duration int64, value []byte) error { + if err := checkKeySize(key); err != nil { + return err + } else if err := checkValueSize(value); err != nil { + return err + } else if duration <= 0 { + return errExpireValue + } + + ek := db.encodeKVKey(key) + + t := db.kvBatch + + t.Lock() + defer t.Unlock() + + t.Put(ek, value) + db.expireAt(t, KVType, key, time.Now().Unix()+duration) + + if err := t.Commit(); err != nil { + return err + } + + return nil +} + +func (db *DB) flush() (drop int64, err error) { + t := db.kvBatch + t.Lock() + defer t.Unlock() + return db.flushType(t, KVType) +} + +//if inclusive is true, scan range [key, inf) else (key, inf) +func (db *DB) Scan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(KVType, key, count, inclusive, match) +} + +//if inclusive is true, revscan range (-inf, key] else (inf, key) +func (db *DB) RevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.revscan(KVType, key, count, inclusive, match) +} + +func (db *DB) Expire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.setExpireAt(key, time.Now().Unix()+duration) +} + +func (db *DB) ExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.setExpireAt(key, when) +} + +func (db *DB) TTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(KVType, key) +} + +func (db *DB) Persist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.kvBatch + t.Lock() + defer t.Unlock() + n, err := db.rmExpire(t, KVType, key) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_list.go b/vendor/github.com/siddontang/ledisdb/ledis/t_list.go new file mode 100644 index 000000000000..7f66bed75664 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/t_list.go @@ -0,0 +1,665 @@ +package ledis + +import ( + "container/list" + "encoding/binary" + "errors" + "github.com/siddontang/go/hack" + "github.com/siddontang/ledisdb/store" + "sync" + "time" +) + +const ( + listHeadSeq int32 = 1 + listTailSeq int32 = 2 + + listMinSeq int32 = 1000 + listMaxSeq int32 = 1<<31 - 1000 + listInitialSeq int32 = listMinSeq + (listMaxSeq-listMinSeq)/2 +) + +var errLMetaKey = errors.New("invalid lmeta key") +var errListKey = errors.New("invalid list key") +var errListSeq = errors.New("invalid list sequence, overflow") + +func (db *DB) lEncodeMetaKey(key []byte) []byte { + buf := make([]byte, len(key)+2) + buf[0] = db.index + buf[1] = LMetaType + + copy(buf[2:], key) + return buf +} + +func (db *DB) lDecodeMetaKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != LMetaType { + return nil, errLMetaKey + } + + return ek[2:], nil +} + +func (db *DB) lEncodeListKey(key []byte, seq int32) []byte { + buf := make([]byte, len(key)+8) + + pos := 0 + buf[pos] = db.index + pos++ + 
buf[pos] = ListType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + binary.BigEndian.PutUint32(buf[pos:], uint32(seq)) + + return buf +} + +func (db *DB) lDecodeListKey(ek []byte) (key []byte, seq int32, err error) { + if len(ek) < 8 || ek[0] != db.index || ek[1] != ListType { + err = errListKey + return + } + + keyLen := int(binary.BigEndian.Uint16(ek[2:])) + if keyLen+8 != len(ek) { + err = errListKey + return + } + + key = ek[4 : 4+keyLen] + seq = int32(binary.BigEndian.Uint32(ek[4+keyLen:])) + return +} + +func (db *DB) lpush(key []byte, whereSeq int32, args ...[]byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + var headSeq int32 + var tailSeq int32 + var size int32 + var err error + + t := db.listBatch + t.Lock() + defer t.Unlock() + + metaKey := db.lEncodeMetaKey(key) + headSeq, tailSeq, size, err = db.lGetMeta(nil, metaKey) + if err != nil { + return 0, err + } + + var pushCnt int = len(args) + if pushCnt == 0 { + return int64(size), nil + } + + var seq int32 = headSeq + var delta int32 = -1 + if whereSeq == listTailSeq { + seq = tailSeq + delta = 1 + } + + // append elements + if size > 0 { + seq += delta + } + + for i := 0; i < pushCnt; i++ { + ek := db.lEncodeListKey(key, seq+int32(i)*delta) + t.Put(ek, args[i]) + } + + seq += int32(pushCnt-1) * delta + if seq <= listMinSeq || seq >= listMaxSeq { + return 0, errListSeq + } + + // set meta info + if whereSeq == listHeadSeq { + headSeq = seq + } else { + tailSeq = seq + } + + db.lSetMeta(metaKey, headSeq, tailSeq) + + err = t.Commit() + + if err == nil { + db.lSignalAsReady(key, pushCnt) + } + + return int64(size) + int64(pushCnt), err +} + +func (db *DB) lpop(key []byte, whereSeq int32) ([]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + t := db.listBatch + t.Lock() + defer t.Unlock() + + var headSeq int32 + var tailSeq int32 + var err error + + metaKey := db.lEncodeMetaKey(key) + headSeq, tailSeq, _, err = db.lGetMeta(nil, metaKey) + if err != nil { + return nil, err + } + + var value []byte + + var seq int32 = headSeq + if whereSeq == listTailSeq { + seq = tailSeq + } + + itemKey := db.lEncodeListKey(key, seq) + value, err = db.bucket.Get(itemKey) + if err != nil { + return nil, err + } + + if whereSeq == listHeadSeq { + headSeq += 1 + } else { + tailSeq -= 1 + } + + t.Delete(itemKey) + size := db.lSetMeta(metaKey, headSeq, tailSeq) + if size == 0 { + db.rmExpire(t, HashType, key) + } + + err = t.Commit() + return value, err +} + +// ps : here just focus on deleting the list data, +// any other likes expire is ignore. 
+func (db *DB) lDelete(t *batch, key []byte) int64 { + mk := db.lEncodeMetaKey(key) + + var headSeq int32 + var tailSeq int32 + var err error + + it := db.bucket.NewIterator() + defer it.Close() + + headSeq, tailSeq, _, err = db.lGetMeta(it, mk) + if err != nil { + return 0 + } + + var num int64 = 0 + startKey := db.lEncodeListKey(key, headSeq) + stopKey := db.lEncodeListKey(key, tailSeq) + + rit := store.NewRangeIterator(it, &store.Range{startKey, stopKey, store.RangeClose}) + for ; rit.Valid(); rit.Next() { + t.Delete(rit.RawKey()) + num++ + } + + t.Delete(mk) + + return num +} + +func (db *DB) lGetMeta(it *store.Iterator, ek []byte) (headSeq int32, tailSeq int32, size int32, err error) { + var v []byte + if it != nil { + v = it.Find(ek) + } else { + v, err = db.bucket.Get(ek) + } + if err != nil { + return + } else if v == nil { + headSeq = listInitialSeq + tailSeq = listInitialSeq + size = 0 + return + } else { + headSeq = int32(binary.LittleEndian.Uint32(v[0:4])) + tailSeq = int32(binary.LittleEndian.Uint32(v[4:8])) + size = tailSeq - headSeq + 1 + } + return +} + +func (db *DB) lSetMeta(ek []byte, headSeq int32, tailSeq int32) int32 { + t := db.listBatch + + var size int32 = tailSeq - headSeq + 1 + if size < 0 { + // todo : log error + panic + } else if size == 0 { + t.Delete(ek) + } else { + buf := make([]byte, 8) + + binary.LittleEndian.PutUint32(buf[0:4], uint32(headSeq)) + binary.LittleEndian.PutUint32(buf[4:8], uint32(tailSeq)) + + t.Put(ek, buf) + } + + return size +} + +func (db *DB) lExpireAt(key []byte, when int64) (int64, error) { + t := db.listBatch + t.Lock() + defer t.Unlock() + + if llen, err := db.LLen(key); err != nil || llen == 0 { + return 0, err + } else { + db.expireAt(t, ListType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) LIndex(key []byte, index int32) ([]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + var seq int32 + var headSeq int32 + var tailSeq int32 + var err error + + metaKey := db.lEncodeMetaKey(key) + + it := db.bucket.NewIterator() + defer it.Close() + + headSeq, tailSeq, _, err = db.lGetMeta(it, metaKey) + if err != nil { + return nil, err + } + + if index >= 0 { + seq = headSeq + index + } else { + seq = tailSeq + index + 1 + } + + sk := db.lEncodeListKey(key, seq) + v := it.Find(sk) + + return v, nil +} + +func (db *DB) LLen(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + ek := db.lEncodeMetaKey(key) + _, _, size, err := db.lGetMeta(nil, ek) + return int64(size), err +} + +func (db *DB) LPop(key []byte) ([]byte, error) { + return db.lpop(key, listHeadSeq) +} + +func (db *DB) LPush(key []byte, args ...[]byte) (int64, error) { + return db.lpush(key, listHeadSeq, args...) 
+} + +func (db *DB) LRange(key []byte, start int32, stop int32) ([][]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + var headSeq int32 + var llen int32 + var err error + + metaKey := db.lEncodeMetaKey(key) + + it := db.bucket.NewIterator() + defer it.Close() + + if headSeq, _, llen, err = db.lGetMeta(it, metaKey); err != nil { + return nil, err + } + + if start < 0 { + start = llen + start + } + if stop < 0 { + stop = llen + stop + } + if start < 0 { + start = 0 + } + + if start > stop || start >= llen { + return [][]byte{}, nil + } + + if stop >= llen { + stop = llen - 1 + } + + limit := (stop - start) + 1 + headSeq += start + + v := make([][]byte, 0, limit) + + startKey := db.lEncodeListKey(key, headSeq) + rit := store.NewRangeLimitIterator(it, + &store.Range{ + Min: startKey, + Max: nil, + Type: store.RangeClose}, + &store.Limit{ + Offset: 0, + Count: int(limit)}) + + for ; rit.Valid(); rit.Next() { + v = append(v, rit.Value()) + } + + return v, nil +} + +func (db *DB) RPop(key []byte) ([]byte, error) { + return db.lpop(key, listTailSeq) +} + +func (db *DB) RPush(key []byte, args ...[]byte) (int64, error) { + return db.lpush(key, listTailSeq, args...) +} + +func (db *DB) LClear(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.listBatch + t.Lock() + defer t.Unlock() + + num := db.lDelete(t, key) + db.rmExpire(t, ListType, key) + + err := t.Commit() + return num, err +} + +func (db *DB) LMclear(keys ...[]byte) (int64, error) { + t := db.listBatch + t.Lock() + defer t.Unlock() + + for _, key := range keys { + if err := checkKeySize(key); err != nil { + return 0, err + } + + db.lDelete(t, key) + db.rmExpire(t, ListType, key) + + } + + err := t.Commit() + return int64(len(keys)), err +} + +func (db *DB) lFlush() (drop int64, err error) { + t := db.listBatch + t.Lock() + defer t.Unlock() + return db.flushType(t, ListType) +} + +func (db *DB) LExpire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.lExpireAt(key, time.Now().Unix()+duration) +} + +func (db *DB) LExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.lExpireAt(key, when) +} + +func (db *DB) LTTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(ListType, key) +} + +func (db *DB) LPersist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.listBatch + t.Lock() + defer t.Unlock() + + n, err := db.rmExpire(t, ListType, key) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} + +func (db *DB) LScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(LMetaType, key, count, inclusive, match) +} + +func (db *DB) LRevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.revscan(LMetaType, key, count, inclusive, match) +} + +func (db *DB) lEncodeMinKey() []byte { + return db.lEncodeMetaKey(nil) +} + +func (db *DB) lEncodeMaxKey() []byte { + ek := db.lEncodeMetaKey(nil) + ek[len(ek)-1] = LMetaType + 1 + return ek +} + +func (db *DB) BLPop(keys [][]byte, timeout time.Duration) ([]interface{}, error) { + return db.lblockPop(keys, listHeadSeq, timeout) +} + +func (db *DB) BRPop(keys [][]byte, timeout time.Duration) ([]interface{}, error) { + return db.lblockPop(keys, listTailSeq, timeout) +} + +func (db 
*DB) lblockPop(keys [][]byte, whereSeq int32, timeout time.Duration) ([]interface{}, error) { + ch := make(chan []byte) + + bkeys := [][]byte{} + for _, key := range keys { + v, err := db.lpop(key, whereSeq) + if err != nil { + return nil, err + } else if v != nil { + return []interface{}{key, v}, nil + } else { + if db.IsAutoCommit() { + //block wait can not be supported in transaction and multi + db.lbkeys.wait(key, ch) + bkeys = append(bkeys, key) + } + } + } + if len(bkeys) == 0 { + return nil, nil + } + + defer func() { + for _, key := range bkeys { + db.lbkeys.unwait(key, ch) + } + }() + + deadT := time.Now().Add(timeout) + + for { + if timeout == 0 { + key := <-ch + if v, err := db.lpop(key, whereSeq); err != nil { + return nil, err + } else if v == nil { + continue + } else { + return []interface{}{key, v}, nil + } + } else { + d := deadT.Sub(time.Now()) + if d < 0 { + return nil, nil + } + + select { + case key := <-ch: + if v, err := db.lpop(key, whereSeq); err != nil { + return nil, err + } else if v == nil { + db.lbkeys.wait(key, ch) + continue + } else { + return []interface{}{key, v}, nil + } + case <-time.After(d): + return nil, nil + } + } + + } +} + +func (db *DB) lSignalAsReady(key []byte, num int) { + if db.status == DBInTransaction { + //for transaction, only data can be pushed after tx commit and it is hard to signal + //so we don't handle it now + return + } + + db.lbkeys.signal(key, num) +} + +type lbKeyCh chan<- []byte + +type lBlockKeys struct { + sync.Mutex + + keys map[string]*list.List +} + +func newLBlockKeys() *lBlockKeys { + l := new(lBlockKeys) + + l.keys = make(map[string]*list.List) + return l +} + +func (l *lBlockKeys) signal(key []byte, num int) { + l.Lock() + defer l.Unlock() + + s := hack.String(key) + chs, ok := l.keys[s] + if !ok { + return + } + + var n *list.Element + + i := 0 + for e := chs.Front(); e != nil && i < num; e = n { + ch := e.Value.(lbKeyCh) + n = e.Next() + select { + case ch <- key: + chs.Remove(e) + i++ + default: + //waiter unwait + chs.Remove(e) + } + } + + if chs.Len() == 0 { + delete(l.keys, s) + } +} + +func (l *lBlockKeys) wait(key []byte, ch lbKeyCh) { + l.Lock() + defer l.Unlock() + + s := hack.String(key) + chs, ok := l.keys[s] + if !ok { + chs = list.New() + l.keys[s] = chs + } + + chs.PushBack(ch) +} + +func (l *lBlockKeys) unwait(key []byte, ch lbKeyCh) { + l.Lock() + defer l.Unlock() + + s := hack.String(key) + chs, ok := l.keys[s] + if !ok { + return + } else { + var n *list.Element + for e := chs.Front(); e != nil; e = n { + c := e.Value.(lbKeyCh) + n = e.Next() + if c == ch { + chs.Remove(e) + } + } + + if chs.Len() == 0 { + delete(l.keys, s) + } + } +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_set.go b/vendor/github.com/siddontang/ledisdb/ledis/t_set.go new file mode 100644 index 000000000000..d1647524c1bf --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/t_set.go @@ -0,0 +1,605 @@ +package ledis + +import ( + "encoding/binary" + "errors" + "github.com/siddontang/go/hack" + "github.com/siddontang/ledisdb/store" + "time" +) + +var errSetKey = errors.New("invalid set key") +var errSSizeKey = errors.New("invalid ssize key") + +const ( + setStartSep byte = ':' + setStopSep byte = setStartSep + 1 + UnionType byte = 51 + DiffType byte = 52 + InterType byte = 53 +) + +func checkSetKMSize(key []byte, member []byte) error { + if len(key) > MaxKeySize || len(key) == 0 { + return errKeySize + } else if len(member) > MaxSetMemberSize || len(member) == 0 { + return errSetMemberSize + } + return nil 
+} + +func (db *DB) sEncodeSizeKey(key []byte) []byte { + buf := make([]byte, len(key)+2) + + buf[0] = db.index + buf[1] = SSizeType + + copy(buf[2:], key) + return buf +} + +func (db *DB) sDecodeSizeKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != SSizeType { + return nil, errSSizeKey + } + + return ek[2:], nil +} + +func (db *DB) sEncodeSetKey(key []byte, member []byte) []byte { + buf := make([]byte, len(key)+len(member)+1+1+2+1) + + pos := 0 + buf[pos] = db.index + pos++ + buf[pos] = SetType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + buf[pos] = setStartSep + pos++ + copy(buf[pos:], member) + + return buf +} + +func (db *DB) sDecodeSetKey(ek []byte) ([]byte, []byte, error) { + if len(ek) < 5 || ek[0] != db.index || ek[1] != SetType { + return nil, nil, errSetKey + } + + pos := 2 + keyLen := int(binary.BigEndian.Uint16(ek[pos:])) + pos += 2 + + if keyLen+5 > len(ek) { + return nil, nil, errSetKey + } + + key := ek[pos : pos+keyLen] + pos += keyLen + + if ek[pos] != hashStartSep { + return nil, nil, errSetKey + } + + pos++ + member := ek[pos:] + return key, member, nil +} + +func (db *DB) sEncodeStartKey(key []byte) []byte { + return db.sEncodeSetKey(key, nil) +} + +func (db *DB) sEncodeStopKey(key []byte) []byte { + k := db.sEncodeSetKey(key, nil) + + k[len(k)-1] = setStopSep + + return k +} + +func (db *DB) sFlush() (drop int64, err error) { + + t := db.setBatch + t.Lock() + defer t.Unlock() + + return db.flushType(t, SetType) +} + +func (db *DB) sDelete(t *batch, key []byte) int64 { + sk := db.sEncodeSizeKey(key) + start := db.sEncodeStartKey(key) + stop := db.sEncodeStopKey(key) + + var num int64 = 0 + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + t.Delete(it.RawKey()) + num++ + } + + it.Close() + + t.Delete(sk) + return num +} + +func (db *DB) sIncrSize(key []byte, delta int64) (int64, error) { + t := db.setBatch + sk := db.sEncodeSizeKey(key) + + var err error + var size int64 = 0 + if size, err = Int64(db.bucket.Get(sk)); err != nil { + return 0, err + } else { + size += delta + if size <= 0 { + size = 0 + t.Delete(sk) + db.rmExpire(t, SetType, key) + } else { + t.Put(sk, PutInt64(size)) + } + } + + return size, nil +} + +func (db *DB) sExpireAt(key []byte, when int64) (int64, error) { + t := db.setBatch + t.Lock() + defer t.Unlock() + + if scnt, err := db.SCard(key); err != nil || scnt == 0 { + return 0, err + } else { + db.expireAt(t, SetType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + + } + + return 1, nil +} + +func (db *DB) sSetItem(key []byte, member []byte) (int64, error) { + t := db.setBatch + ek := db.sEncodeSetKey(key, member) + + var n int64 = 1 + if v, _ := db.bucket.Get(ek); v != nil { + n = 0 + } else { + if _, err := db.sIncrSize(key, 1); err != nil { + return 0, err + } + } + + t.Put(ek, nil) + return n, nil +} + +func (db *DB) SAdd(key []byte, args ...[]byte) (int64, error) { + t := db.setBatch + t.Lock() + defer t.Unlock() + + var err error + var ek []byte + var num int64 = 0 + for i := 0; i < len(args); i++ { + if err := checkSetKMSize(key, args[i]); err != nil { + return 0, err + } + + ek = db.sEncodeSetKey(key, args[i]) + + if v, err := db.bucket.Get(ek); err != nil { + return 0, err + } else if v == nil { + num++ + } + + t.Put(ek, nil) + } + + if _, err = db.sIncrSize(key, num); err != nil { + return 0, err + } + + err = t.Commit() + return num, err + +} + 
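To make the behaviour of the set primitives in this file easier to follow, here is a minimal, hypothetical usage sketch. It is not part of the vendored patch: it assumes the package-level entry points ledis.Open, (*Ledis).Select and config.NewConfigDefault from elsewhere in the vendored ledisdb code (not shown in this hunk), and an arbitrary scratch DataDir chosen for illustration. It only exercises the SAdd/SCard/SMembers semantics implemented above.

package main

import (
	"fmt"
	"log"

	"github.com/siddontang/ledisdb/config"
	"github.com/siddontang/ledisdb/ledis"
)

func main() {
	// Default configuration; only the data directory is overridden (illustrative path).
	cfg := config.NewConfigDefault()
	cfg.DataDir = "/tmp/ledis-set-demo"

	l, err := ledis.Open(cfg)
	if err != nil {
		log.Fatal(err)
	}
	defer l.Close()

	// Select logical database 0.
	db, err := l.Select(0)
	if err != nil {
		log.Fatal(err)
	}

	key := []byte("langs")

	// SAdd returns how many members were not already stored for the key;
	// the per-key size entry is maintained through sIncrSize.
	n1, err := db.SAdd(key, []byte("go"), []byte("rust"))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(n1) // 2: both members are new

	// "go" already exists after the first commit, so only "python" counts.
	n2, _ := db.SAdd(key, []byte("go"), []byte("python"))
	fmt.Println(n2) // 1

	// SCard reads the size key written by sIncrSize.
	card, _ := db.SCard(key)
	fmt.Println(card) // 3

	// SMembers iterates the per-key member range built by sEncodeSetKey.
	members, _ := db.SMembers(key)
	for _, m := range members {
		fmt.Println(string(m))
	}
}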
+func (db *DB) SCard(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + sk := db.sEncodeSizeKey(key) + + return Int64(db.bucket.Get(sk)) +} + +func (db *DB) sDiffGeneric(keys ...[]byte) ([][]byte, error) { + destMap := make(map[string]bool) + + members, err := db.SMembers(keys[0]) + if err != nil { + return nil, err + } + + for _, m := range members { + destMap[hack.String(m)] = true + } + + for _, k := range keys[1:] { + members, err := db.SMembers(k) + if err != nil { + return nil, err + } + + for _, m := range members { + if _, ok := destMap[hack.String(m)]; !ok { + continue + } else if ok { + delete(destMap, hack.String(m)) + } + } + // O - A = O, O is zero set. + if len(destMap) == 0 { + return nil, nil + } + } + + slice := make([][]byte, len(destMap)) + idx := 0 + for k, v := range destMap { + if !v { + continue + } + slice[idx] = []byte(k) + idx++ + } + + return slice, nil +} + +func (db *DB) SDiff(keys ...[]byte) ([][]byte, error) { + v, err := db.sDiffGeneric(keys...) + return v, err +} + +func (db *DB) SDiffStore(dstKey []byte, keys ...[]byte) (int64, error) { + n, err := db.sStoreGeneric(dstKey, DiffType, keys...) + return n, err +} + +func (db *DB) sInterGeneric(keys ...[]byte) ([][]byte, error) { + destMap := make(map[string]bool) + + members, err := db.SMembers(keys[0]) + if err != nil { + return nil, err + } + + for _, m := range members { + destMap[hack.String(m)] = true + } + + for _, key := range keys[1:] { + if err := checkKeySize(key); err != nil { + return nil, err + } + + members, err := db.SMembers(key) + if err != nil { + return nil, err + } else if len(members) == 0 { + return nil, err + } + + tempMap := make(map[string]bool) + for _, member := range members { + if err := checkKeySize(member); err != nil { + return nil, err + } + if _, ok := destMap[hack.String(member)]; ok { + tempMap[hack.String(member)] = true //mark this item as selected + } + } + destMap = tempMap //reduce the size of the result set + if len(destMap) == 0 { + return nil, nil + } + } + + slice := make([][]byte, len(destMap)) + idx := 0 + for k, v := range destMap { + if !v { + continue + } + + slice[idx] = []byte(k) + idx++ + } + + return slice, nil + +} + +func (db *DB) SInter(keys ...[]byte) ([][]byte, error) { + v, err := db.sInterGeneric(keys...) + return v, err + +} + +func (db *DB) SInterStore(dstKey []byte, keys ...[]byte) (int64, error) { + n, err := db.sStoreGeneric(dstKey, InterType, keys...) 
+ return n, err +} + +func (db *DB) SIsMember(key []byte, member []byte) (int64, error) { + ek := db.sEncodeSetKey(key, member) + + var n int64 = 1 + if v, err := db.bucket.Get(ek); err != nil { + return 0, err + } else if v == nil { + n = 0 + } + return n, nil +} + +func (db *DB) SMembers(key []byte) ([][]byte, error) { + if err := checkKeySize(key); err != nil { + return nil, err + } + + start := db.sEncodeStartKey(key) + stop := db.sEncodeStopKey(key) + + v := make([][]byte, 0, 16) + + it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + _, m, err := db.sDecodeSetKey(it.Key()) + if err != nil { + return nil, err + } + + v = append(v, m) + } + + it.Close() + + return v, nil +} + +func (db *DB) SRem(key []byte, args ...[]byte) (int64, error) { + t := db.setBatch + t.Lock() + defer t.Unlock() + + var ek []byte + var v []byte + var err error + + it := db.bucket.NewIterator() + defer it.Close() + + var num int64 = 0 + for i := 0; i < len(args); i++ { + if err := checkSetKMSize(key, args[i]); err != nil { + return 0, err + } + + ek = db.sEncodeSetKey(key, args[i]) + + v = it.RawFind(ek) + if v == nil { + continue + } else { + num++ + t.Delete(ek) + } + } + + if _, err = db.sIncrSize(key, -num); err != nil { + return 0, err + } + + err = t.Commit() + return num, err + +} + +func (db *DB) sUnionGeneric(keys ...[]byte) ([][]byte, error) { + dstMap := make(map[string]bool) + + for _, key := range keys { + if err := checkKeySize(key); err != nil { + return nil, err + } + + members, err := db.SMembers(key) + if err != nil { + return nil, err + } + + for _, member := range members { + dstMap[hack.String(member)] = true + } + } + + slice := make([][]byte, len(dstMap)) + idx := 0 + for k, v := range dstMap { + if !v { + continue + } + slice[idx] = []byte(k) + idx++ + } + + return slice, nil +} + +func (db *DB) SUnion(keys ...[]byte) ([][]byte, error) { + v, err := db.sUnionGeneric(keys...) + return v, err +} + +func (db *DB) SUnionStore(dstKey []byte, keys ...[]byte) (int64, error) { + n, err := db.sStoreGeneric(dstKey, UnionType, keys...) + return n, err +} + +func (db *DB) sStoreGeneric(dstKey []byte, optType byte, keys ...[]byte) (int64, error) { + if err := checkKeySize(dstKey); err != nil { + return 0, err + } + + t := db.setBatch + t.Lock() + defer t.Unlock() + + db.sDelete(t, dstKey) + + var err error + var ek []byte + var v [][]byte + + switch optType { + case UnionType: + v, err = db.sUnionGeneric(keys...) + case DiffType: + v, err = db.sDiffGeneric(keys...) + case InterType: + v, err = db.sInterGeneric(keys...) 
+ } + + if err != nil { + return 0, err + } + + for _, m := range v { + if err := checkSetKMSize(dstKey, m); err != nil { + return 0, err + } + + ek = db.sEncodeSetKey(dstKey, m) + + if _, err := db.bucket.Get(ek); err != nil { + return 0, err + } + + t.Put(ek, nil) + } + + var n = int64(len(v)) + sk := db.sEncodeSizeKey(dstKey) + t.Put(sk, PutInt64(n)) + + if err = t.Commit(); err != nil { + return 0, err + } + return n, nil +} + +func (db *DB) SClear(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.setBatch + t.Lock() + defer t.Unlock() + + num := db.sDelete(t, key) + db.rmExpire(t, SetType, key) + + err := t.Commit() + return num, err +} + +func (db *DB) SMclear(keys ...[]byte) (int64, error) { + t := db.setBatch + t.Lock() + defer t.Unlock() + + for _, key := range keys { + if err := checkKeySize(key); err != nil { + return 0, err + } + + db.sDelete(t, key) + db.rmExpire(t, SetType, key) + } + + err := t.Commit() + return int64(len(keys)), err +} + +func (db *DB) SExpire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.sExpireAt(key, time.Now().Unix()+duration) + +} + +func (db *DB) SExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.sExpireAt(key, when) + +} + +func (db *DB) STTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(SetType, key) +} + +func (db *DB) SPersist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.setBatch + t.Lock() + defer t.Unlock() + + n, err := db.rmExpire(t, SetType, key) + if err != nil { + return 0, err + } + err = t.Commit() + return n, err +} + +func (db *DB) SScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(SSizeType, key, count, inclusive, match) +} + +func (db *DB) SRevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.revscan(SSizeType, key, count, inclusive, match) +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_ttl.go b/vendor/github.com/siddontang/ledisdb/ledis/t_ttl.go new file mode 100644 index 000000000000..f16a735d026a --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/t_ttl.go @@ -0,0 +1,210 @@ +package ledis + +import ( + "encoding/binary" + "errors" + "github.com/siddontang/ledisdb/store" + "sync" + "time" +) + +var ( + errExpMetaKey = errors.New("invalid expire meta key") + errExpTimeKey = errors.New("invalid expire time key") +) + +type onExpired func(*batch, []byte) int64 + +type ttlChecker struct { + sync.Mutex + db *DB + txs []*batch + cbs []onExpired + + //next check time + nc int64 +} + +var errExpType = errors.New("invalid expire type") + +func (db *DB) expEncodeTimeKey(dataType byte, key []byte, when int64) []byte { + buf := make([]byte, len(key)+11) + + buf[0] = db.index + buf[1] = ExpTimeType + pos := 2 + + binary.BigEndian.PutUint64(buf[pos:], uint64(when)) + pos += 8 + + buf[pos] = dataType + pos++ + + copy(buf[pos:], key) + + return buf +} + +func (db *DB) expEncodeMetaKey(dataType byte, key []byte) []byte { + buf := make([]byte, len(key)+3) + + buf[0] = db.index + buf[1] = ExpMetaType + buf[2] = dataType + pos := 3 + + copy(buf[pos:], key) + + return buf +} + +func (db *DB) expDecodeMetaKey(mk []byte) (byte, []byte, error) { + if len(mk) <= 3 || mk[0] != db.index || mk[1] != ExpMetaType { + return 0, nil, 
errExpMetaKey + } + + return mk[2], mk[3:], nil +} + +func (db *DB) expDecodeTimeKey(tk []byte) (byte, []byte, int64, error) { + if len(tk) < 11 || tk[0] != db.index || tk[1] != ExpTimeType { + return 0, nil, 0, errExpTimeKey + } + + return tk[10], tk[11:], int64(binary.BigEndian.Uint64(tk[2:])), nil +} + +func (db *DB) expire(t *batch, dataType byte, key []byte, duration int64) { + db.expireAt(t, dataType, key, time.Now().Unix()+duration) +} + +func (db *DB) expireAt(t *batch, dataType byte, key []byte, when int64) { + mk := db.expEncodeMetaKey(dataType, key) + tk := db.expEncodeTimeKey(dataType, key, when) + + t.Put(tk, mk) + t.Put(mk, PutInt64(when)) + + tc := db.l.tcs[db.index] + tc.setNextCheckTime(when, false) +} + +func (db *DB) ttl(dataType byte, key []byte) (t int64, err error) { + mk := db.expEncodeMetaKey(dataType, key) + + if t, err = Int64(db.bucket.Get(mk)); err != nil || t == 0 { + t = -1 + } else { + t -= time.Now().Unix() + if t <= 0 { + t = -1 + } + // if t == -1 : to remove ???? + } + + return t, err +} + +func (db *DB) rmExpire(t *batch, dataType byte, key []byte) (int64, error) { + mk := db.expEncodeMetaKey(dataType, key) + if v, err := db.bucket.Get(mk); err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else if when, err2 := Int64(v, nil); err2 != nil { + return 0, err2 + } else { + tk := db.expEncodeTimeKey(dataType, key, when) + t.Delete(mk) + t.Delete(tk) + return 1, nil + } +} + +func newTTLChecker(db *DB) *ttlChecker { + c := new(ttlChecker) + c.db = db + c.txs = make([]*batch, maxDataType) + c.cbs = make([]onExpired, maxDataType) + c.nc = 0 + return c +} + +func (c *ttlChecker) register(dataType byte, t *batch, f onExpired) { + c.txs[dataType] = t + c.cbs[dataType] = f +} + +func (c *ttlChecker) setNextCheckTime(when int64, force bool) { + c.Lock() + if force { + c.nc = when + } else if !force && c.nc > when { + c.nc = when + } + c.Unlock() +} + +func (c *ttlChecker) check() { + now := time.Now().Unix() + + c.Lock() + nc := c.nc + c.Unlock() + + if now < nc { + return + } + + nc = now + 3600 + + db := c.db + dbGet := db.bucket.Get + + minKey := db.expEncodeTimeKey(NoneType, nil, 0) + maxKey := db.expEncodeTimeKey(maxDataType, nil, nc) + + it := db.bucket.RangeLimitIterator(minKey, maxKey, store.RangeROpen, 0, -1) + for ; it.Valid(); it.Next() { + tk := it.RawKey() + mk := it.RawValue() + + dt, k, nt, err := db.expDecodeTimeKey(tk) + if err != nil { + continue + } + + if nt > now { + //the next ttl check time is nt! 
+ nc = nt + break + } + + t := c.txs[dt] + cb := c.cbs[dt] + if tk == nil || cb == nil { + continue + } + + t.Lock() + + if exp, err := Int64(dbGet(mk)); err == nil { + // check expire again + if exp <= now { + cb(t, k) + t.Delete(tk) + t.Delete(mk) + + t.Commit() + } + + } + + t.Unlock() + } + it.Close() + + c.setNextCheckTime(nc, true) + + return +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_zset.go b/vendor/github.com/siddontang/ledisdb/ledis/t_zset.go new file mode 100644 index 000000000000..47191710555e --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/t_zset.go @@ -0,0 +1,1025 @@ +package ledis + +import ( + "bytes" + "encoding/binary" + "errors" + "github.com/siddontang/go/hack" + "github.com/siddontang/ledisdb/store" + "time" +) + +const ( + MinScore int64 = -1<<63 + 1 + MaxScore int64 = 1<<63 - 1 + InvalidScore int64 = -1 << 63 + + AggregateSum byte = 0 + AggregateMin byte = 1 + AggregateMax byte = 2 +) + +type ScorePair struct { + Score int64 + Member []byte +} + +var errZSizeKey = errors.New("invalid zsize key") +var errZSetKey = errors.New("invalid zset key") +var errZScoreKey = errors.New("invalid zscore key") +var errScoreOverflow = errors.New("zset score overflow") +var errInvalidAggregate = errors.New("invalid aggregate") +var errInvalidWeightNum = errors.New("invalid weight number") +var errInvalidSrcKeyNum = errors.New("invalid src key number") + +const ( + zsetNScoreSep byte = '<' + zsetPScoreSep byte = zsetNScoreSep + 1 + zsetStopScoreSep byte = zsetPScoreSep + 1 + + zsetStartMemSep byte = ':' + zsetStopMemSep byte = zsetStartMemSep + 1 +) + +func checkZSetKMSize(key []byte, member []byte) error { + if len(key) > MaxKeySize || len(key) == 0 { + return errKeySize + } else if len(member) > MaxZSetMemberSize || len(member) == 0 { + return errZSetMemberSize + } + return nil +} + +func (db *DB) zEncodeSizeKey(key []byte) []byte { + buf := make([]byte, len(key)+2) + buf[0] = db.index + buf[1] = ZSizeType + + copy(buf[2:], key) + return buf +} + +func (db *DB) zDecodeSizeKey(ek []byte) ([]byte, error) { + if len(ek) < 2 || ek[0] != db.index || ek[1] != ZSizeType { + return nil, errZSizeKey + } + + return ek[2:], nil +} + +func (db *DB) zEncodeSetKey(key []byte, member []byte) []byte { + buf := make([]byte, len(key)+len(member)+5) + + pos := 0 + buf[pos] = db.index + pos++ + + buf[pos] = ZSetType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + copy(buf[pos:], key) + pos += len(key) + + buf[pos] = zsetStartMemSep + pos++ + + copy(buf[pos:], member) + + return buf +} + +func (db *DB) zDecodeSetKey(ek []byte) ([]byte, []byte, error) { + if len(ek) < 5 || ek[0] != db.index || ek[1] != ZSetType { + return nil, nil, errZSetKey + } + + keyLen := int(binary.BigEndian.Uint16(ek[2:])) + if keyLen+5 > len(ek) { + return nil, nil, errZSetKey + } + + key := ek[4 : 4+keyLen] + + if ek[4+keyLen] != zsetStartMemSep { + return nil, nil, errZSetKey + } + + member := ek[5+keyLen:] + return key, member, nil +} + +func (db *DB) zEncodeStartSetKey(key []byte) []byte { + k := db.zEncodeSetKey(key, nil) + return k +} + +func (db *DB) zEncodeStopSetKey(key []byte) []byte { + k := db.zEncodeSetKey(key, nil) + k[len(k)-1] = zsetStartMemSep + 1 + return k +} + +func (db *DB) zEncodeScoreKey(key []byte, member []byte, score int64) []byte { + buf := make([]byte, len(key)+len(member)+14) + + pos := 0 + buf[pos] = db.index + pos++ + + buf[pos] = ZScoreType + pos++ + + binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) + pos += 2 + + 
copy(buf[pos:], key) + pos += len(key) + + if score < 0 { + buf[pos] = zsetNScoreSep + } else { + buf[pos] = zsetPScoreSep + } + + pos++ + binary.BigEndian.PutUint64(buf[pos:], uint64(score)) + pos += 8 + + buf[pos] = zsetStartMemSep + pos++ + + copy(buf[pos:], member) + return buf +} + +func (db *DB) zEncodeStartScoreKey(key []byte, score int64) []byte { + return db.zEncodeScoreKey(key, nil, score) +} + +func (db *DB) zEncodeStopScoreKey(key []byte, score int64) []byte { + k := db.zEncodeScoreKey(key, nil, score) + k[len(k)-1] = zsetStopMemSep + return k +} + +func (db *DB) zDecodeScoreKey(ek []byte) (key []byte, member []byte, score int64, err error) { + if len(ek) < 14 || ek[0] != db.index || ek[1] != ZScoreType { + err = errZScoreKey + return + } + + keyLen := int(binary.BigEndian.Uint16(ek[2:])) + if keyLen+14 > len(ek) { + err = errZScoreKey + return + } + + key = ek[4 : 4+keyLen] + pos := 4 + keyLen + + if (ek[pos] != zsetNScoreSep) && (ek[pos] != zsetPScoreSep) { + err = errZScoreKey + return + } + pos++ + + score = int64(binary.BigEndian.Uint64(ek[pos:])) + pos += 8 + + if ek[pos] != zsetStartMemSep { + err = errZScoreKey + return + } + + pos++ + + member = ek[pos:] + return +} + +func (db *DB) zSetItem(t *batch, key []byte, score int64, member []byte) (int64, error) { + if score <= MinScore || score >= MaxScore { + return 0, errScoreOverflow + } + + var exists int64 = 0 + ek := db.zEncodeSetKey(key, member) + + if v, err := db.bucket.Get(ek); err != nil { + return 0, err + } else if v != nil { + exists = 1 + + if s, err := Int64(v, err); err != nil { + return 0, err + } else { + sk := db.zEncodeScoreKey(key, member, s) + t.Delete(sk) + } + } + + t.Put(ek, PutInt64(score)) + + sk := db.zEncodeScoreKey(key, member, score) + t.Put(sk, []byte{}) + + return exists, nil +} + +func (db *DB) zDelItem(t *batch, key []byte, member []byte, skipDelScore bool) (int64, error) { + ek := db.zEncodeSetKey(key, member) + if v, err := db.bucket.Get(ek); err != nil { + return 0, err + } else if v == nil { + //not exists + return 0, nil + } else { + //exists + if !skipDelScore { + //we must del score + if s, err := Int64(v, err); err != nil { + return 0, err + } else { + sk := db.zEncodeScoreKey(key, member, s) + t.Delete(sk) + } + } + } + + t.Delete(ek) + + return 1, nil +} + +func (db *DB) zDelete(t *batch, key []byte) int64 { + delMembCnt, _ := db.zRemRange(t, key, MinScore, MaxScore, 0, -1) + // todo : log err + return delMembCnt +} + +func (db *DB) zExpireAt(key []byte, when int64) (int64, error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + if zcnt, err := db.ZCard(key); err != nil || zcnt == 0 { + return 0, err + } else { + db.expireAt(t, ZSetType, key, when) + if err := t.Commit(); err != nil { + return 0, err + } + } + return 1, nil +} + +func (db *DB) ZAdd(key []byte, args ...ScorePair) (int64, error) { + if len(args) == 0 { + return 0, nil + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + var num int64 = 0 + for i := 0; i < len(args); i++ { + score := args[i].Score + member := args[i].Member + + if err := checkZSetKMSize(key, member); err != nil { + return 0, err + } + + if n, err := db.zSetItem(t, key, score, member); err != nil { + return 0, err + } else if n == 0 { + //add new + num++ + } + } + + if _, err := db.zIncrSize(t, key, num); err != nil { + return 0, err + } + + err := t.Commit() + return num, err +} + +func (db *DB) zIncrSize(t *batch, key []byte, delta int64) (int64, error) { + sk := db.zEncodeSizeKey(key) + + size, err := Int64(db.bucket.Get(sk)) + if 
err != nil { + return 0, err + } else { + size += delta + if size <= 0 { + size = 0 + t.Delete(sk) + db.rmExpire(t, ZSetType, key) + } else { + t.Put(sk, PutInt64(size)) + } + } + + return size, nil +} + +func (db *DB) ZCard(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + sk := db.zEncodeSizeKey(key) + return Int64(db.bucket.Get(sk)) +} + +func (db *DB) ZScore(key []byte, member []byte) (int64, error) { + if err := checkZSetKMSize(key, member); err != nil { + return InvalidScore, err + } + + var score int64 = InvalidScore + + k := db.zEncodeSetKey(key, member) + if v, err := db.bucket.Get(k); err != nil { + return InvalidScore, err + } else if v == nil { + return InvalidScore, ErrScoreMiss + } else { + if score, err = Int64(v, nil); err != nil { + return InvalidScore, err + } + } + + return score, nil +} + +func (db *DB) ZRem(key []byte, members ...[]byte) (int64, error) { + if len(members) == 0 { + return 0, nil + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + var num int64 = 0 + for i := 0; i < len(members); i++ { + if err := checkZSetKMSize(key, members[i]); err != nil { + return 0, err + } + + if n, err := db.zDelItem(t, key, members[i], false); err != nil { + return 0, err + } else if n == 1 { + num++ + } + } + + if _, err := db.zIncrSize(t, key, -num); err != nil { + return 0, err + } + + err := t.Commit() + return num, err +} + +func (db *DB) ZIncrBy(key []byte, delta int64, member []byte) (int64, error) { + if err := checkZSetKMSize(key, member); err != nil { + return InvalidScore, err + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + ek := db.zEncodeSetKey(key, member) + + var oldScore int64 = 0 + v, err := db.bucket.Get(ek) + if err != nil { + return InvalidScore, err + } else if v == nil { + db.zIncrSize(t, key, 1) + } else { + if oldScore, err = Int64(v, err); err != nil { + return InvalidScore, err + } + } + + newScore := oldScore + delta + if newScore >= MaxScore || newScore <= MinScore { + return InvalidScore, errScoreOverflow + } + + sk := db.zEncodeScoreKey(key, member, newScore) + t.Put(sk, []byte{}) + t.Put(ek, PutInt64(newScore)) + + if v != nil { + // so as to update score, we must delete the old one + oldSk := db.zEncodeScoreKey(key, member, oldScore) + t.Delete(oldSk) + } + + err = t.Commit() + return newScore, err +} + +func (db *DB) ZCount(key []byte, min int64, max int64) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + minKey := db.zEncodeStartScoreKey(key, min) + maxKey := db.zEncodeStopScoreKey(key, max) + + rangeType := store.RangeROpen + + it := db.bucket.RangeLimitIterator(minKey, maxKey, rangeType, 0, -1) + var n int64 = 0 + for ; it.Valid(); it.Next() { + n++ + } + it.Close() + + return n, nil +} + +func (db *DB) zrank(key []byte, member []byte, reverse bool) (int64, error) { + if err := checkZSetKMSize(key, member); err != nil { + return 0, err + } + + k := db.zEncodeSetKey(key, member) + + it := db.bucket.NewIterator() + defer it.Close() + + if v := it.Find(k); v == nil { + return -1, nil + } else { + if s, err := Int64(v, nil); err != nil { + return 0, err + } else { + var rit *store.RangeLimitIterator + + sk := db.zEncodeScoreKey(key, member, s) + + if !reverse { + minKey := db.zEncodeStartScoreKey(key, MinScore) + + rit = store.NewRangeIterator(it, &store.Range{minKey, sk, store.RangeClose}) + } else { + maxKey := db.zEncodeStopScoreKey(key, MaxScore) + rit = store.NewRevRangeIterator(it, &store.Range{sk, maxKey, store.RangeClose}) + } + + var 
lastKey []byte = nil + var n int64 = 0 + + for ; rit.Valid(); rit.Next() { + n++ + + lastKey = rit.BufKey(lastKey) + } + + if _, m, _, err := db.zDecodeScoreKey(lastKey); err == nil && bytes.Equal(m, member) { + n-- + return n, nil + } + } + } + + return -1, nil +} + +func (db *DB) zIterator(key []byte, min int64, max int64, offset int, count int, reverse bool) *store.RangeLimitIterator { + minKey := db.zEncodeStartScoreKey(key, min) + maxKey := db.zEncodeStopScoreKey(key, max) + + if !reverse { + return db.bucket.RangeLimitIterator(minKey, maxKey, store.RangeClose, offset, count) + } else { + return db.bucket.RevRangeLimitIterator(minKey, maxKey, store.RangeClose, offset, count) + } +} + +func (db *DB) zRemRange(t *batch, key []byte, min int64, max int64, offset int, count int) (int64, error) { + if len(key) > MaxKeySize { + return 0, errKeySize + } + + it := db.zIterator(key, min, max, offset, count, false) + var num int64 = 0 + for ; it.Valid(); it.Next() { + sk := it.RawKey() + _, m, _, err := db.zDecodeScoreKey(sk) + if err != nil { + continue + } + + if n, err := db.zDelItem(t, key, m, true); err != nil { + return 0, err + } else if n == 1 { + num++ + } + + t.Delete(sk) + } + it.Close() + + if _, err := db.zIncrSize(t, key, -num); err != nil { + return 0, err + } + + return num, nil +} + +func (db *DB) zRange(key []byte, min int64, max int64, offset int, count int, reverse bool) ([]ScorePair, error) { + if len(key) > MaxKeySize { + return nil, errKeySize + } + + if offset < 0 { + return []ScorePair{}, nil + } + + nv := 64 + if count > 0 { + nv = count + } + + v := make([]ScorePair, 0, nv) + + var it *store.RangeLimitIterator + + //if reverse and offset is 0, count < 0, we may use forward iterator then reverse + //because store iterator prev is slower than next + if !reverse || (offset == 0 && count < 0) { + it = db.zIterator(key, min, max, offset, count, false) + } else { + it = db.zIterator(key, min, max, offset, count, true) + } + + for ; it.Valid(); it.Next() { + _, m, s, err := db.zDecodeScoreKey(it.Key()) + //may be we will check key equal? 
+ if err != nil { + continue + } + + v = append(v, ScorePair{Member: m, Score: s}) + } + it.Close() + + if reverse && (offset == 0 && count < 0) { + for i, j := 0, len(v)-1; i < j; i, j = i+1, j-1 { + v[i], v[j] = v[j], v[i] + } + } + + return v, nil +} + +func (db *DB) zParseLimit(key []byte, start int, stop int) (offset int, count int, err error) { + if start < 0 || stop < 0 { + //refer redis implementation + var size int64 + size, err = db.ZCard(key) + if err != nil { + return + } + + llen := int(size) + + if start < 0 { + start = llen + start + } + if stop < 0 { + stop = llen + stop + } + + if start < 0 { + start = 0 + } + + if start >= llen { + offset = -1 + return + } + } + + if start > stop { + offset = -1 + return + } + + offset = start + count = (stop - start) + 1 + return +} + +func (db *DB) ZClear(key []byte) (int64, error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + rmCnt, err := db.zRemRange(t, key, MinScore, MaxScore, 0, -1) + if err == nil { + err = t.Commit() + } + + return rmCnt, err +} + +func (db *DB) ZMclear(keys ...[]byte) (int64, error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + for _, key := range keys { + if _, err := db.zRemRange(t, key, MinScore, MaxScore, 0, -1); err != nil { + return 0, err + } + } + + err := t.Commit() + + return int64(len(keys)), err +} + +func (db *DB) ZRange(key []byte, start int, stop int) ([]ScorePair, error) { + return db.ZRangeGeneric(key, start, stop, false) +} + +//min and max must be inclusive +//if no limit, set offset = 0 and count = -1 +func (db *DB) ZRangeByScore(key []byte, min int64, max int64, + offset int, count int) ([]ScorePair, error) { + return db.ZRangeByScoreGeneric(key, min, max, offset, count, false) +} + +func (db *DB) ZRank(key []byte, member []byte) (int64, error) { + return db.zrank(key, member, false) +} + +func (db *DB) ZRemRangeByRank(key []byte, start int, stop int) (int64, error) { + offset, count, err := db.zParseLimit(key, start, stop) + if err != nil { + return 0, err + } + + var rmCnt int64 + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + rmCnt, err = db.zRemRange(t, key, MinScore, MaxScore, offset, count) + if err == nil { + err = t.Commit() + } + + return rmCnt, err +} + +//min and max must be inclusive +func (db *DB) ZRemRangeByScore(key []byte, min int64, max int64) (int64, error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + rmCnt, err := db.zRemRange(t, key, min, max, 0, -1) + if err == nil { + err = t.Commit() + } + + return rmCnt, err +} + +func (db *DB) ZRevRange(key []byte, start int, stop int) ([]ScorePair, error) { + return db.ZRangeGeneric(key, start, stop, true) +} + +func (db *DB) ZRevRank(key []byte, member []byte) (int64, error) { + return db.zrank(key, member, true) +} + +//min and max must be inclusive +//if no limit, set offset = 0 and count = -1 +func (db *DB) ZRevRangeByScore(key []byte, min int64, max int64, offset int, count int) ([]ScorePair, error) { + return db.ZRangeByScoreGeneric(key, min, max, offset, count, true) +} + +func (db *DB) ZRangeGeneric(key []byte, start int, stop int, reverse bool) ([]ScorePair, error) { + offset, count, err := db.zParseLimit(key, start, stop) + if err != nil { + return nil, err + } + + return db.zRange(key, MinScore, MaxScore, offset, count, reverse) +} + +//min and max must be inclusive +//if no limit, set offset = 0 and count = -1 +func (db *DB) ZRangeByScoreGeneric(key []byte, min int64, max int64, + offset int, count int, reverse bool) ([]ScorePair, error) { + + return db.zRange(key, min, max, offset, 
count, reverse) +} + +func (db *DB) zFlush() (drop int64, err error) { + t := db.zsetBatch + t.Lock() + defer t.Unlock() + return db.flushType(t, ZSetType) +} + +func (db *DB) ZExpire(key []byte, duration int64) (int64, error) { + if duration <= 0 { + return 0, errExpireValue + } + + return db.zExpireAt(key, time.Now().Unix()+duration) +} + +func (db *DB) ZExpireAt(key []byte, when int64) (int64, error) { + if when <= time.Now().Unix() { + return 0, errExpireValue + } + + return db.zExpireAt(key, when) +} + +func (db *DB) ZTTL(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return -1, err + } + + return db.ttl(ZSetType, key) +} + +func (db *DB) ZPersist(key []byte) (int64, error) { + if err := checkKeySize(key); err != nil { + return 0, err + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + n, err := db.rmExpire(t, ZSetType, key) + if err != nil { + return 0, err + } + + err = t.Commit() + return n, err +} + +func getAggregateFunc(aggregate byte) func(int64, int64) int64 { + switch aggregate { + case AggregateSum: + return func(a int64, b int64) int64 { + return a + b + } + case AggregateMax: + return func(a int64, b int64) int64 { + if a > b { + return a + } + return b + } + case AggregateMin: + return func(a int64, b int64) int64 { + if a > b { + return b + } + return a + } + } + return nil +} + +func (db *DB) ZUnionStore(destKey []byte, srcKeys [][]byte, weights []int64, aggregate byte) (int64, error) { + + var destMap = map[string]int64{} + aggregateFunc := getAggregateFunc(aggregate) + if aggregateFunc == nil { + return 0, errInvalidAggregate + } + if len(srcKeys) < 1 { + return 0, errInvalidSrcKeyNum + } + if weights != nil { + if len(srcKeys) != len(weights) { + return 0, errInvalidWeightNum + } + } else { + weights = make([]int64, len(srcKeys)) + for i := 0; i < len(weights); i++ { + weights[i] = 1 + } + } + + for i, key := range srcKeys { + scorePairs, err := db.ZRange(key, 0, -1) + if err != nil { + return 0, err + } + for _, pair := range scorePairs { + if score, ok := destMap[hack.String(pair.Member)]; !ok { + destMap[hack.String(pair.Member)] = pair.Score * weights[i] + } else { + destMap[hack.String(pair.Member)] = aggregateFunc(score, pair.Score*weights[i]) + } + } + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + db.zDelete(t, destKey) + + for member, score := range destMap { + if err := checkZSetKMSize(destKey, []byte(member)); err != nil { + return 0, err + } + + if _, err := db.zSetItem(t, destKey, score, []byte(member)); err != nil { + return 0, err + } + } + + var n = int64(len(destMap)) + sk := db.zEncodeSizeKey(destKey) + t.Put(sk, PutInt64(n)) + + if err := t.Commit(); err != nil { + return 0, err + } + return n, nil +} + +func (db *DB) ZInterStore(destKey []byte, srcKeys [][]byte, weights []int64, aggregate byte) (int64, error) { + + aggregateFunc := getAggregateFunc(aggregate) + if aggregateFunc == nil { + return 0, errInvalidAggregate + } + if len(srcKeys) < 1 { + return 0, errInvalidSrcKeyNum + } + if weights != nil { + if len(srcKeys) != len(weights) { + return 0, errInvalidWeightNum + } + } else { + weights = make([]int64, len(srcKeys)) + for i := 0; i < len(weights); i++ { + weights[i] = 1 + } + } + + var destMap = map[string]int64{} + scorePairs, err := db.ZRange(srcKeys[0], 0, -1) + if err != nil { + return 0, err + } + for _, pair := range scorePairs { + destMap[hack.String(pair.Member)] = pair.Score * weights[0] + } + + for i, key := range srcKeys[1:] { + scorePairs, err := db.ZRange(key, 0, -1) + if err != 
nil { + return 0, err + } + tmpMap := map[string]int64{} + for _, pair := range scorePairs { + if score, ok := destMap[hack.String(pair.Member)]; ok { + tmpMap[hack.String(pair.Member)] = aggregateFunc(score, pair.Score*weights[i+1]) + } + } + destMap = tmpMap + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + db.zDelete(t, destKey) + + for member, score := range destMap { + if err := checkZSetKMSize(destKey, []byte(member)); err != nil { + return 0, err + } + if _, err := db.zSetItem(t, destKey, score, []byte(member)); err != nil { + return 0, err + } + } + + var n int64 = int64(len(destMap)) + sk := db.zEncodeSizeKey(destKey) + t.Put(sk, PutInt64(n)) + + if err := t.Commit(); err != nil { + return 0, err + } + return n, nil +} + +func (db *DB) ZScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.scan(ZSizeType, key, count, inclusive, match) +} + +func (db *DB) ZRevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { + return db.revscan(ZSizeType, key, count, inclusive, match) +} + +func (db *DB) ZRangeByLex(key []byte, min []byte, max []byte, rangeType uint8, offset int, count int) ([][]byte, error) { + if min == nil { + min = db.zEncodeStartSetKey(key) + } else { + min = db.zEncodeSetKey(key, min) + } + if max == nil { + max = db.zEncodeStopSetKey(key) + } else { + max = db.zEncodeSetKey(key, max) + } + + it := db.bucket.RangeLimitIterator(min, max, rangeType, offset, count) + defer it.Close() + + ay := make([][]byte, 0, 16) + for ; it.Valid(); it.Next() { + if _, m, err := db.zDecodeSetKey(it.Key()); err == nil { + ay = append(ay, m) + } + } + + return ay, nil +} + +func (db *DB) ZRemRangeByLex(key []byte, min []byte, max []byte, rangeType uint8) (int64, error) { + if min == nil { + min = db.zEncodeStartSetKey(key) + } else { + min = db.zEncodeSetKey(key, min) + } + if max == nil { + max = db.zEncodeStopSetKey(key) + } else { + max = db.zEncodeSetKey(key, max) + } + + t := db.zsetBatch + t.Lock() + defer t.Unlock() + + it := db.bucket.RangeIterator(min, max, rangeType) + defer it.Close() + + var n int64 = 0 + for ; it.Valid(); it.Next() { + t.Delete(it.RawKey()) + n++ + } + + if err := t.Commit(); err != nil { + return 0, err + } + + return n, nil +} + +func (db *DB) ZLexCount(key []byte, min []byte, max []byte, rangeType uint8) (int64, error) { + if min == nil { + min = db.zEncodeStartSetKey(key) + } else { + min = db.zEncodeSetKey(key, min) + } + if max == nil { + max = db.zEncodeStopSetKey(key) + } else { + max = db.zEncodeSetKey(key, max) + } + + it := db.bucket.RangeIterator(min, max, rangeType) + defer it.Close() + + var n int64 = 0 + for ; it.Valid(); it.Next() { + n++ + } + + return n, nil +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/tx.go b/vendor/github.com/siddontang/ledisdb/ledis/tx.go new file mode 100644 index 000000000000..03d6c5b7ba34 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/tx.go @@ -0,0 +1,112 @@ +package ledis + +import ( + "errors" + "fmt" + "github.com/siddontang/ledisdb/store" +) + +var ( + ErrNestTx = errors.New("nest transaction not supported") + ErrTxDone = errors.New("Transaction has already been committed or rolled back") +) + +type Tx struct { + *DB + + tx *store.Tx + + data *store.BatchData +} + +func (db *DB) IsTransaction() bool { + return db.status == DBInTransaction +} + +// Begin a transaction, it will block all other write operations before calling Commit or Rollback. +// You must be very careful to prevent long-time transaction. 
+func (db *DB) Begin() (*Tx, error) { + if db.IsTransaction() { + return nil, ErrNestTx + } + + tx := new(Tx) + + tx.data = &store.BatchData{} + + tx.DB = new(DB) + tx.DB.l = db.l + + tx.l.wLock.Lock() + + tx.DB.sdb = db.sdb + + var err error + tx.tx, err = db.sdb.Begin() + if err != nil { + tx.l.wLock.Unlock() + return nil, err + } + + tx.DB.bucket = tx.tx + + tx.DB.status = DBInTransaction + + tx.DB.index = db.index + + tx.DB.kvBatch = tx.newBatch() + tx.DB.listBatch = tx.newBatch() + tx.DB.hashBatch = tx.newBatch() + tx.DB.zsetBatch = tx.newBatch() + tx.DB.binBatch = tx.newBatch() + tx.DB.setBatch = tx.newBatch() + + tx.DB.lbkeys = db.lbkeys + + return tx, nil +} + +func (tx *Tx) Commit() error { + if tx.tx == nil { + return ErrTxDone + } + + err := tx.l.handleCommit(tx.data, tx.tx) + tx.data.Reset() + + tx.tx = nil + + tx.l.wLock.Unlock() + + tx.DB.bucket = nil + + return err +} + +func (tx *Tx) Rollback() error { + if tx.tx == nil { + return ErrTxDone + } + + err := tx.tx.Rollback() + tx.data.Reset() + tx.tx = nil + + tx.l.wLock.Unlock() + tx.DB.bucket = nil + + return err +} + +func (tx *Tx) newBatch() *batch { + return tx.l.newBatch(tx.tx.NewWriteBatch(), &txBatchLocker{}, tx) +} + +func (tx *Tx) Select(index int) error { + if index < 0 || index >= int(MaxDBNumber) { + return fmt.Errorf("invalid db index %d", index) + } + + tx.DB.index = uint8(index) + return nil +} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/util.go b/vendor/github.com/siddontang/ledisdb/ledis/util.go new file mode 100644 index 000000000000..a0abdd02228f --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/ledis/util.go @@ -0,0 +1,94 @@ +package ledis + +import ( + "encoding/binary" + "errors" + "github.com/siddontang/go/hack" + "strconv" +) + +var errIntNumber = errors.New("invalid integer") + +/* + Below I forget why I use little endian to store int. + Maybe I was foolish at that time. 
+*/ + +func Int64(v []byte, err error) (int64, error) { + if err != nil { + return 0, err + } else if v == nil || len(v) == 0 { + return 0, nil + } else if len(v) != 8 { + return 0, errIntNumber + } + + return int64(binary.LittleEndian.Uint64(v)), nil +} + +func Uint64(v []byte, err error) (uint64, error) { + if err != nil { + return 0, err + } else if v == nil || len(v) == 0 { + return 0, nil + } else if len(v) != 8 { + return 0, errIntNumber + } + + return binary.LittleEndian.Uint64(v), nil +} + +func PutInt64(v int64) []byte { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, uint64(v)) + return b +} + +func StrInt64(v []byte, err error) (int64, error) { + if err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else { + return strconv.ParseInt(hack.String(v), 10, 64) + } +} + +func StrUint64(v []byte, err error) (uint64, error) { + if err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else { + return strconv.ParseUint(hack.String(v), 10, 64) + } +} + +func StrInt32(v []byte, err error) (int32, error) { + if err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else { + res, err := strconv.ParseInt(hack.String(v), 10, 32) + return int32(res), err + } +} + +func StrInt8(v []byte, err error) (int8, error) { + if err != nil { + return 0, err + } else if v == nil { + return 0, nil + } else { + res, err := strconv.ParseInt(hack.String(v), 10, 8) + return int8(res), err + } +} + +func AsyncNotify(ch chan struct{}) { + select { + case ch <- struct{}{}: + default: + } +} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/file_io.go b/vendor/github.com/siddontang/ledisdb/rpl/file_io.go new file mode 100644 index 000000000000..08e9d2eb3e19 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/rpl/file_io.go @@ -0,0 +1,362 @@ +package rpl + +import ( + "fmt" + "github.com/edsrzf/mmap-go" + "github.com/siddontang/go/log" + "io" + "os" +) + +//like leveldb or rocksdb file interface, haha! 
+ +type writeFile interface { + Sync() error + Write(b []byte) (n int, err error) + Close() error + ReadAt(buf []byte, offset int64) (int, error) + Truncate(size int64) error + SetOffset(o int64) + Name() string + Size() int + Offset() int64 +} + +type readFile interface { + ReadAt(buf []byte, offset int64) (int, error) + Close() error + Size() int + Name() string +} + +type rawWriteFile struct { + writeFile + f *os.File + offset int64 + name string +} + +func newRawWriteFile(name string, size int64) (writeFile, error) { + m := new(rawWriteFile) + var err error + + m.name = name + + m.f, err = os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0644) + if err != nil { + return nil, err + } + + return m, nil +} + +func (m *rawWriteFile) Close() error { + if err := m.f.Truncate(m.offset); err != nil { + return fmt.Errorf("close truncate %s error %s", m.name, err.Error()) + } + + if err := m.f.Close(); err != nil { + return fmt.Errorf("close %s error %s", m.name, err.Error()) + } + + return nil +} + +func (m *rawWriteFile) Sync() error { + return m.f.Sync() +} + +func (m *rawWriteFile) Write(b []byte) (n int, err error) { + n, err = m.f.WriteAt(b, m.offset) + if err != nil { + return + } else if n != len(b) { + err = io.ErrShortWrite + return + } + + m.offset += int64(n) + return +} + +func (m *rawWriteFile) ReadAt(buf []byte, offset int64) (int, error) { + return m.f.ReadAt(buf, offset) +} + +func (m *rawWriteFile) Truncate(size int64) error { + var err error + if err = m.f.Truncate(size); err != nil { + return err + } + + if m.offset > size { + m.offset = size + } + return nil +} + +func (m *rawWriteFile) SetOffset(o int64) { + m.offset = o +} + +func (m *rawWriteFile) Offset() int64 { + return m.offset +} + +func (m *rawWriteFile) Name() string { + return m.name +} + +func (m *rawWriteFile) Size() int { + st, _ := m.f.Stat() + return int(st.Size()) +} + +type rawReadFile struct { + readFile + + f *os.File + name string +} + +func newRawReadFile(name string) (readFile, error) { + m := new(rawReadFile) + + var err error + m.f, err = os.Open(name) + m.name = name + + if err != nil { + return nil, err + } + + return m, err +} + +func (m *rawReadFile) Close() error { + return m.f.Close() +} + +func (m *rawReadFile) Size() int { + st, _ := m.f.Stat() + return int(st.Size()) +} + +func (m *rawReadFile) ReadAt(b []byte, offset int64) (int, error) { + return m.f.ReadAt(b, offset) +} + +func (m *rawReadFile) Name() string { + return m.name +} + +///////////////////////////////////////////////// + +type mmapWriteFile struct { + writeFile + + f *os.File + m mmap.MMap + name string + size int64 + offset int64 +} + +func newMmapWriteFile(name string, size int64) (writeFile, error) { + m := new(mmapWriteFile) + + m.name = name + + var err error + + m.f, err = os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0644) + if err != nil { + return nil, err + } + + if size == 0 { + st, _ := m.f.Stat() + size = st.Size() + } + + if err = m.f.Truncate(size); err != nil { + return nil, err + } + + if m.m, err = mmap.Map(m.f, mmap.RDWR, 0); err != nil { + return nil, err + } + + m.size = size + m.offset = 0 + return m, nil +} + +func (m *mmapWriteFile) Size() int { + return int(m.size) +} + +func (m *mmapWriteFile) Sync() error { + return m.m.Flush() +} + +func (m *mmapWriteFile) Close() error { + if err := m.m.Unmap(); err != nil { + return fmt.Errorf("unmap %s error %s", m.name, err.Error()) + } + + if err := m.f.Truncate(m.offset); err != nil { + return fmt.Errorf("close truncate %s error %s", m.name, err.Error()) + } + + if err := 
m.f.Close(); err != nil { + return fmt.Errorf("close %s error %s", m.name, err.Error()) + } + + return nil +} + +func (m *mmapWriteFile) Write(b []byte) (n int, err error) { + extra := int64(len(b)) - (m.size - m.offset) + if extra > 0 { + newSize := m.size + extra + m.size/10 + if err = m.Truncate(newSize); err != nil { + return + } + m.size = newSize + } + + n = copy(m.m[m.offset:], b) + if n != len(b) { + return 0, io.ErrShortWrite + } + + m.offset += int64(len(b)) + return len(b), nil +} + +func (m *mmapWriteFile) ReadAt(buf []byte, offset int64) (int, error) { + if offset > m.offset { + return 0, fmt.Errorf("invalid offset %d", offset) + } + + n := copy(buf, m.m[offset:m.offset]) + if n != len(buf) { + return n, io.ErrUnexpectedEOF + } + + return n, nil +} + +func (m *mmapWriteFile) Truncate(size int64) error { + var err error + if err = m.m.Unmap(); err != nil { + return err + } + + if err = m.f.Truncate(size); err != nil { + return err + } + + if m.m, err = mmap.Map(m.f, mmap.RDWR, 0); err != nil { + return err + } + + m.size = size + if m.offset > m.size { + m.offset = m.size + } + return nil +} + +func (m *mmapWriteFile) SetOffset(o int64) { + m.offset = o +} + +func (m *mmapWriteFile) Offset() int64 { + return m.offset +} + +func (m *mmapWriteFile) Name() string { + return m.name +} + +type mmapReadFile struct { + readFile + + f *os.File + m mmap.MMap + name string +} + +func newMmapReadFile(name string) (readFile, error) { + m := new(mmapReadFile) + + m.name = name + + var err error + m.f, err = os.Open(name) + if err != nil { + return nil, err + } + + m.m, err = mmap.Map(m.f, mmap.RDONLY, 0) + return m, err +} + +func (m *mmapReadFile) ReadAt(buf []byte, offset int64) (int, error) { + if int64(offset) > int64(len(m.m)) { + return 0, fmt.Errorf("invalid offset %d", offset) + } + + n := copy(buf, m.m[offset:]) + if n != len(buf) { + return n, io.ErrUnexpectedEOF + } + + return n, nil +} + +func (m *mmapReadFile) Close() error { + if m.m != nil { + if err := m.m.Unmap(); err != nil { + log.Error("unmap %s error %s", m.name, err.Error()) + } + m.m = nil + } + + if m.f != nil { + if err := m.f.Close(); err != nil { + log.Error("close %s error %s", m.name, err.Error()) + } + m.f = nil + } + + return nil +} + +func (m *mmapReadFile) Size() int { + return len(m.m) +} + +func (m *mmapReadFile) Name() string { + return m.name +} + +///////////////////////////////////// + +func newWriteFile(useMmap bool, name string, size int64) (writeFile, error) { + if useMmap { + return newMmapWriteFile(name, size) + } else { + return newRawWriteFile(name, size) + } +} + +func newReadFile(useMmap bool, name string) (readFile, error) { + if useMmap { + return newMmapReadFile(name) + } else { + return newRawReadFile(name) + } +} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/file_store.go b/vendor/github.com/siddontang/ledisdb/rpl/file_store.go new file mode 100644 index 000000000000..161ab8d923d8 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/rpl/file_store.go @@ -0,0 +1,414 @@ +package rpl + +import ( + "fmt" + "github.com/siddontang/go/log" + "github.com/siddontang/go/num" + "github.com/siddontang/ledisdb/config" + "io/ioutil" + "os" + "sort" + "sync" + "time" +) + +const ( + defaultMaxLogFileSize = int64(256 * 1024 * 1024) + + maxLogFileSize = int64(1024 * 1024 * 1024) + + defaultLogNumInFile = int64(1024 * 1024) +) + +/* + File Store: + 00000001.data + 00000001.meta + 00000002.data + 00000002.meta + + data: log1 data | log2 data | magic data + + if data has no magic data, it means 
that we don't close replication gracefully. + so we must repair the log data + log data: id (bigendian uint64), create time (bigendian uint32), compression (byte), data len(bigendian uint32), data + split data = log0 data + [padding 0] -> file % pagesize() == 0 + + meta: log1 offset | log2 offset + log offset: bigendian uint32 | bigendian uint32 + + //sha1 of github.com/siddontang/ledisdb 20 bytes + magic data = "\x1c\x1d\xb8\x88\xff\x9e\x45\x55\x40\xf0\x4c\xda\xe0\xce\x47\xde\x65\x48\x71\x17" + + we must guarantee that the log id is monotonic increment strictly. + if log1's id is 1, log2 must be 2 +*/ + +type FileStore struct { + LogStore + + cfg *config.Config + + base string + + rm sync.RWMutex + wm sync.Mutex + + rs tableReaders + w *tableWriter + + quit chan struct{} +} + +func NewFileStore(base string, cfg *config.Config) (*FileStore, error) { + s := new(FileStore) + + s.quit = make(chan struct{}) + + var err error + + if err = os.MkdirAll(base, 0755); err != nil { + return nil, err + } + + s.base = base + + if cfg.Replication.MaxLogFileSize == 0 { + cfg.Replication.MaxLogFileSize = defaultMaxLogFileSize + } + + cfg.Replication.MaxLogFileSize = num.MinInt64(cfg.Replication.MaxLogFileSize, maxLogFileSize) + + s.cfg = cfg + + if err = s.load(); err != nil { + return nil, err + } + + index := int64(1) + if len(s.rs) != 0 { + index = s.rs[len(s.rs)-1].index + 1 + } + + s.w = newTableWriter(s.base, index, cfg.Replication.MaxLogFileSize, cfg.Replication.UseMmap) + s.w.SetSyncType(cfg.Replication.SyncLog) + + go s.checkTableReaders() + + return s, nil +} + +func (s *FileStore) GetLog(id uint64, l *Log) error { + //first search in table writer + if err := s.w.GetLog(id, l); err == nil { + return nil + } else if err != ErrLogNotFound { + return err + } + + s.rm.RLock() + t := s.rs.Search(id) + + if t == nil { + s.rm.RUnlock() + + return ErrLogNotFound + } + + err := t.GetLog(id, l) + s.rm.RUnlock() + + return err +} + +func (s *FileStore) FirstID() (uint64, error) { + id := uint64(0) + + s.rm.RLock() + if len(s.rs) > 0 { + id = s.rs[0].first + } else { + id = 0 + } + s.rm.RUnlock() + + if id > 0 { + return id, nil + } + + //if id = 0, + + return s.w.First(), nil +} + +func (s *FileStore) LastID() (uint64, error) { + id := s.w.Last() + if id > 0 { + return id, nil + } + + //if table writer has no last id, we may find in the last table reader + + s.rm.RLock() + if len(s.rs) > 0 { + id = s.rs[len(s.rs)-1].last + } + s.rm.RUnlock() + + return id, nil +} + +func (s *FileStore) StoreLog(l *Log) error { + s.wm.Lock() + err := s.storeLog(l) + s.wm.Unlock() + return err +} + +func (s *FileStore) storeLog(l *Log) error { + err := s.w.StoreLog(l) + if err == nil { + return nil + } else if err != errTableNeedFlush { + return err + } + + var r *tableReader + r, err = s.w.Flush() + + if err != nil { + log.Fatal("write table flush error %s, can not store!!!", err.Error()) + + s.w.Close() + + return err + } + + s.rm.Lock() + s.rs = append(s.rs, r) + s.rm.Unlock() + + err = s.w.StoreLog(l) + + return err +} + +func (s *FileStore) PurgeExpired(n int64) error { + s.rm.Lock() + + purges := []*tableReader{} + + t := uint32(time.Now().Unix() - int64(n)) + + for i, r := range s.rs { + if r.lastTime > t { + purges = s.rs[0:i] + s.rs = s.rs[i:] + break + } + } + + s.rm.Unlock() + + s.purgeTableReaders(purges) + + return nil +} + +func (s *FileStore) Sync() error { + return s.w.Sync() +} + +func (s *FileStore) Clear() error { + s.wm.Lock() + s.rm.Lock() + + defer func() { + s.rm.Unlock() + s.wm.Unlock() + }() + + 
s.w.Close() + + for i := range s.rs { + s.rs[i].Close() + } + + s.rs = tableReaders{} + + if err := os.RemoveAll(s.base); err != nil { + return err + } + + if err := os.MkdirAll(s.base, 0755); err != nil { + return err + } + + s.w = newTableWriter(s.base, 1, s.cfg.Replication.MaxLogFileSize, s.cfg.Replication.UseMmap) + + return nil +} + +func (s *FileStore) Close() error { + close(s.quit) + + s.wm.Lock() + s.rm.Lock() + + if r, err := s.w.Flush(); err != nil { + if err != errNilHandler { + log.Error("close err: %s", err.Error()) + } + } else { + r.Close() + s.w.Close() + } + + for i := range s.rs { + s.rs[i].Close() + } + + s.rs = tableReaders{} + + s.rm.Unlock() + s.wm.Unlock() + + return nil +} + +func (s *FileStore) checkTableReaders() { + t := time.NewTicker(60 * time.Second) + defer t.Stop() + for { + select { + case <-t.C: + s.rm.Lock() + + for _, r := range s.rs { + if !r.Keepalived() { + r.Close() + } + } + + purges := []*tableReader{} + maxNum := s.cfg.Replication.MaxLogFileNum + num := len(s.rs) + if num > maxNum { + purges = s.rs[:num-maxNum] + s.rs = s.rs[num-maxNum:] + } + + s.rm.Unlock() + + s.purgeTableReaders(purges) + + case <-s.quit: + return + } + } +} + +func (s *FileStore) purgeTableReaders(purges []*tableReader) { + for _, r := range purges { + dataName := fmtTableDataName(r.base, r.index) + metaName := fmtTableMetaName(r.base, r.index) + r.Close() + if err := os.Remove(dataName); err != nil { + log.Error("purge table data %s err: %s", dataName, err.Error()) + } + if err := os.Remove(metaName); err != nil { + log.Error("purge table meta %s err: %s", metaName, err.Error()) + } + + } +} + +func (s *FileStore) load() error { + fs, err := ioutil.ReadDir(s.base) + if err != nil { + return err + } + + s.rs = make(tableReaders, 0, len(fs)) + + var r *tableReader + var index int64 + for _, f := range fs { + if _, err := fmt.Sscanf(f.Name(), "%08d.data", &index); err == nil { + if r, err = newTableReader(s.base, index, s.cfg.Replication.UseMmap); err != nil { + log.Error("load table %s err: %s", f.Name(), err.Error()) + } else { + s.rs = append(s.rs, r) + } + } + } + + if err := s.rs.check(); err != nil { + return err + } + + return nil +} + +type tableReaders []*tableReader + +func (ts tableReaders) Len() int { + return len(ts) +} + +func (ts tableReaders) Swap(i, j int) { + ts[i], ts[j] = ts[j], ts[i] +} + +func (ts tableReaders) Less(i, j int) bool { + return ts[i].first < ts[j].first +} + +func (ts tableReaders) Search(id uint64) *tableReader { + i, j := 0, len(ts)-1 + + for i <= j { + h := i + (j-i)/2 + + if ts[h].first <= id && id <= ts[h].last { + return ts[h] + } else if ts[h].last < id { + i = h + 1 + } else { + j = h - 1 + } + } + + return nil +} + +func (ts tableReaders) check() error { + if len(ts) == 0 { + return nil + } + + sort.Sort(ts) + + first := ts[0].first + last := ts[0].last + index := ts[0].index + + if first == 0 || first > last { + return fmt.Errorf("invalid log in table %s", ts[0]) + } + + for i := 1; i < len(ts); i++ { + if ts[i].first <= last { + return fmt.Errorf("invalid first log id %d in table %s", ts[i].first, ts[i]) + } + + if ts[i].index <= index { + return fmt.Errorf("invalid index %d in table %s", ts[i].index, ts[i]) + } + + first = ts[i].first + last = ts[i].last + index = ts[i].index + } + return nil +} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/file_table.go b/vendor/github.com/siddontang/ledisdb/rpl/file_table.go new file mode 100644 index 000000000000..b4dcbfd9536c --- /dev/null +++ 
b/vendor/github.com/siddontang/ledisdb/rpl/file_table.go @@ -0,0 +1,570 @@ +package rpl + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "github.com/siddontang/go/log" + "github.com/siddontang/go/sync2" + "io" + "path" + "sync" + "time" +) + +var ( + magic = []byte("\x1c\x1d\xb8\x88\xff\x9e\x45\x55\x40\xf0\x4c\xda\xe0\xce\x47\xde\x65\x48\x71\x17") + errTableNeedFlush = errors.New("write table need flush") + errNilHandler = errors.New("nil write handler") +) + +const tableReaderKeepaliveInterval int64 = 30 + +func fmtTableDataName(base string, index int64) string { + return path.Join(base, fmt.Sprintf("%08d.data", index)) +} + +func fmtTableMetaName(base string, index int64) string { + return path.Join(base, fmt.Sprintf("%08d.meta", index)) +} + +type tableReader struct { + sync.Mutex + + base string + index int64 + + data readFile + meta readFile + + first uint64 + last uint64 + + lastTime uint32 + + lastReadTime sync2.AtomicInt64 + + useMmap bool +} + +func newTableReader(base string, index int64, useMmap bool) (*tableReader, error) { + if index <= 0 { + return nil, fmt.Errorf("invalid index %d", index) + } + t := new(tableReader) + t.base = base + t.index = index + + t.useMmap = useMmap + + var err error + + if err = t.check(); err != nil { + log.Error("check %d error: %s, try to repair", t.index, err.Error()) + + if err = t.repair(); err != nil { + log.Error("repair %d error: %s", t.index, err.Error()) + return nil, err + } + } + + t.close() + + return t, nil +} + +func (t *tableReader) String() string { + return fmt.Sprintf("%d", t.index) +} + +func (t *tableReader) Close() { + t.Lock() + + t.close() + + t.Unlock() +} + +func (t *tableReader) close() { + if t.data != nil { + t.data.Close() + t.data = nil + } + + if t.meta != nil { + t.meta.Close() + t.meta = nil + } +} + +func (t *tableReader) Keepalived() bool { + l := t.lastReadTime.Get() + if l > 0 && time.Now().Unix()-l > tableReaderKeepaliveInterval { + return false + } + + return true +} + +func (t *tableReader) getLogPos(index int) (uint32, error) { + var buf [4]byte + if _, err := t.meta.ReadAt(buf[0:4], int64(index)*4); err != nil { + return 0, err + } + + return binary.BigEndian.Uint32(buf[0:4]), nil +} + +func (t *tableReader) checkData() error { + var err error + //check will use raw file mode + if t.data, err = newReadFile(false, fmtTableDataName(t.base, t.index)); err != nil { + return err + } + + if t.data.Size() < len(magic) { + return fmt.Errorf("data file %s size %d too short", t.data.Name(), t.data.Size()) + } + + buf := make([]byte, len(magic)) + if _, err := t.data.ReadAt(buf, int64(t.data.Size()-len(magic))); err != nil { + return err + } + + if !bytes.Equal(magic, buf) { + return fmt.Errorf("data file %s invalid magic data %q", t.data.Name(), buf) + } + + return nil +} + +func (t *tableReader) checkMeta() error { + var err error + //check will use raw file mode + if t.meta, err = newReadFile(false, fmtTableMetaName(t.base, t.index)); err != nil { + return err + } + + if t.meta.Size()%4 != 0 || t.meta.Size() == 0 { + return fmt.Errorf("meta file %s invalid offset len %d, must 4 multiple and not 0", t.meta.Name(), t.meta.Size()) + } + + return nil +} + +func (t *tableReader) check() error { + var err error + + if err := t.checkMeta(); err != nil { + return err + } + + if err := t.checkData(); err != nil { + return err + } + + firstLogPos, _ := t.getLogPos(0) + lastLogPos, _ := t.getLogPos(t.meta.Size()/4 - 1) + + if firstLogPos != 0 { + return fmt.Errorf("invalid first log pos %d, must 0", 
firstLogPos) + } + + var l Log + if _, err = t.decodeLogHead(&l, t.data, int64(firstLogPos)); err != nil { + return fmt.Errorf("decode first log err %s", err.Error()) + } + + t.first = l.ID + var n int64 + if n, err = t.decodeLogHead(&l, t.data, int64(lastLogPos)); err != nil { + return fmt.Errorf("decode last log err %s", err.Error()) + } else if n+int64(len(magic)) != int64(t.data.Size()) { + return fmt.Errorf("extra log data at offset %d", n) + } + + t.last = l.ID + t.lastTime = l.CreateTime + + if t.first > t.last { + return fmt.Errorf("invalid log table first %d > last %d", t.first, t.last) + } else if (t.last - t.first + 1) != uint64(t.meta.Size()/4) { + return fmt.Errorf("invalid log table, first %d, last %d, and log num %d", t.first, t.last, t.meta.Size()/4) + } + + return nil +} + +func (t *tableReader) repair() error { + t.close() + + var err error + var data writeFile + var meta writeFile + + //repair will use raw file mode + data, err = newWriteFile(false, fmtTableDataName(t.base, t.index), 0) + data.SetOffset(int64(data.Size())) + + meta, err = newWriteFile(false, fmtTableMetaName(t.base, t.index), int64(defaultLogNumInFile*4)) + + var l Log + var pos int64 = 0 + var nextPos int64 = 0 + b := make([]byte, 4) + + t.first = 0 + t.last = 0 + + for { + nextPos, err = t.decodeLogHead(&l, data, pos) + if err != nil { + //if error, we may lost all logs from pos + log.Error("%s may lost logs from %d", data.Name(), pos) + break + } + + if l.ID == 0 { + log.Error("%s may lost logs from %d, invalid log 0", data.Name(), pos) + break + } + + if t.first == 0 { + t.first = l.ID + } + + if t.last == 0 { + t.last = l.ID + } else if l.ID <= t.last { + log.Error("%s may lost logs from %d, invalid logid %d", t.data.Name(), pos, l.ID) + break + } + + t.last = l.ID + t.lastTime = l.CreateTime + + binary.BigEndian.PutUint32(b, uint32(pos)) + meta.Write(b) + + pos = nextPos + + t.lastTime = l.CreateTime + } + + var e error + if err := meta.Close(); err != nil { + e = err + } + + data.SetOffset(pos) + + if _, err = data.Write(magic); err != nil { + log.Error("write magic error %s", err.Error()) + } + + if err = data.Close(); err != nil { + return err + } + + return e +} + +func (t *tableReader) decodeLogHead(l *Log, r io.ReaderAt, pos int64) (int64, error) { + dataLen, err := l.DecodeHeadAt(r, pos) + if err != nil { + return 0, err + } + + return pos + int64(l.HeadSize()) + int64(dataLen), nil +} + +func (t *tableReader) GetLog(id uint64, l *Log) error { + if id < t.first || id > t.last { + return ErrLogNotFound + } + + t.lastReadTime.Set(time.Now().Unix()) + + t.Lock() + + if err := t.openTable(); err != nil { + t.close() + t.Unlock() + return err + } + t.Unlock() + + pos, err := t.getLogPos(int(id - t.first)) + if err != nil { + return err + } + + if err := l.DecodeAt(t.data, int64(pos)); err != nil { + return err + } else if l.ID != id { + return fmt.Errorf("invalid log id %d != %d", l.ID, id) + } + + return nil +} + +func (t *tableReader) openTable() error { + var err error + if t.data == nil { + if t.data, err = newReadFile(t.useMmap, fmtTableDataName(t.base, t.index)); err != nil { + return err + } + } + + if t.meta == nil { + if t.meta, err = newReadFile(t.useMmap, fmtTableMetaName(t.base, t.index)); err != nil { + return err + } + + } + + return nil +} + +type tableWriter struct { + sync.RWMutex + + data writeFile + meta writeFile + + base string + index int64 + + first uint64 + last uint64 + lastTime uint32 + + maxLogSize int64 + + closed bool + + syncType int + + posBuf []byte + + useMmap 
bool +} + +func newTableWriter(base string, index int64, maxLogSize int64, useMmap bool) *tableWriter { + if index <= 0 { + panic(fmt.Errorf("invalid index %d", index)) + } + + t := new(tableWriter) + + t.base = base + t.index = index + + t.maxLogSize = maxLogSize + + t.closed = false + + t.posBuf = make([]byte, 4) + + t.useMmap = useMmap + + return t +} + +func (t *tableWriter) String() string { + return fmt.Sprintf("%d", t.index) +} + +func (t *tableWriter) SetMaxLogSize(s int64) { + t.maxLogSize = s +} + +func (t *tableWriter) SetSyncType(tp int) { + t.syncType = tp +} + +func (t *tableWriter) close() { + if t.meta != nil { + if err := t.meta.Close(); err != nil { + log.Fatal("close log meta error %s", err.Error()) + } + t.meta = nil + } + + if t.data != nil { + if _, err := t.data.Write(magic); err != nil { + log.Fatal("write magic error %s", err.Error()) + } + + if err := t.data.Close(); err != nil { + log.Fatal("close log data error %s", err.Error()) + } + t.data = nil + } +} + +func (t *tableWriter) Close() { + t.Lock() + t.closed = true + + t.close() + t.Unlock() +} + +func (t *tableWriter) First() uint64 { + t.Lock() + id := t.first + t.Unlock() + return id +} + +func (t *tableWriter) Last() uint64 { + t.Lock() + id := t.last + t.Unlock() + return id +} + +func (t *tableWriter) Flush() (*tableReader, error) { + t.Lock() + + if t.data == nil || t.meta == nil { + t.Unlock() + return nil, errNilHandler + } + + tr := new(tableReader) + tr.base = t.base + tr.index = t.index + + tr.first = t.first + tr.last = t.last + tr.lastTime = t.lastTime + tr.useMmap = t.useMmap + + t.close() + + t.first = 0 + t.last = 0 + t.index = t.index + 1 + + t.Unlock() + + return tr, nil +} + +func (t *tableWriter) StoreLog(l *Log) error { + t.Lock() + err := t.storeLog(l) + t.Unlock() + + return err +} + +func (t *tableWriter) openFile() error { + var err error + if t.data == nil { + if t.data, err = newWriteFile(t.useMmap, fmtTableDataName(t.base, t.index), t.maxLogSize+t.maxLogSize/10+int64(len(magic))); err != nil { + return err + } + } + + if t.meta == nil { + if t.meta, err = newWriteFile(t.useMmap, fmtTableMetaName(t.base, t.index), int64(defaultLogNumInFile*4)); err != nil { + return err + } + } + return err +} + +func (t *tableWriter) storeLog(l *Log) error { + if l.ID == 0 { + return ErrStoreLogID + } + + if t.closed { + return fmt.Errorf("table writer is closed") + } + + if t.last > 0 && l.ID != t.last+1 { + return ErrStoreLogID + } + + if t.data != nil && t.data.Offset() > t.maxLogSize { + return errTableNeedFlush + } + + var err error + if err = t.openFile(); err != nil { + return err + } + + offsetPos := t.data.Offset() + if err = l.Encode(t.data); err != nil { + return err + } + + binary.BigEndian.PutUint32(t.posBuf, uint32(offsetPos)) + if _, err = t.meta.Write(t.posBuf); err != nil { + return err + } + + if t.first == 0 { + t.first = l.ID + } + + t.last = l.ID + t.lastTime = l.CreateTime + + if t.syncType == 2 { + if err := t.data.Sync(); err != nil { + log.Error("sync table error %s", err.Error()) + } + } + + return nil +} + +func (t *tableWriter) GetLog(id uint64, l *Log) error { + t.RLock() + defer t.RUnlock() + + if id < t.first || id > t.last { + return ErrLogNotFound + } + + var buf [4]byte + if _, err := t.meta.ReadAt(buf[0:4], int64((id-t.first)*4)); err != nil { + return err + } + + offset := binary.BigEndian.Uint32(buf[0:4]) + + if err := l.DecodeAt(t.data, int64(offset)); err != nil { + return err + } else if l.ID != id { + return fmt.Errorf("invalid log id %d != %d", id, l.ID) + 
} + + return nil +} + +func (t *tableWriter) Sync() error { + t.Lock() + + var err error + if t.data != nil { + err = t.data.Sync() + t.Unlock() + return err + } + + if t.meta != nil { + err = t.meta.Sync() + } + + t.Unlock() + + return err +} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/goleveldb_store.go b/vendor/github.com/siddontang/ledisdb/rpl/goleveldb_store.go new file mode 100644 index 000000000000..5ece8d511b15 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/rpl/goleveldb_store.go @@ -0,0 +1,224 @@ +package rpl + +import ( + "bytes" + "fmt" + "github.com/siddontang/go/num" + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/store" + "os" + "sync" + "time" +) + +type GoLevelDBStore struct { + LogStore + + m sync.Mutex + db *store.DB + + cfg *config.Config + + first uint64 + last uint64 + + buf bytes.Buffer +} + +func (s *GoLevelDBStore) FirstID() (uint64, error) { + s.m.Lock() + id, err := s.firstID() + s.m.Unlock() + + return id, err +} + +func (s *GoLevelDBStore) LastID() (uint64, error) { + s.m.Lock() + id, err := s.lastID() + s.m.Unlock() + + return id, err +} + +func (s *GoLevelDBStore) firstID() (uint64, error) { + if s.first != InvalidLogID { + return s.first, nil + } + + it := s.db.NewIterator() + defer it.Close() + + it.SeekToFirst() + + if it.Valid() { + s.first = num.BytesToUint64(it.RawKey()) + } + + return s.first, nil +} + +func (s *GoLevelDBStore) lastID() (uint64, error) { + if s.last != InvalidLogID { + return s.last, nil + } + + it := s.db.NewIterator() + defer it.Close() + + it.SeekToLast() + + if it.Valid() { + s.last = num.BytesToUint64(it.RawKey()) + } + + return s.last, nil +} + +func (s *GoLevelDBStore) GetLog(id uint64, log *Log) error { + v, err := s.db.Get(num.Uint64ToBytes(id)) + if err != nil { + return err + } else if v == nil { + return ErrLogNotFound + } else { + return log.Decode(bytes.NewBuffer(v)) + } +} + +func (s *GoLevelDBStore) StoreLog(log *Log) error { + s.m.Lock() + defer s.m.Unlock() + + last, err := s.lastID() + if err != nil { + return err + } + + s.last = InvalidLogID + + s.buf.Reset() + + if log.ID != last+1 { + return ErrStoreLogID + } + + last = log.ID + key := num.Uint64ToBytes(log.ID) + + if err := log.Encode(&s.buf); err != nil { + return err + } + + if err = s.db.Put(key, s.buf.Bytes()); err != nil { + return err + } + + s.last = last + return nil +} + +func (s *GoLevelDBStore) PurgeExpired(n int64) error { + if n <= 0 { + return fmt.Errorf("invalid expired time %d", n) + } + + t := uint32(time.Now().Unix() - int64(n)) + + s.m.Lock() + defer s.m.Unlock() + + s.reset() + + it := s.db.NewIterator() + it.SeekToFirst() + + w := s.db.NewWriteBatch() + defer w.Rollback() + + l := new(Log) + for ; it.Valid(); it.Next() { + v := it.RawValue() + + if err := l.Unmarshal(v); err != nil { + return err + } else if l.CreateTime > t { + break + } else { + w.Delete(it.RawKey()) + } + } + + if err := w.Commit(); err != nil { + return err + } + + return nil +} + +func (s *GoLevelDBStore) Sync() error { + //no other way for sync, so ignore here + return nil +} + +func (s *GoLevelDBStore) reset() { + s.first = InvalidLogID + s.last = InvalidLogID +} + +func (s *GoLevelDBStore) Clear() error { + s.m.Lock() + defer s.m.Unlock() + + if s.db != nil { + s.db.Close() + } + + s.reset() + os.RemoveAll(s.cfg.DBPath) + + return s.open() +} + +func (s *GoLevelDBStore) Close() error { + s.m.Lock() + defer s.m.Unlock() + + if s.db == nil { + return nil + } + + err := s.db.Close() + s.db = nil + return err +} + +func (s 
*GoLevelDBStore) open() error { + var err error + + s.first = InvalidLogID + s.last = InvalidLogID + + s.db, err = store.Open(s.cfg) + return err +} + +func NewGoLevelDBStore(base string, syncLog int) (*GoLevelDBStore, error) { + cfg := config.NewConfigDefault() + cfg.DBName = "goleveldb" + cfg.DBPath = base + cfg.LevelDB.BlockSize = 16 * 1024 * 1024 + cfg.LevelDB.CacheSize = 64 * 1024 * 1024 + cfg.LevelDB.WriteBufferSize = 64 * 1024 * 1024 + cfg.LevelDB.Compression = false + cfg.DBSyncCommit = syncLog + + s := new(GoLevelDBStore) + s.cfg = cfg + + if err := s.open(); err != nil { + return nil, err + } + + return s, nil +} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/log.go b/vendor/github.com/siddontang/ledisdb/rpl/log.go new file mode 100644 index 000000000000..ad0b48cd4f79 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/rpl/log.go @@ -0,0 +1,167 @@ +package rpl + +import ( + "bytes" + "encoding/binary" + "io" + "sync" +) + +const LogHeadSize = 17 + +type Log struct { + ID uint64 + CreateTime uint32 + Compression uint8 + + Data []byte +} + +func (l *Log) HeadSize() int { + return LogHeadSize +} + +func (l *Log) Size() int { + return l.HeadSize() + len(l.Data) +} + +func (l *Log) Marshal() ([]byte, error) { + buf := bytes.NewBuffer(make([]byte, l.Size())) + buf.Reset() + + if err := l.Encode(buf); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +func (l *Log) Unmarshal(b []byte) error { + buf := bytes.NewBuffer(b) + + return l.Decode(buf) +} + +var headPool = sync.Pool{ + New: func() interface{} { return make([]byte, LogHeadSize) }, +} + +func (l *Log) Encode(w io.Writer) error { + b := headPool.Get().([]byte) + pos := 0 + + binary.BigEndian.PutUint64(b[pos:], l.ID) + pos += 8 + binary.BigEndian.PutUint32(b[pos:], uint32(l.CreateTime)) + pos += 4 + b[pos] = l.Compression + pos++ + binary.BigEndian.PutUint32(b[pos:], uint32(len(l.Data))) + + n, err := w.Write(b) + headPool.Put(b) + + if err != nil { + return err + } else if n != LogHeadSize { + return io.ErrShortWrite + } + + if n, err = w.Write(l.Data); err != nil { + return err + } else if n != len(l.Data) { + return io.ErrShortWrite + } + return nil +} + +func (l *Log) Decode(r io.Reader) error { + length, err := l.DecodeHead(r) + if err != nil { + return err + } + + l.growData(int(length)) + + if _, err := io.ReadFull(r, l.Data); err != nil { + return err + } + + return nil +} + +func (l *Log) DecodeHead(r io.Reader) (uint32, error) { + buf := headPool.Get().([]byte) + + if _, err := io.ReadFull(r, buf); err != nil { + headPool.Put(buf) + return 0, err + } + + length := l.decodeHeadBuf(buf) + + headPool.Put(buf) + + return length, nil +} + +func (l *Log) DecodeAt(r io.ReaderAt, pos int64) error { + length, err := l.DecodeHeadAt(r, pos) + if err != nil { + return err + } + + l.growData(int(length)) + var n int + n, err = r.ReadAt(l.Data, pos+int64(LogHeadSize)) + if err == io.EOF && n == len(l.Data) { + err = nil + } + + return err +} + +func (l *Log) growData(length int) { + l.Data = l.Data[0:0] + + if cap(l.Data) >= length { + l.Data = l.Data[0:length] + } else { + l.Data = make([]byte, length) + } +} + +func (l *Log) DecodeHeadAt(r io.ReaderAt, pos int64) (uint32, error) { + buf := headPool.Get().([]byte) + + n, err := r.ReadAt(buf, pos) + if err != nil && err != io.EOF { + headPool.Put(buf) + + return 0, err + } + + length := l.decodeHeadBuf(buf) + headPool.Put(buf) + + if err == io.EOF && (length != 0 || n != len(buf)) { + return 0, err + } + + return length, nil +} + +func (l *Log) 
decodeHeadBuf(buf []byte) uint32 { + pos := 0 + l.ID = binary.BigEndian.Uint64(buf[pos:]) + pos += 8 + + l.CreateTime = binary.BigEndian.Uint32(buf[pos:]) + pos += 4 + + l.Compression = uint8(buf[pos]) + pos++ + + length := binary.BigEndian.Uint32(buf[pos:]) + return length +} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/rpl.go b/vendor/github.com/siddontang/ledisdb/rpl/rpl.go new file mode 100644 index 000000000000..d232992579d3 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/rpl/rpl.go @@ -0,0 +1,331 @@ +package rpl + +import ( + "encoding/binary" + "github.com/siddontang/go/log" + "github.com/siddontang/go/snappy" + "github.com/siddontang/ledisdb/config" + "os" + "path" + "sync" + "time" +) + +type Stat struct { + FirstID uint64 + LastID uint64 + CommitID uint64 +} + +type Replication struct { + m sync.Mutex + + cfg *config.Config + + s LogStore + + commitID uint64 + commitLog *os.File + + quit chan struct{} + + wg sync.WaitGroup + + nc chan struct{} + + ncm sync.Mutex +} + +func NewReplication(cfg *config.Config) (*Replication, error) { + if len(cfg.Replication.Path) == 0 { + cfg.Replication.Path = path.Join(cfg.DataDir, "rpl") + } + + base := cfg.Replication.Path + + r := new(Replication) + + r.quit = make(chan struct{}) + r.nc = make(chan struct{}) + + r.cfg = cfg + + var err error + + switch cfg.Replication.StoreName { + case "goleveldb": + if r.s, err = NewGoLevelDBStore(path.Join(base, "wal"), cfg.Replication.SyncLog); err != nil { + return nil, err + } + default: + if r.s, err = NewFileStore(path.Join(base, "ldb"), cfg); err != nil { + return nil, err + } + } + + if r.commitLog, err = os.OpenFile(path.Join(base, "commit.log"), os.O_RDWR|os.O_CREATE, 0644); err != nil { + return nil, err + } + + if s, _ := r.commitLog.Stat(); s.Size() == 0 { + r.commitID = 0 + } else if err = binary.Read(r.commitLog, binary.BigEndian, &r.commitID); err != nil { + return nil, err + } + + r.wg.Add(1) + go r.run() + + return r, nil +} + +func (r *Replication) Close() error { + close(r.quit) + + r.wg.Wait() + + r.m.Lock() + defer r.m.Unlock() + + if r.s != nil { + r.s.Close() + r.s = nil + } + + if err := r.updateCommitID(r.commitID, true); err != nil { + log.Error("update commit id err %s", err.Error()) + } + + if r.commitLog != nil { + r.commitLog.Close() + r.commitLog = nil + } + + return nil +} + +func (r *Replication) Log(data []byte) (*Log, error) { + if r.cfg.Replication.Compression { + //todo optimize + var err error + if data, err = snappy.Encode(nil, data); err != nil { + return nil, err + } + } + + r.m.Lock() + + lastID, err := r.s.LastID() + if err != nil { + r.m.Unlock() + return nil, err + } + + commitId := r.commitID + if lastID < commitId { + lastID = commitId + } else if lastID > commitId { + r.m.Unlock() + return nil, ErrCommitIDBehind + } + + l := new(Log) + l.ID = lastID + 1 + l.CreateTime = uint32(time.Now().Unix()) + + if r.cfg.Replication.Compression { + l.Compression = 1 + } else { + l.Compression = 0 + } + + l.Data = data + + if err = r.s.StoreLog(l); err != nil { + r.m.Unlock() + return nil, err + } + + r.m.Unlock() + + r.ncm.Lock() + close(r.nc) + r.nc = make(chan struct{}) + r.ncm.Unlock() + + return l, nil +} + +func (r *Replication) WaitLog() <-chan struct{} { + r.ncm.Lock() + ch := r.nc + r.ncm.Unlock() + return ch +} + +func (r *Replication) StoreLog(log *Log) error { + r.m.Lock() + err := r.s.StoreLog(log) + r.m.Unlock() + + return err +} + +func (r *Replication) FirstLogID() (uint64, error) { + r.m.Lock() + id, err := r.s.FirstID() + r.m.Unlock() + + 
return id, err +} + +func (r *Replication) LastLogID() (uint64, error) { + r.m.Lock() + id, err := r.s.LastID() + r.m.Unlock() + return id, err +} + +func (r *Replication) LastCommitID() (uint64, error) { + r.m.Lock() + id := r.commitID + r.m.Unlock() + return id, nil +} + +func (r *Replication) UpdateCommitID(id uint64) error { + r.m.Lock() + err := r.updateCommitID(id, r.cfg.Replication.SyncLog == 2) + r.m.Unlock() + + return err +} + +func (r *Replication) Stat() (*Stat, error) { + r.m.Lock() + defer r.m.Unlock() + + s := &Stat{} + var err error + + if s.FirstID, err = r.s.FirstID(); err != nil { + return nil, err + } + + if s.LastID, err = r.s.LastID(); err != nil { + return nil, err + } + + s.CommitID = r.commitID + return s, nil +} + +func (r *Replication) updateCommitID(id uint64, force bool) error { + if force { + if _, err := r.commitLog.Seek(0, os.SEEK_SET); err != nil { + return err + } + + if err := binary.Write(r.commitLog, binary.BigEndian, id); err != nil { + return err + } + } + + r.commitID = id + + return nil +} + +func (r *Replication) CommitIDBehind() (bool, error) { + r.m.Lock() + + id, err := r.s.LastID() + if err != nil { + r.m.Unlock() + return false, err + } + + behind := id > r.commitID + r.m.Unlock() + + return behind, nil +} + +func (r *Replication) GetLog(id uint64, log *Log) error { + return r.s.GetLog(id, log) +} + +func (r *Replication) NextNeedCommitLog(log *Log) error { + r.m.Lock() + defer r.m.Unlock() + + id, err := r.s.LastID() + if err != nil { + return err + } + + if id <= r.commitID { + return ErrNoBehindLog + } + + return r.s.GetLog(r.commitID+1, log) + +} + +func (r *Replication) Clear() error { + return r.ClearWithCommitID(0) +} + +func (r *Replication) ClearWithCommitID(id uint64) error { + r.m.Lock() + defer r.m.Unlock() + + if err := r.s.Clear(); err != nil { + return err + } + + return r.updateCommitID(id, true) +} + +func (r *Replication) run() { + defer r.wg.Done() + + syncTc := time.NewTicker(1 * time.Second) + purgeTc := time.NewTicker(1 * time.Hour) + + for { + select { + case <-purgeTc.C: + n := (r.cfg.Replication.ExpiredLogDays * 24 * 3600) + r.m.Lock() + err := r.s.PurgeExpired(int64(n)) + r.m.Unlock() + if err != nil { + log.Error("purge expired log error %s", err.Error()) + } + case <-syncTc.C: + if r.cfg.Replication.SyncLog == 1 { + r.m.Lock() + err := r.s.Sync() + r.m.Unlock() + if err != nil { + log.Error("sync store error %s", err.Error()) + } + } + if r.cfg.Replication.SyncLog != 2 { + //we will sync commit id every 1 second + r.m.Lock() + err := r.updateCommitID(r.commitID, true) + r.m.Unlock() + + if err != nil { + log.Error("sync commitid error %s", err.Error()) + } + } + case <-r.quit: + syncTc.Stop() + purgeTc.Stop() + return + } + } +} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/store.go b/vendor/github.com/siddontang/ledisdb/rpl/store.go new file mode 100644 index 000000000000..9f985ec6be0f --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/rpl/store.go @@ -0,0 +1,36 @@ +package rpl + +import ( + "errors" +) + +const ( + InvalidLogID uint64 = 0 +) + +var ( + ErrLogNotFound = errors.New("log not found") + ErrStoreLogID = errors.New("log id is less") + ErrNoBehindLog = errors.New("no behind commit log") + ErrCommitIDBehind = errors.New("commit id is behind last log id") +) + +type LogStore interface { + GetLog(id uint64, log *Log) error + + FirstID() (uint64, error) + LastID() (uint64, error) + + // if log id is less than current last id, return error + StoreLog(log *Log) error + + // Delete logs before n 
seconds + PurgeExpired(n int64) error + + Sync() error + + // Clear all logs + Clear() error + + Close() error +} diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/const.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/const.go new file mode 100644 index 000000000000..1e7d0aecf983 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/boltdb/const.go @@ -0,0 +1,3 @@ +package boltdb + +const DBName = "boltdb" diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/db.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/db.go new file mode 100644 index 000000000000..ac8cc03f76dd --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/boltdb/db.go @@ -0,0 +1,172 @@ +package boltdb + +import ( + "github.com/boltdb/bolt" + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/store/driver" + "os" + "path" +) + +var bucketName = []byte("ledisdb") + +type Store struct { +} + +func (s Store) String() string { + return DBName +} + +func (s Store) Open(dbPath string, cfg *config.Config) (driver.IDB, error) { + os.MkdirAll(dbPath, 0755) + name := path.Join(dbPath, "ledis_bolt.db") + db := new(DB) + var err error + + db.path = name + db.cfg = cfg + + db.db, err = bolt.Open(name, 0600, nil) + if err != nil { + return nil, err + } + + var tx *bolt.Tx + tx, err = db.db.Begin(true) + if err != nil { + return nil, err + } + + _, err = tx.CreateBucketIfNotExists(bucketName) + if err != nil { + tx.Rollback() + return nil, err + } + + if err = tx.Commit(); err != nil { + return nil, err + } + + return db, nil +} + +func (s Store) Repair(path string, cfg *config.Config) error { + return nil +} + +type DB struct { + cfg *config.Config + db *bolt.DB + path string +} + +func (db *DB) Close() error { + return db.db.Close() +} + +func (db *DB) Get(key []byte) ([]byte, error) { + var value []byte + + t, err := db.db.Begin(false) + if err != nil { + return nil, err + } + defer t.Rollback() + + b := t.Bucket(bucketName) + + value = b.Get(key) + + if value == nil { + return nil, nil + } else { + return append([]byte{}, value...), nil + } +} + +func (db *DB) Put(key []byte, value []byte) error { + err := db.db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket(bucketName) + return b.Put(key, value) + }) + return err +} + +func (db *DB) Delete(key []byte) error { + err := db.db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket(bucketName) + return b.Delete(key) + }) + return err +} + +func (db *DB) SyncPut(key []byte, value []byte) error { + return db.Put(key, value) +} + +func (db *DB) SyncDelete(key []byte) error { + return db.Delete(key) +} + +func (db *DB) NewIterator() driver.IIterator { + tx, err := db.db.Begin(false) + if err != nil { + return &Iterator{} + } + b := tx.Bucket(bucketName) + + return &Iterator{ + tx: tx, + it: b.Cursor()} +} + +func (db *DB) NewWriteBatch() driver.IWriteBatch { + return driver.NewWriteBatch(db) +} + +func (db *DB) Begin() (driver.Tx, error) { + tx, err := db.db.Begin(true) + if err != nil { + return nil, err + } + + return &Tx{ + tx: tx, + b: tx.Bucket(bucketName), + }, nil +} + +func (db *DB) NewSnapshot() (driver.ISnapshot, error) { + return newSnapshot(db) +} + +func (db *DB) BatchPut(writes []driver.Write) error { + err := db.db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket(bucketName) + var err error + for _, w := range writes { + if w.Value == nil { + err = b.Delete(w.Key) + } else { + err = b.Put(w.Key, w.Value) + } + if err != nil { + return err + } + } + return nil + }) + return err +} + +func (db *DB) 
SyncBatchPut(writes []driver.Write) error { + return db.BatchPut(writes) +} + +func (db *DB) Compact() error { + return nil +} + +func init() { + driver.Register(Store{}) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/iterator.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/iterator.go new file mode 100644 index 000000000000..aea750824137 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/boltdb/iterator.go @@ -0,0 +1,50 @@ +package boltdb + +import ( + "github.com/boltdb/bolt" +) + +type Iterator struct { + tx *bolt.Tx + it *bolt.Cursor + key []byte + value []byte +} + +func (it *Iterator) Close() error { + if it.tx != nil { + return it.tx.Rollback() + } else { + return nil + } +} + +func (it *Iterator) First() { + it.key, it.value = it.it.First() +} + +func (it *Iterator) Last() { + it.key, it.value = it.it.Last() +} + +func (it *Iterator) Seek(key []byte) { + it.key, it.value = it.it.Seek(key) +} + +func (it *Iterator) Next() { + it.key, it.value = it.it.Next() +} +func (it *Iterator) Prev() { + it.key, it.value = it.it.Prev() +} + +func (it *Iterator) Valid() bool { + return !(it.key == nil && it.value == nil) +} + +func (it *Iterator) Key() []byte { + return it.key +} +func (it *Iterator) Value() []byte { + return it.value +} diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/snapshot.go new file mode 100644 index 000000000000..ad4df90be555 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/boltdb/snapshot.go @@ -0,0 +1,37 @@ +package boltdb + +import ( + "github.com/boltdb/bolt" + "github.com/siddontang/ledisdb/store/driver" +) + +type Snapshot struct { + tx *bolt.Tx + b *bolt.Bucket +} + +func newSnapshot(db *DB) (*Snapshot, error) { + tx, err := db.db.Begin(false) + if err != nil { + return nil, err + } + + return &Snapshot{ + tx: tx, + b: tx.Bucket(bucketName)}, nil +} + +func (s *Snapshot) Get(key []byte) ([]byte, error) { + return s.b.Get(key), nil +} + +func (s *Snapshot) NewIterator() driver.IIterator { + return &Iterator{ + tx: nil, + it: s.b.Cursor(), + } +} + +func (s *Snapshot) Close() { + s.tx.Rollback() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/tx.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/tx.go new file mode 100644 index 000000000000..8dba000f10ca --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/boltdb/tx.go @@ -0,0 +1,61 @@ +package boltdb + +import ( + "github.com/boltdb/bolt" + "github.com/siddontang/ledisdb/store/driver" +) + +type Tx struct { + tx *bolt.Tx + b *bolt.Bucket +} + +func (t *Tx) Get(key []byte) ([]byte, error) { + return t.b.Get(key), nil +} + +func (t *Tx) Put(key []byte, value []byte) error { + return t.b.Put(key, value) +} + +func (t *Tx) Delete(key []byte) error { + return t.b.Delete(key) +} + +func (t *Tx) NewIterator() driver.IIterator { + return &Iterator{ + tx: nil, + it: t.b.Cursor(), + } +} + +func (t *Tx) NewWriteBatch() driver.IWriteBatch { + return driver.NewWriteBatch(t) +} + +func (t *Tx) BatchPut(writes []driver.Write) error { + var err error + for _, w := range writes { + if w.Value == nil { + err = t.b.Delete(w.Key) + } else { + err = t.b.Put(w.Key, w.Value) + } + if err != nil { + return err + } + } + return nil +} + +func (t *Tx) SyncBatchPut(writes []driver.Write) error { + return t.BatchPut(writes) +} + +func (t *Tx) Rollback() error { + return t.tx.Rollback() +} + +func (t *Tx) Commit() error { + return t.tx.Commit() +} diff --git 
a/vendor/github.com/siddontang/ledisdb/store/db.go b/vendor/github.com/siddontang/ledisdb/store/db.go new file mode 100644 index 000000000000..9964c5ee86d4 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/db.go @@ -0,0 +1,179 @@ +package store + +import ( + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/store/driver" + "sync" + "time" +) + +type DB struct { + db driver.IDB + name string + + st *Stat + + cfg *config.Config + + lastCommit time.Time + + m sync.Mutex +} + +func (db *DB) Close() error { + return db.db.Close() +} + +func (db *DB) String() string { + return db.name +} + +func (db *DB) NewIterator() *Iterator { + db.st.IterNum.Add(1) + + it := new(Iterator) + it.it = db.db.NewIterator() + it.st = db.st + + return it +} + +func (db *DB) Get(key []byte) ([]byte, error) { + t := time.Now() + v, err := db.db.Get(key) + db.st.statGet(v, err) + db.st.GetTotalTime.Add(time.Now().Sub(t)) + return v, err +} + +func (db *DB) Put(key []byte, value []byte) error { + db.st.PutNum.Add(1) + + if db.needSyncCommit() { + return db.db.SyncPut(key, value) + + } else { + return db.db.Put(key, value) + + } +} + +func (db *DB) Delete(key []byte) error { + db.st.DeleteNum.Add(1) + + if db.needSyncCommit() { + return db.db.SyncDelete(key) + } else { + return db.db.Delete(key) + } +} + +func (db *DB) NewWriteBatch() *WriteBatch { + db.st.BatchNum.Add(1) + wb := new(WriteBatch) + wb.wb = db.db.NewWriteBatch() + wb.st = db.st + wb.db = db + return wb +} + +func (db *DB) NewSnapshot() (*Snapshot, error) { + db.st.SnapshotNum.Add(1) + + var err error + s := &Snapshot{} + if s.ISnapshot, err = db.db.NewSnapshot(); err != nil { + return nil, err + } + s.st = db.st + + return s, nil +} + +func (db *DB) Compact() error { + db.st.CompactNum.Add(1) + + t := time.Now() + err := db.db.Compact() + + db.st.CompactTotalTime.Add(time.Now().Sub(t)) + + return err +} + +func (db *DB) RangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { + return NewRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) +} + +func (db *DB) RevRangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { + return NewRevRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) +} + +//count < 0, unlimit. +// +//offset must >= 0, if < 0, will get nothing. +func (db *DB) RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { + return NewRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) +} + +//count < 0, unlimit. +// +//offset must >= 0, if < 0, will get nothing. 
+func (db *DB) RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { + return NewRevRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) +} + +func (db *DB) Begin() (*Tx, error) { + tx, err := db.db.Begin() + if err != nil { + return nil, err + } + + db.st.TxNum.Add(1) + + return &Tx{tx, db.st}, nil +} + +func (db *DB) Stat() *Stat { + return db.st +} + +func (db *DB) needSyncCommit() bool { + if db.cfg.DBSyncCommit == 0 { + return false + } else if db.cfg.DBSyncCommit == 2 { + return true + } else { + n := time.Now() + need := false + db.m.Lock() + + if n.Sub(db.lastCommit) > time.Second { + need = true + } + db.lastCommit = n + + db.m.Unlock() + return need + } + +} + +func (db *DB) GetSlice(key []byte) (Slice, error) { + if d, ok := db.db.(driver.ISliceGeter); ok { + t := time.Now() + v, err := d.GetSlice(key) + db.st.statGet(v, err) + db.st.GetTotalTime.Add(time.Now().Sub(t)) + return v, err + } else { + v, err := db.Get(key) + if err != nil { + return nil, err + } else if v == nil { + return nil, nil + } else { + return driver.GoSlice(v), nil + } + } +} diff --git a/vendor/github.com/siddontang/ledisdb/store/driver/batch.go b/vendor/github.com/siddontang/ledisdb/store/driver/batch.go new file mode 100644 index 000000000000..1c1e8992125d --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/driver/batch.go @@ -0,0 +1,70 @@ +package driver + +import ( + "github.com/syndtr/goleveldb/leveldb" +) + +type BatchPuter interface { + BatchPut([]Write) error + SyncBatchPut([]Write) error +} + +type Write struct { + Key []byte + Value []byte +} + +type WriteBatch struct { + d *leveldb.Batch + + wb []Write + w BatchPuter +} + +func (wb *WriteBatch) Close() { + wb.d.Reset() + wb.wb = wb.wb[0:0] +} + +func (wb *WriteBatch) Put(key, value []byte) { + if value == nil { + value = []byte{} + } + wb.wb = append(wb.wb, Write{key, value}) +} + +func (wb *WriteBatch) Delete(key []byte) { + wb.wb = append(wb.wb, Write{key, nil}) +} + +func (wb *WriteBatch) Commit() error { + return wb.w.BatchPut(wb.wb) +} + +func (wb *WriteBatch) SyncCommit() error { + return wb.w.SyncBatchPut(wb.wb) +} + +func (wb *WriteBatch) Rollback() error { + wb.wb = wb.wb[0:0] + return nil +} + +func (wb *WriteBatch) Data() []byte { + wb.d.Reset() + for _, w := range wb.wb { + if w.Value == nil { + wb.d.Delete(w.Key) + } else { + wb.d.Put(w.Key, w.Value) + } + } + return wb.d.Dump() +} + +func NewWriteBatch(puter BatchPuter) *WriteBatch { + return &WriteBatch{ + &leveldb.Batch{}, + []Write{}, + puter} +} diff --git a/vendor/github.com/siddontang/ledisdb/store/driver/driver.go b/vendor/github.com/siddontang/ledisdb/store/driver/driver.go new file mode 100644 index 000000000000..b571738df1d0 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/driver/driver.go @@ -0,0 +1,79 @@ +package driver + +import ( + "errors" +) + +var ( + ErrTxSupport = errors.New("transaction is not supported") +) + +type IDB interface { + Close() error + + Get(key []byte) ([]byte, error) + + Put(key []byte, value []byte) error + Delete(key []byte) error + + SyncPut(key []byte, value []byte) error + SyncDelete(key []byte) error + + NewIterator() IIterator + + NewWriteBatch() IWriteBatch + + NewSnapshot() (ISnapshot, error) + + Begin() (Tx, error) + + Compact() error +} + +type ISnapshot interface { + Get(key []byte) ([]byte, error) + NewIterator() IIterator + Close() +} + +type IIterator interface { + Close() error + + First() + Last() + Seek(key []byte) 
+ + Next() + Prev() + + Valid() bool + + Key() []byte + Value() []byte +} + +type IWriteBatch interface { + Put(key []byte, value []byte) + Delete(key []byte) + Commit() error + SyncCommit() error + Rollback() error + Data() []byte + Close() +} + +type Tx interface { + Get(key []byte) ([]byte, error) + Put(key []byte, value []byte) error + Delete(key []byte) error + + NewIterator() IIterator + NewWriteBatch() IWriteBatch + + Commit() error + Rollback() error +} + +type ISliceGeter interface { + GetSlice(key []byte) (ISlice, error) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/driver/slice.go b/vendor/github.com/siddontang/ledisdb/store/driver/slice.go new file mode 100644 index 000000000000..d0c80e0b8fcf --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/driver/slice.go @@ -0,0 +1,21 @@ +package driver + +type ISlice interface { + Data() []byte + Size() int + Free() +} + +type GoSlice []byte + +func (s GoSlice) Data() []byte { + return []byte(s) +} + +func (s GoSlice) Size() int { + return len(s) +} + +func (s GoSlice) Free() { + +} diff --git a/vendor/github.com/siddontang/ledisdb/store/driver/store.go b/vendor/github.com/siddontang/ledisdb/store/driver/store.go new file mode 100644 index 000000000000..dffa670552bf --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/driver/store.go @@ -0,0 +1,45 @@ +package driver + +import ( + "fmt" + "github.com/siddontang/ledisdb/config" +) + +type Store interface { + String() string + Open(path string, cfg *config.Config) (IDB, error) + Repair(path string, cfg *config.Config) error +} + +var dbs = map[string]Store{} + +func Register(s Store) { + name := s.String() + if _, ok := dbs[name]; ok { + panic(fmt.Errorf("store %s is registered", s)) + } + + dbs[name] = s +} + +func ListStores() []string { + s := []string{} + for k, _ := range dbs { + s = append(s, k) + } + + return s +} + +func GetStore(cfg *config.Config) (Store, error) { + if len(cfg.DBName) == 0 { + cfg.DBName = config.DefaultDBName + } + + s, ok := dbs[cfg.DBName] + if !ok { + return nil, fmt.Errorf("store %s is not registered", cfg.DBName) + } + + return s, nil +} diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/batch.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/batch.go new file mode 100644 index 000000000000..2032279a2a8e --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/goleveldb/batch.go @@ -0,0 +1,39 @@ +package goleveldb + +import ( + "github.com/syndtr/goleveldb/leveldb" +) + +type WriteBatch struct { + db *DB + wbatch *leveldb.Batch +} + +func (w *WriteBatch) Put(key, value []byte) { + w.wbatch.Put(key, value) +} + +func (w *WriteBatch) Delete(key []byte) { + w.wbatch.Delete(key) +} + +func (w *WriteBatch) Commit() error { + return w.db.db.Write(w.wbatch, nil) +} + +func (w *WriteBatch) SyncCommit() error { + return w.db.db.Write(w.wbatch, w.db.syncOpts) +} + +func (w *WriteBatch) Rollback() error { + w.wbatch.Reset() + return nil +} + +func (w *WriteBatch) Close() { + w.wbatch.Reset() +} + +func (w *WriteBatch) Data() []byte { + return w.wbatch.Dump() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/const.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/const.go new file mode 100644 index 000000000000..2fffa7c82bb4 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/goleveldb/const.go @@ -0,0 +1,4 @@ +package goleveldb + +const DBName = "goleveldb" +const MemDBName = "memory" diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/db.go 
b/vendor/github.com/siddontang/ledisdb/store/goleveldb/db.go new file mode 100644 index 000000000000..af8633b8cd7b --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/goleveldb/db.go @@ -0,0 +1,208 @@ +package goleveldb + +import ( + "github.com/syndtr/goleveldb/leveldb" + "github.com/syndtr/goleveldb/leveldb/cache" + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" + + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/store/driver" + + "os" +) + +const defaultFilterBits int = 10 + +type Store struct { +} + +func (s Store) String() string { + return DBName +} + +type MemStore struct { +} + +func (s MemStore) String() string { + return MemDBName +} + +type DB struct { + path string + + cfg *config.LevelDBConfig + + db *leveldb.DB + + opts *opt.Options + + iteratorOpts *opt.ReadOptions + + syncOpts *opt.WriteOptions + + cache cache.Cache + + filter filter.Filter +} + +func (s Store) Open(path string, cfg *config.Config) (driver.IDB, error) { + if err := os.MkdirAll(path, 0755); err != nil { + return nil, err + } + + db := new(DB) + db.path = path + db.cfg = &cfg.LevelDB + + db.initOpts() + + var err error + db.db, err = leveldb.OpenFile(db.path, db.opts) + + if err != nil { + return nil, err + } + + return db, nil +} + +func (s Store) Repair(path string, cfg *config.Config) error { + db, err := leveldb.RecoverFile(path, newOptions(&cfg.LevelDB)) + if err != nil { + return err + } + + db.Close() + return nil +} + +func (s MemStore) Open(path string, cfg *config.Config) (driver.IDB, error) { + db := new(DB) + db.path = path + db.cfg = &cfg.LevelDB + + db.initOpts() + + var err error + db.db, err = leveldb.Open(storage.NewMemStorage(), db.opts) + if err != nil { + return nil, err + } + + return db, nil +} + +func (s MemStore) Repair(path string, cfg *config.Config) error { + return nil +} + +func (db *DB) initOpts() { + db.opts = newOptions(db.cfg) + + db.iteratorOpts = &opt.ReadOptions{} + db.iteratorOpts.DontFillCache = true + + db.syncOpts = &opt.WriteOptions{} + db.syncOpts.Sync = true +} + +func newOptions(cfg *config.LevelDBConfig) *opt.Options { + opts := &opt.Options{} + opts.ErrorIfMissing = false + + opts.BlockCache = cache.NewLRUCache(cfg.CacheSize) + + //we must use bloomfilter + opts.Filter = filter.NewBloomFilter(defaultFilterBits) + + if !cfg.Compression { + opts.Compression = opt.NoCompression + } else { + opts.Compression = opt.SnappyCompression + } + + opts.BlockSize = cfg.BlockSize + opts.WriteBuffer = cfg.WriteBufferSize + opts.CachedOpenFiles = cfg.MaxOpenFiles + + //here we use default value, later add config support + opts.CompactionTableSize = 32 * 1024 * 1024 + opts.WriteL0SlowdownTrigger = 16 + opts.WriteL0PauseTrigger = 64 + + return opts +} + +func (db *DB) Close() error { + return db.db.Close() +} + +func (db *DB) Put(key, value []byte) error { + return db.db.Put(key, value, nil) +} + +func (db *DB) Get(key []byte) ([]byte, error) { + v, err := db.db.Get(key, nil) + if err == leveldb.ErrNotFound { + return nil, nil + } + return v, nil +} + +func (db *DB) Delete(key []byte) error { + return db.db.Delete(key, nil) +} + +func (db *DB) SyncPut(key []byte, value []byte) error { + return db.db.Put(key, value, db.syncOpts) +} + +func (db *DB) SyncDelete(key []byte) error { + return db.db.Delete(key, db.syncOpts) +} + +func (db *DB) NewWriteBatch() driver.IWriteBatch { + wb := &WriteBatch{ + db: db, + wbatch: 
new(leveldb.Batch), + } + return wb +} + +func (db *DB) NewIterator() driver.IIterator { + it := &Iterator{ + db.db.NewIterator(nil, db.iteratorOpts), + } + + return it +} + +func (db *DB) Begin() (driver.Tx, error) { + return nil, driver.ErrTxSupport +} + +func (db *DB) NewSnapshot() (driver.ISnapshot, error) { + snapshot, err := db.db.GetSnapshot() + if err != nil { + return nil, err + } + + s := &Snapshot{ + db: db, + snp: snapshot, + } + + return s, nil +} + +func (db *DB) Compact() error { + return db.db.CompactRange(util.Range{nil, nil}) +} + +func init() { + driver.Register(Store{}) + driver.Register(MemStore{}) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/iterator.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/iterator.go new file mode 100644 index 000000000000..c1fd8b5573bb --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/goleveldb/iterator.go @@ -0,0 +1,49 @@ +package goleveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/iterator" +) + +type Iterator struct { + it iterator.Iterator +} + +func (it *Iterator) Key() []byte { + return it.it.Key() +} + +func (it *Iterator) Value() []byte { + return it.it.Value() +} + +func (it *Iterator) Close() error { + if it.it != nil { + it.it.Release() + it.it = nil + } + return nil +} + +func (it *Iterator) Valid() bool { + return it.it.Valid() +} + +func (it *Iterator) Next() { + it.it.Next() +} + +func (it *Iterator) Prev() { + it.it.Prev() +} + +func (it *Iterator) First() { + it.it.First() +} + +func (it *Iterator) Last() { + it.it.Last() +} + +func (it *Iterator) Seek(key []byte) { + it.it.Seek(key) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/snapshot.go new file mode 100644 index 000000000000..c615579bb89d --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/goleveldb/snapshot.go @@ -0,0 +1,26 @@ +package goleveldb + +import ( + "github.com/siddontang/ledisdb/store/driver" + "github.com/syndtr/goleveldb/leveldb" +) + +type Snapshot struct { + db *DB + snp *leveldb.Snapshot +} + +func (s *Snapshot) Get(key []byte) ([]byte, error) { + return s.snp.Get(key, s.db.iteratorOpts) +} + +func (s *Snapshot) NewIterator() driver.IIterator { + it := &Iterator{ + s.snp.NewIterator(nil, s.db.iteratorOpts), + } + return it +} + +func (s *Snapshot) Close() { + s.snp.Release() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/iterator.go b/vendor/github.com/siddontang/ledisdb/store/iterator.go new file mode 100644 index 000000000000..d81976bdc827 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/iterator.go @@ -0,0 +1,333 @@ +package store + +import ( + "bytes" + "github.com/siddontang/ledisdb/store/driver" +) + +const ( + IteratorForward uint8 = 0 + IteratorBackward uint8 = 1 +) + +const ( + RangeClose uint8 = 0x00 + RangeLOpen uint8 = 0x01 + RangeROpen uint8 = 0x10 + RangeOpen uint8 = 0x11 +) + +// min must less or equal than max +// +// range type: +// +// close: [min, max] +// open: (min, max) +// lopen: (min, max] +// ropen: [min, max) +// +type Range struct { + Min []byte + Max []byte + + Type uint8 +} + +type Limit struct { + Offset int + Count int +} + +type Iterator struct { + it driver.IIterator + st *Stat +} + +// Returns a copy of key. +func (it *Iterator) Key() []byte { + k := it.it.Key() + if k == nil { + return nil + } + + return append([]byte{}, k...) +} + +// Returns a copy of value. 
+func (it *Iterator) Value() []byte { + v := it.it.Value() + if v == nil { + return nil + } + + return append([]byte{}, v...) +} + +// Returns a reference of key. +// you must be careful that it will be changed after next iterate. +func (it *Iterator) RawKey() []byte { + return it.it.Key() +} + +// Returns a reference of value. +// you must be careful that it will be changed after next iterate. +func (it *Iterator) RawValue() []byte { + return it.it.Value() +} + +// Copy key to b, if b len is small or nil, returns a new one. +func (it *Iterator) BufKey(b []byte) []byte { + k := it.RawKey() + if k == nil { + return nil + } + if b == nil { + b = []byte{} + } + + b = b[0:0] + return append(b, k...) +} + +// Copy value to b, if b len is small or nil, returns a new one. +func (it *Iterator) BufValue(b []byte) []byte { + v := it.RawValue() + if v == nil { + return nil + } + + if b == nil { + b = []byte{} + } + + b = b[0:0] + return append(b, v...) +} + +func (it *Iterator) Close() { + if it.it != nil { + it.st.IterCloseNum.Add(1) + it.it.Close() + it.it = nil + } +} + +func (it *Iterator) Valid() bool { + return it.it.Valid() +} + +func (it *Iterator) Next() { + it.st.IterSeekNum.Add(1) + it.it.Next() +} + +func (it *Iterator) Prev() { + it.st.IterSeekNum.Add(1) + it.it.Prev() +} + +func (it *Iterator) SeekToFirst() { + it.st.IterSeekNum.Add(1) + it.it.First() +} + +func (it *Iterator) SeekToLast() { + it.st.IterSeekNum.Add(1) + it.it.Last() +} + +func (it *Iterator) Seek(key []byte) { + it.st.IterSeekNum.Add(1) + it.it.Seek(key) +} + +// Finds by key, if not found, nil returns. +func (it *Iterator) Find(key []byte) []byte { + it.Seek(key) + if it.Valid() { + k := it.RawKey() + if k == nil { + return nil + } else if bytes.Equal(k, key) { + return it.Value() + } + } + + return nil +} + +// Finds by key, if not found, nil returns, else a reference of value returns. +// you must be careful that it will be changed after next iterate. 
+func (it *Iterator) RawFind(key []byte) []byte { + it.Seek(key) + if it.Valid() { + k := it.RawKey() + if k == nil { + return nil + } else if bytes.Equal(k, key) { + return it.RawValue() + } + } + + return nil +} + +type RangeLimitIterator struct { + it *Iterator + + r *Range + l *Limit + + step int + + //0 for IteratorForward, 1 for IteratorBackward + direction uint8 +} + +func (it *RangeLimitIterator) Key() []byte { + return it.it.Key() +} + +func (it *RangeLimitIterator) Value() []byte { + return it.it.Value() +} + +func (it *RangeLimitIterator) RawKey() []byte { + return it.it.RawKey() +} + +func (it *RangeLimitIterator) RawValue() []byte { + return it.it.RawValue() +} + +func (it *RangeLimitIterator) BufKey(b []byte) []byte { + return it.it.BufKey(b) +} + +func (it *RangeLimitIterator) BufValue(b []byte) []byte { + return it.it.BufValue(b) +} + +func (it *RangeLimitIterator) Valid() bool { + if it.l.Offset < 0 { + return false + } else if !it.it.Valid() { + return false + } else if it.l.Count >= 0 && it.step >= it.l.Count { + return false + } + + if it.direction == IteratorForward { + if it.r.Max != nil { + r := bytes.Compare(it.it.RawKey(), it.r.Max) + if it.r.Type&RangeROpen > 0 { + return !(r >= 0) + } else { + return !(r > 0) + } + } + } else { + if it.r.Min != nil { + r := bytes.Compare(it.it.RawKey(), it.r.Min) + if it.r.Type&RangeLOpen > 0 { + return !(r <= 0) + } else { + return !(r < 0) + } + } + } + + return true +} + +func (it *RangeLimitIterator) Next() { + it.step++ + + if it.direction == IteratorForward { + it.it.Next() + } else { + it.it.Prev() + } +} + +func (it *RangeLimitIterator) Close() { + it.it.Close() +} + +func NewRangeLimitIterator(i *Iterator, r *Range, l *Limit) *RangeLimitIterator { + return rangeLimitIterator(i, r, l, IteratorForward) +} + +func NewRevRangeLimitIterator(i *Iterator, r *Range, l *Limit) *RangeLimitIterator { + return rangeLimitIterator(i, r, l, IteratorBackward) +} + +func NewRangeIterator(i *Iterator, r *Range) *RangeLimitIterator { + return rangeLimitIterator(i, r, &Limit{0, -1}, IteratorForward) +} + +func NewRevRangeIterator(i *Iterator, r *Range) *RangeLimitIterator { + return rangeLimitIterator(i, r, &Limit{0, -1}, IteratorBackward) +} + +func rangeLimitIterator(i *Iterator, r *Range, l *Limit, direction uint8) *RangeLimitIterator { + it := new(RangeLimitIterator) + + it.it = i + + it.r = r + it.l = l + it.direction = direction + + it.step = 0 + + if l.Offset < 0 { + return it + } + + if direction == IteratorForward { + if r.Min == nil { + it.it.SeekToFirst() + } else { + it.it.Seek(r.Min) + + if r.Type&RangeLOpen > 0 { + if it.it.Valid() && bytes.Equal(it.it.RawKey(), r.Min) { + it.it.Next() + } + } + } + } else { + if r.Max == nil { + it.it.SeekToLast() + } else { + it.it.Seek(r.Max) + + if !it.it.Valid() { + it.it.SeekToLast() + } else { + if !bytes.Equal(it.it.RawKey(), r.Max) { + it.it.Prev() + } + } + + if r.Type&RangeROpen > 0 { + if it.it.Valid() && bytes.Equal(it.it.RawKey(), r.Max) { + it.it.Prev() + } + } + } + } + + for i := 0; i < l.Offset; i++ { + if it.it.Valid() { + if it.direction == IteratorForward { + it.it.Next() + } else { + it.it.Prev() + } + } + } + + return it +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/batch.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/batch.go new file mode 100644 index 000000000000..027aa39cdb95 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/batch.go @@ -0,0 +1,104 @@ +// +build leveldb + +package leveldb + +// #cgo LDFLAGS: 
-lleveldb +// #include "leveldb/c.h" +// #include "leveldb_ext.h" +import "C" + +import ( + "github.com/syndtr/goleveldb/leveldb" + "unsafe" +) + +type WriteBatch struct { + db *DB + wbatch *C.leveldb_writebatch_t + + gbatch *leveldb.Batch +} + +func newWriteBatch(db *DB) *WriteBatch { + w := new(WriteBatch) + w.db = db + w.wbatch = C.leveldb_writebatch_create() + w.gbatch = new(leveldb.Batch) + + return w +} + +func (w *WriteBatch) Close() { + if w.wbatch != nil { + C.leveldb_writebatch_destroy(w.wbatch) + w.wbatch = nil + } + + w.gbatch = nil +} + +func (w *WriteBatch) Put(key, value []byte) { + var k, v *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + if len(value) != 0 { + v = (*C.char)(unsafe.Pointer(&value[0])) + } + + lenk := len(key) + lenv := len(value) + + C.leveldb_writebatch_put(w.wbatch, k, C.size_t(lenk), v, C.size_t(lenv)) +} + +func (w *WriteBatch) Delete(key []byte) { + C.leveldb_writebatch_delete(w.wbatch, + (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))) +} + +func (w *WriteBatch) Commit() error { + return w.commit(w.db.writeOpts) +} + +func (w *WriteBatch) SyncCommit() error { + return w.commit(w.db.syncOpts) +} + +func (w *WriteBatch) Rollback() error { + C.leveldb_writebatch_clear(w.wbatch) + + return nil +} + +func (w *WriteBatch) commit(wb *WriteOptions) error { + var errStr *C.char + C.leveldb_write(w.db.db, wb.Opt, w.wbatch, &errStr) + if errStr != nil { + return saveError(errStr) + } + return nil +} + +//export leveldb_writebatch_iterate_put +func leveldb_writebatch_iterate_put(p unsafe.Pointer, k *C.char, klen C.size_t, v *C.char, vlen C.size_t) { + b := (*leveldb.Batch)(p) + key := slice(unsafe.Pointer(k), int(klen)) + value := slice(unsafe.Pointer(v), int(vlen)) + b.Put(key, value) +} + +//export leveldb_writebatch_iterate_delete +func leveldb_writebatch_iterate_delete(p unsafe.Pointer, k *C.char, klen C.size_t) { + b := (*leveldb.Batch)(p) + key := slice(unsafe.Pointer(k), int(klen)) + b.Delete(key) +} + +func (w *WriteBatch) Data() []byte { + w.gbatch.Reset() + C.leveldb_writebatch_iterate_ext(w.wbatch, + unsafe.Pointer(w.gbatch)) + b := w.gbatch.Dump() + return b +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/cache.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/cache.go new file mode 100644 index 000000000000..e5587cbf89e2 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/cache.go @@ -0,0 +1,20 @@ +// +build leveldb + +package leveldb + +// #cgo LDFLAGS: -lleveldb +// #include +// #include "leveldb/c.h" +import "C" + +type Cache struct { + Cache *C.leveldb_cache_t +} + +func NewLRUCache(capacity int) *Cache { + return &Cache{C.leveldb_cache_create_lru(C.size_t(capacity))} +} + +func (c *Cache) Close() { + C.leveldb_cache_destroy(c.Cache) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/const.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/const.go new file mode 100644 index 000000000000..df5b3c7a83b5 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/const.go @@ -0,0 +1,3 @@ +package leveldb + +const DBName = "leveldb" diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/db.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/db.go new file mode 100644 index 000000000000..64bbc2bfddf7 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/db.go @@ -0,0 +1,315 @@ +// +build leveldb + +// Package leveldb is a wrapper for c++ leveldb +package leveldb + +/* +#cgo LDFLAGS: -lleveldb +#include +#include 
"leveldb_ext.h" +*/ +import "C" + +import ( + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/store/driver" + "os" + "runtime" + "unsafe" +) + +const defaultFilterBits int = 10 + +type Store struct { +} + +func (s Store) String() string { + return DBName +} + +func (s Store) Open(path string, cfg *config.Config) (driver.IDB, error) { + if err := os.MkdirAll(path, 0755); err != nil { + return nil, err + } + + db := new(DB) + db.path = path + db.cfg = &cfg.LevelDB + + if err := db.open(); err != nil { + return nil, err + } + + return db, nil +} + +func (s Store) Repair(path string, cfg *config.Config) error { + db := new(DB) + db.cfg = &cfg.LevelDB + db.path = path + + err := db.open() + defer db.Close() + + //open ok, do not need repair + if err == nil { + return nil + } + + var errStr *C.char + ldbname := C.CString(path) + defer C.leveldb_free(unsafe.Pointer(ldbname)) + + C.leveldb_repair_db(db.opts.Opt, ldbname, &errStr) + if errStr != nil { + return saveError(errStr) + } + return nil +} + +type DB struct { + path string + + cfg *config.LevelDBConfig + + db *C.leveldb_t + + opts *Options + + //for default read and write options + readOpts *ReadOptions + writeOpts *WriteOptions + iteratorOpts *ReadOptions + + syncOpts *WriteOptions + + cache *Cache + + filter *FilterPolicy +} + +func (db *DB) open() error { + db.initOptions(db.cfg) + + var errStr *C.char + ldbname := C.CString(db.path) + defer C.leveldb_free(unsafe.Pointer(ldbname)) + + db.db = C.leveldb_open(db.opts.Opt, ldbname, &errStr) + if errStr != nil { + db.db = nil + return saveError(errStr) + } + return nil +} + +func (db *DB) initOptions(cfg *config.LevelDBConfig) { + opts := NewOptions() + + opts.SetCreateIfMissing(true) + + db.cache = NewLRUCache(cfg.CacheSize) + opts.SetCache(db.cache) + + //we must use bloomfilter + db.filter = NewBloomFilter(defaultFilterBits) + opts.SetFilterPolicy(db.filter) + + if !cfg.Compression { + opts.SetCompression(NoCompression) + } else { + opts.SetCompression(SnappyCompression) + } + + opts.SetBlockSize(cfg.BlockSize) + + opts.SetWriteBufferSize(cfg.WriteBufferSize) + + opts.SetMaxOpenFiles(cfg.MaxOpenFiles) + + db.opts = opts + + db.readOpts = NewReadOptions() + db.writeOpts = NewWriteOptions() + + db.syncOpts = NewWriteOptions() + db.syncOpts.SetSync(true) + + db.iteratorOpts = NewReadOptions() + db.iteratorOpts.SetFillCache(false) +} + +func (db *DB) Close() error { + if db.db != nil { + C.leveldb_close(db.db) + db.db = nil + } + + db.opts.Close() + + if db.cache != nil { + db.cache.Close() + } + + if db.filter != nil { + db.filter.Close() + } + + db.readOpts.Close() + db.writeOpts.Close() + db.iteratorOpts.Close() + + return nil +} + +func (db *DB) Put(key, value []byte) error { + return db.put(db.writeOpts, key, value) +} + +func (db *DB) Get(key []byte) ([]byte, error) { + return db.get(db.readOpts, key) +} + +func (db *DB) Delete(key []byte) error { + return db.delete(db.writeOpts, key) +} + +func (db *DB) SyncPut(key []byte, value []byte) error { + return db.put(db.syncOpts, key, value) +} + +func (db *DB) SyncDelete(key []byte) error { + return db.delete(db.syncOpts, key) +} + +func (db *DB) NewWriteBatch() driver.IWriteBatch { + wb := newWriteBatch(db) + + runtime.SetFinalizer(wb, func(w *WriteBatch) { + w.Close() + }) + + return wb +} + +func (db *DB) NewIterator() driver.IIterator { + it := new(Iterator) + + it.it = C.leveldb_create_iterator(db.db, db.iteratorOpts.Opt) + + return it +} + +func (db *DB) NewSnapshot() (driver.ISnapshot, error) { + snap := 
&Snapshot{ + db: db, + snap: C.leveldb_create_snapshot(db.db), + readOpts: NewReadOptions(), + iteratorOpts: NewReadOptions(), + } + snap.readOpts.SetSnapshot(snap) + snap.iteratorOpts.SetSnapshot(snap) + snap.iteratorOpts.SetFillCache(false) + + return snap, nil +} + +func (db *DB) put(wo *WriteOptions, key, value []byte) error { + var errStr *C.char + var k, v *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + if len(value) != 0 { + v = (*C.char)(unsafe.Pointer(&value[0])) + } + + lenk := len(key) + lenv := len(value) + C.leveldb_put( + db.db, wo.Opt, k, C.size_t(lenk), v, C.size_t(lenv), &errStr) + + if errStr != nil { + return saveError(errStr) + } + return nil +} + +func (db *DB) get(ro *ReadOptions, key []byte) ([]byte, error) { + var errStr *C.char + var vallen C.size_t + var k *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + + value := C.leveldb_get( + db.db, ro.Opt, k, C.size_t(len(key)), &vallen, &errStr) + + if errStr != nil { + return nil, saveError(errStr) + } + + if value == nil { + return nil, nil + } + + defer C.leveldb_free(unsafe.Pointer(value)) + + return C.GoBytes(unsafe.Pointer(value), C.int(vallen)), nil +} + +func (db *DB) getSlice(ro *ReadOptions, key []byte) (driver.ISlice, error) { + var errStr *C.char + var vallen C.size_t + var k *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + + value := C.leveldb_get( + db.db, ro.Opt, k, C.size_t(len(key)), &vallen, &errStr) + + if errStr != nil { + return nil, saveError(errStr) + } + + if value == nil { + return nil, nil + } + + return NewCSlice(unsafe.Pointer(value), int(vallen)), nil +} + +func (db *DB) delete(wo *WriteOptions, key []byte) error { + var errStr *C.char + var k *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + + C.leveldb_delete( + db.db, wo.Opt, k, C.size_t(len(key)), &errStr) + + if errStr != nil { + return saveError(errStr) + } + return nil +} + +func (db *DB) Begin() (driver.Tx, error) { + return nil, driver.ErrTxSupport +} + +func (db *DB) Compact() error { + C.leveldb_compact_range(db.db, nil, 0, nil, 0) + return nil +} + +func (db *DB) GetSlice(key []byte) (driver.ISlice, error) { + return db.getSlice(db.readOpts, key) +} + +func init() { + driver.Register(Store{}) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/filterpolicy.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/filterpolicy.go new file mode 100644 index 000000000000..640139fb8b8e --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/filterpolicy.go @@ -0,0 +1,21 @@ +// +build leveldb + +package leveldb + +// #cgo LDFLAGS: -lleveldb +// #include +// #include "leveldb/c.h" +import "C" + +type FilterPolicy struct { + Policy *C.leveldb_filterpolicy_t +} + +func NewBloomFilter(bitsPerKey int) *FilterPolicy { + policy := C.leveldb_filterpolicy_create_bloom(C.int(bitsPerKey)) + return &FilterPolicy{policy} +} + +func (fp *FilterPolicy) Close() { + C.leveldb_filterpolicy_destroy(fp.Policy) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/iterator.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/iterator.go new file mode 100644 index 000000000000..49cfd7db18ea --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/iterator.go @@ -0,0 +1,70 @@ +// +build leveldb + +package leveldb + +// #cgo LDFLAGS: -lleveldb +// #include +// #include "leveldb/c.h" +// #include "leveldb_ext.h" +import "C" + +import ( + "unsafe" +) + +type Iterator struct { + it 
*C.leveldb_iterator_t + isValid C.uchar +} + +func (it *Iterator) Key() []byte { + var klen C.size_t + kdata := C.leveldb_iter_key(it.it, &klen) + if kdata == nil { + return nil + } + + return slice(unsafe.Pointer(kdata), int(C.int(klen))) +} + +func (it *Iterator) Value() []byte { + var vlen C.size_t + vdata := C.leveldb_iter_value(it.it, &vlen) + if vdata == nil { + return nil + } + + return slice(unsafe.Pointer(vdata), int(C.int(vlen))) +} + +func (it *Iterator) Close() error { + if it.it != nil { + C.leveldb_iter_destroy(it.it) + it.it = nil + } + return nil +} + +func (it *Iterator) Valid() bool { + return ucharToBool(it.isValid) +} + +func (it *Iterator) Next() { + it.isValid = C.leveldb_iter_next_ext(it.it) +} + +func (it *Iterator) Prev() { + it.isValid = C.leveldb_iter_prev_ext(it.it) +} + +func (it *Iterator) First() { + it.isValid = C.leveldb_iter_seek_to_first_ext(it.it) +} + +func (it *Iterator) Last() { + it.isValid = C.leveldb_iter_seek_to_last_ext(it.it) +} + +func (it *Iterator) Seek(key []byte) { + it.isValid = C.leveldb_iter_seek_ext(it.it, (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.cc b/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.cc new file mode 100644 index 000000000000..540b7397b01b --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.cc @@ -0,0 +1,95 @@ +// +build leveldb + +#include "leveldb_ext.h" + +#include +//#include + +//#include "leveldb/db.h" + +//using namespace leveldb; + +extern "C" { + +// static bool SaveError(char** errptr, const Status& s) { +// assert(errptr != NULL); +// if (s.ok()) { +// return false; +// } else if (*errptr == NULL) { +// *errptr = strdup(s.ToString().c_str()); +// } else { +// free(*errptr); +// *errptr = strdup(s.ToString().c_str()); +// } +// return true; +// } + +// void* leveldb_get_ext( +// leveldb_t* db, +// const leveldb_readoptions_t* options, +// const char* key, size_t keylen, +// char** valptr, +// size_t* vallen, +// char** errptr) { + +// std::string *tmp = new(std::string); + +// //very tricky, maybe changed with c++ leveldb upgrade +// Status s = (*(DB**)db)->Get(*(ReadOptions*)options, Slice(key, keylen), tmp); + +// if (s.ok()) { +// *valptr = (char*)tmp->data(); +// *vallen = tmp->size(); +// } else { +// delete(tmp); +// tmp = NULL; +// *valptr = NULL; +// *vallen = 0; +// if (!s.IsNotFound()) { +// SaveError(errptr, s); +// } +// } +// return tmp; +// } + +// void leveldb_get_free_ext(void* context) { +// std::string* s = (std::string*)context; + +// delete(s); +// } + + +unsigned char leveldb_iter_seek_to_first_ext(leveldb_iterator_t* iter) { + leveldb_iter_seek_to_first(iter); + return leveldb_iter_valid(iter); +} + +unsigned char leveldb_iter_seek_to_last_ext(leveldb_iterator_t* iter) { + leveldb_iter_seek_to_last(iter); + return leveldb_iter_valid(iter); +} + +unsigned char leveldb_iter_seek_ext(leveldb_iterator_t* iter, const char* k, size_t klen) { + leveldb_iter_seek(iter, k, klen); + return leveldb_iter_valid(iter); +} + +unsigned char leveldb_iter_next_ext(leveldb_iterator_t* iter) { + leveldb_iter_next(iter); + return leveldb_iter_valid(iter); +} + +unsigned char leveldb_iter_prev_ext(leveldb_iterator_t* iter) { + leveldb_iter_prev(iter); + return leveldb_iter_valid(iter); +} + +extern void leveldb_writebatch_iterate_put(void*, const char* k, size_t klen, const char* v, size_t vlen); +extern void leveldb_writebatch_iterate_delete(void*, const char* k, size_t 
klen); + +void leveldb_writebatch_iterate_ext(leveldb_writebatch_t* w, void *p) { + leveldb_writebatch_iterate(w, p, + leveldb_writebatch_iterate_put, leveldb_writebatch_iterate_delete); +} + +} \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.h b/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.h new file mode 100644 index 000000000000..3eed41bdf9b0 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.h @@ -0,0 +1,41 @@ +// +build leveldb + +#ifndef LEVELDB_EXT_H +#define LEVELDB_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "leveldb/c.h" + + +// /* Returns NULL if not found. Otherwise stores the value in **valptr. +// Stores the length of the value in *vallen. +// Returns a context must be later to free*/ +// extern void* leveldb_get_ext( +// leveldb_t* db, +// const leveldb_readoptions_t* options, +// const char* key, size_t keylen, +// char** valptr, +// size_t* vallen, +// char** errptr); + +// // Free context returns by leveldb_get_ext +// extern void leveldb_get_free_ext(void* context); + + +// Below iterator functions like leveldb iterator but returns valid status for iterator +extern unsigned char leveldb_iter_seek_to_first_ext(leveldb_iterator_t*); +extern unsigned char leveldb_iter_seek_to_last_ext(leveldb_iterator_t*); +extern unsigned char leveldb_iter_seek_ext(leveldb_iterator_t*, const char* k, size_t klen); +extern unsigned char leveldb_iter_next_ext(leveldb_iterator_t*); +extern unsigned char leveldb_iter_prev_ext(leveldb_iterator_t*); + +extern void leveldb_writebatch_iterate_ext(leveldb_writebatch_t*, void* p); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/options.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/options.go new file mode 100644 index 000000000000..bd3ad1694b41 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/options.go @@ -0,0 +1,122 @@ +// +build leveldb + +package leveldb + +// #cgo LDFLAGS: -lleveldb +// #include "leveldb/c.h" +import "C" + +type CompressionOpt int + +const ( + NoCompression = CompressionOpt(0) + SnappyCompression = CompressionOpt(1) +) + +type Options struct { + Opt *C.leveldb_options_t +} + +type ReadOptions struct { + Opt *C.leveldb_readoptions_t +} + +type WriteOptions struct { + Opt *C.leveldb_writeoptions_t +} + +func NewOptions() *Options { + opt := C.leveldb_options_create() + return &Options{opt} +} + +func NewReadOptions() *ReadOptions { + opt := C.leveldb_readoptions_create() + return &ReadOptions{opt} +} + +func NewWriteOptions() *WriteOptions { + opt := C.leveldb_writeoptions_create() + return &WriteOptions{opt} +} + +func (o *Options) Close() { + C.leveldb_options_destroy(o.Opt) +} + +func (o *Options) SetComparator(cmp *C.leveldb_comparator_t) { + C.leveldb_options_set_comparator(o.Opt, cmp) +} + +func (o *Options) SetErrorIfExists(error_if_exists bool) { + eie := boolToUchar(error_if_exists) + C.leveldb_options_set_error_if_exists(o.Opt, eie) +} + +func (o *Options) SetCache(cache *Cache) { + C.leveldb_options_set_cache(o.Opt, cache.Cache) +} + +func (o *Options) SetWriteBufferSize(s int) { + C.leveldb_options_set_write_buffer_size(o.Opt, C.size_t(s)) +} + +func (o *Options) SetParanoidChecks(pc bool) { + C.leveldb_options_set_paranoid_checks(o.Opt, boolToUchar(pc)) +} + +func (o *Options) SetMaxOpenFiles(n int) { + C.leveldb_options_set_max_open_files(o.Opt, C.int(n)) +} + +func (o 
*Options) SetBlockSize(s int) { + C.leveldb_options_set_block_size(o.Opt, C.size_t(s)) +} + +func (o *Options) SetBlockRestartInterval(n int) { + C.leveldb_options_set_block_restart_interval(o.Opt, C.int(n)) +} + +func (o *Options) SetCompression(t CompressionOpt) { + C.leveldb_options_set_compression(o.Opt, C.int(t)) +} + +func (o *Options) SetCreateIfMissing(b bool) { + C.leveldb_options_set_create_if_missing(o.Opt, boolToUchar(b)) +} + +func (o *Options) SetFilterPolicy(fp *FilterPolicy) { + var policy *C.leveldb_filterpolicy_t + if fp != nil { + policy = fp.Policy + } + C.leveldb_options_set_filter_policy(o.Opt, policy) +} + +func (ro *ReadOptions) Close() { + C.leveldb_readoptions_destroy(ro.Opt) +} + +func (ro *ReadOptions) SetVerifyChecksums(b bool) { + C.leveldb_readoptions_set_verify_checksums(ro.Opt, boolToUchar(b)) +} + +func (ro *ReadOptions) SetFillCache(b bool) { + C.leveldb_readoptions_set_fill_cache(ro.Opt, boolToUchar(b)) +} + +func (ro *ReadOptions) SetSnapshot(snap *Snapshot) { + var s *C.leveldb_snapshot_t + if snap != nil { + s = snap.snap + } + C.leveldb_readoptions_set_snapshot(ro.Opt, s) +} + +func (wo *WriteOptions) Close() { + C.leveldb_writeoptions_destroy(wo.Opt) +} + +func (wo *WriteOptions) SetSync(b bool) { + C.leveldb_writeoptions_set_sync(wo.Opt, boolToUchar(b)) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/slice.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/slice.go new file mode 100644 index 000000000000..83ebf55c0369 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/slice.go @@ -0,0 +1,40 @@ +// +build leveldb + +package leveldb + +// #cgo LDFLAGS: -lleveldb +// #include "leveldb/c.h" +import "C" + +import ( + "reflect" + "unsafe" +) + +type CSlice struct { + data unsafe.Pointer + size int +} + +func NewCSlice(p unsafe.Pointer, n int) *CSlice { + return &CSlice{p, n} +} + +func (s *CSlice) Data() []byte { + var value []byte + + sH := (*reflect.SliceHeader)(unsafe.Pointer(&value)) + sH.Cap = int(s.size) + sH.Len = int(s.size) + sH.Data = uintptr(s.data) + + return value +} + +func (s *CSlice) Size() int { + return int(s.size) +} + +func (s *CSlice) Free() { + C.leveldb_free(s.data) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/snapshot.go new file mode 100644 index 000000000000..bdc8d514da8f --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/snapshot.go @@ -0,0 +1,39 @@ +// +build leveldb + +package leveldb + +// #cgo LDFLAGS: -lleveldb +// #include "leveldb/c.h" +import "C" + +import ( + "github.com/siddontang/ledisdb/store/driver" +) + +type Snapshot struct { + db *DB + snap *C.leveldb_snapshot_t + readOpts *ReadOptions + iteratorOpts *ReadOptions +} + +func (s *Snapshot) Get(key []byte) ([]byte, error) { + return s.db.get(s.readOpts, key) +} + +func (s *Snapshot) GetSlice(key []byte) (driver.ISlice, error) { + return s.db.getSlice(s.readOpts, key) +} + +func (s *Snapshot) NewIterator() driver.IIterator { + it := new(Iterator) + it.it = C.leveldb_create_iterator(s.db.db, s.db.iteratorOpts.Opt) + return it + +} + +func (s *Snapshot) Close() { + C.leveldb_release_snapshot(s.db.db, s.snap) + s.iteratorOpts.Close() + s.readOpts.Close() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/util.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/util.go new file mode 100644 index 000000000000..6efe33b661d6 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/leveldb/util.go @@ 
-0,0 +1,45 @@ +// +build leveldb + +package leveldb + +// #include "leveldb/c.h" +import "C" + +import ( + "fmt" + "reflect" + "unsafe" +) + +func boolToUchar(b bool) C.uchar { + uc := C.uchar(0) + if b { + uc = C.uchar(1) + } + return uc +} + +func ucharToBool(uc C.uchar) bool { + if uc == C.uchar(0) { + return false + } + return true +} + +func saveError(errStr *C.char) error { + if errStr != nil { + gs := C.GoString(errStr) + C.leveldb_free(unsafe.Pointer(errStr)) + return fmt.Errorf(gs) + } + return nil +} + +func slice(p unsafe.Pointer, n int) []byte { + var b []byte + pbyte := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + pbyte.Data = uintptr(p) + pbyte.Len = n + pbyte.Cap = n + return b +} diff --git a/vendor/github.com/siddontang/ledisdb/store/mdb/const.go b/vendor/github.com/siddontang/ledisdb/store/mdb/const.go new file mode 100644 index 000000000000..cdc70e05fd25 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/mdb/const.go @@ -0,0 +1,3 @@ +package mdb + +const DBName = "lmdb" diff --git a/vendor/github.com/siddontang/ledisdb/store/mdb/mdb.go b/vendor/github.com/siddontang/ledisdb/store/mdb/mdb.go new file mode 100644 index 000000000000..6bf26f64308b --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/mdb/mdb.go @@ -0,0 +1,316 @@ +// +build !windows + +package mdb + +import ( + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/store/driver" + mdb "github.com/szferi/gomdb" + "os" +) + +type Store struct { +} + +func (s Store) String() string { + return DBName +} + +type MDB struct { + env *mdb.Env + db mdb.DBI + path string + cfg *config.Config +} + +func (s Store) Open(path string, c *config.Config) (driver.IDB, error) { + mapSize := c.LMDB.MapSize + noSync := c.LMDB.NoSync + + if mapSize <= 0 { + mapSize = 500 * 1024 * 1024 + } + + env, err := mdb.NewEnv() + if err != nil { + return MDB{}, err + } + + // TODO: max dbs should be configurable + if err := env.SetMaxDBs(1); err != nil { + return MDB{}, err + } + + if err := env.SetMapSize(uint64(mapSize)); err != nil { + return MDB{}, err + } + + if _, err := os.Stat(path); err != nil { + err = os.MkdirAll(path, 0755) + if err != nil { + return MDB{}, err + } + } + + var flags uint = mdb.CREATE + if noSync { + flags |= mdb.NOSYNC | mdb.NOMETASYNC | mdb.WRITEMAP | mdb.MAPASYNC + } + + err = env.Open(path, flags, 0755) + if err != nil { + return MDB{}, err + } + + tx, err := env.BeginTxn(nil, 0) + if err != nil { + return MDB{}, err + } + + dbi, err := tx.DBIOpen(nil, mdb.CREATE) + if err != nil { + return MDB{}, err + } + + if err := tx.Commit(); err != nil { + return MDB{}, err + } + + db := MDB{ + env: env, + db: dbi, + path: path, + } + + return db, nil +} + +func (s Store) Repair(path string, c *config.Config) error { + println("llmd not supports repair") + return nil +} + +func (db MDB) Put(key, value []byte) error { + itr := db.iterator(false) + defer itr.Close() + + itr.err = itr.c.Put(key, value, 0) + itr.setState() + return itr.err +} + +func (db MDB) BatchPut(writes []driver.Write) error { + itr := db.iterator(false) + defer itr.Close() + + for _, w := range writes { + if w.Value == nil { + itr.key, itr.value, itr.err = itr.c.Get(w.Key, nil, mdb.SET) + if itr.err == nil { + itr.err = itr.c.Del(0) + } + } else { + itr.err = itr.c.Put(w.Key, w.Value, 0) + } + + if itr.err != nil && itr.err != mdb.NotFound { + break + } + } + itr.setState() + + return itr.err +} + +func (db MDB) SyncBatchPut(writes []driver.Write) error { + if err := db.BatchPut(writes); err != nil { + return err 
+ } + + return db.env.Sync(1) +} + +func (db MDB) Get(key []byte) ([]byte, error) { + tx, err := db.env.BeginTxn(nil, mdb.RDONLY) + if err != nil { + return nil, err + } + defer tx.Commit() + + v, err := tx.Get(db.db, key) + if err == mdb.NotFound { + return nil, nil + } + return v, err +} + +func (db MDB) Delete(key []byte) error { + itr := db.iterator(false) + defer itr.Close() + + itr.key, itr.value, itr.err = itr.c.Get(key, nil, mdb.SET) + if itr.err == nil { + itr.err = itr.c.Del(0) + } + itr.setState() + return itr.Error() +} + +func (db MDB) SyncPut(key []byte, value []byte) error { + if err := db.Put(key, value); err != nil { + return err + } + + return db.env.Sync(1) +} + +func (db MDB) SyncDelete(key []byte) error { + if err := db.Delete(key); err != nil { + return err + } + + return db.env.Sync(1) +} + +type MDBIterator struct { + key []byte + value []byte + c *mdb.Cursor + tx *mdb.Txn + valid bool + err error + + closeAutoCommit bool +} + +func (itr *MDBIterator) Key() []byte { + return itr.key +} + +func (itr *MDBIterator) Value() []byte { + return itr.value +} + +func (itr *MDBIterator) Valid() bool { + return itr.valid +} + +func (itr *MDBIterator) Error() error { + return itr.err +} + +func (itr *MDBIterator) getCurrent() { + itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.GET_CURRENT) + itr.setState() +} + +func (itr *MDBIterator) Seek(key []byte) { + itr.key, itr.value, itr.err = itr.c.Get(key, nil, mdb.SET_RANGE) + itr.setState() +} +func (itr *MDBIterator) Next() { + itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.NEXT) + itr.setState() +} + +func (itr *MDBIterator) Prev() { + itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.PREV) + itr.setState() +} + +func (itr *MDBIterator) First() { + itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.FIRST) + itr.setState() +} + +func (itr *MDBIterator) Last() { + itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.LAST) + itr.setState() +} + +func (itr *MDBIterator) setState() { + if itr.err != nil { + if itr.err == mdb.NotFound { + itr.err = nil + } + itr.valid = false + } else { + itr.valid = true + } +} + +func (itr *MDBIterator) Close() error { + if err := itr.c.Close(); err != nil { + itr.tx.Abort() + return err + } + + if !itr.closeAutoCommit { + return itr.err + } + + if itr.err != nil { + itr.tx.Abort() + return itr.err + } + return itr.tx.Commit() +} + +func (_ MDB) Name() string { + return "lmdb" +} + +func (db MDB) Path() string { + return db.path +} + +func (db MDB) iterator(rdonly bool) *MDBIterator { + flags := uint(0) + if rdonly { + flags = mdb.RDONLY + } + tx, err := db.env.BeginTxn(nil, flags) + if err != nil { + return &MDBIterator{nil, nil, nil, nil, false, err, true} + } + + c, err := tx.CursorOpen(db.db) + if err != nil { + tx.Abort() + return &MDBIterator{nil, nil, nil, nil, false, err, true} + } + + return &MDBIterator{nil, nil, c, tx, true, nil, true} +} + +func (db MDB) Close() error { + db.env.DBIClose(db.db) + if err := db.env.Close(); err != nil { + panic(err) + } + return nil +} + +func (db MDB) NewIterator() driver.IIterator { + return db.iterator(true) +} + +func (db MDB) NewWriteBatch() driver.IWriteBatch { + return driver.NewWriteBatch(db) +} + +func (db MDB) Begin() (driver.Tx, error) { + return newTx(db) +} + +func (db MDB) NewSnapshot() (driver.ISnapshot, error) { + return newSnapshot(db) +} + +func (db MDB) Compact() error { + return nil +} + +func init() { + driver.Register(Store{}) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/mdb/snapshot.go 
b/vendor/github.com/siddontang/ledisdb/store/mdb/snapshot.go new file mode 100644 index 000000000000..270907d7117c --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/mdb/snapshot.go @@ -0,0 +1,43 @@ +// +build !windows + +package mdb + +import ( + "github.com/siddontang/ledisdb/store/driver" + mdb "github.com/szferi/gomdb" +) + +type Snapshot struct { + db mdb.DBI + tx *mdb.Txn +} + +func newSnapshot(db MDB) (*Snapshot, error) { + tx, err := db.env.BeginTxn(nil, mdb.RDONLY) + if err != nil { + return nil, err + } + + return &Snapshot{db.db, tx}, nil +} + +func (s *Snapshot) Get(key []byte) ([]byte, error) { + v, err := s.tx.Get(s.db, key) + if err == mdb.NotFound { + return nil, nil + } + return v, err +} + +func (s *Snapshot) NewIterator() driver.IIterator { + c, err := s.tx.CursorOpen(s.db) + if err != nil { + return &MDBIterator{nil, nil, nil, nil, false, err, false} + } + + return &MDBIterator{nil, nil, c, s.tx, true, nil, false} +} + +func (s *Snapshot) Close() { + s.tx.Commit() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/mdb/tx.go b/vendor/github.com/siddontang/ledisdb/store/mdb/tx.go new file mode 100644 index 000000000000..e2e3be0171be --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/mdb/tx.go @@ -0,0 +1,90 @@ +// +build !windows + +package mdb + +import ( + "github.com/siddontang/ledisdb/store/driver" + mdb "github.com/szferi/gomdb" +) + +type Tx struct { + db mdb.DBI + tx *mdb.Txn +} + +func newTx(db MDB) (*Tx, error) { + tx, err := db.env.BeginTxn(nil, uint(0)) + if err != nil { + return nil, err + } + + return &Tx{db.db, tx}, nil +} + +func (t *Tx) Get(key []byte) ([]byte, error) { + v, err := t.tx.Get(t.db, key) + if err == mdb.NotFound { + return nil, nil + } + return v, err +} + +func (t *Tx) Put(key []byte, value []byte) error { + return t.tx.Put(t.db, key, value, mdb.NODUPDATA) +} + +func (t *Tx) Delete(key []byte) error { + return t.tx.Del(t.db, key, nil) +} + +func (t *Tx) NewIterator() driver.IIterator { + return t.newIterator() +} + +func (t *Tx) newIterator() *MDBIterator { + c, err := t.tx.CursorOpen(t.db) + if err != nil { + return &MDBIterator{nil, nil, nil, nil, false, err, false} + } + + return &MDBIterator{nil, nil, c, t.tx, true, nil, false} +} + +func (t *Tx) NewWriteBatch() driver.IWriteBatch { + return driver.NewWriteBatch(t) +} + +func (t *Tx) BatchPut(writes []driver.Write) error { + itr := t.newIterator() + + for _, w := range writes { + if w.Value == nil { + itr.key, itr.value, itr.err = itr.c.Get(w.Key, nil, mdb.SET) + if itr.err == nil { + itr.err = itr.c.Del(0) + } + } else { + itr.err = itr.c.Put(w.Key, w.Value, 0) + } + + if itr.err != nil && itr.err != mdb.NotFound { + break + } + } + itr.setState() + + return itr.Close() +} + +func (t *Tx) SyncBatchPut(writes []driver.Write) error { + return t.BatchPut(writes) +} + +func (t *Tx) Rollback() error { + t.tx.Abort() + return nil +} + +func (t *Tx) Commit() error { + return t.tx.Commit() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/batch.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/batch.go new file mode 100644 index 000000000000..bb727e7015ef --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/batch.go @@ -0,0 +1,83 @@ +// +build rocksdb + +package rocksdb + +// #cgo LDFLAGS: -lrocksdb +// #include "rocksdb/c.h" +// #include "rocksdb_ext.h" +import "C" + +import ( + "unsafe" +) + +type WriteBatch struct { + db *DB + wbatch *C.rocksdb_writebatch_t + commitOk bool +} + +func (w *WriteBatch) Close() { + if 
w.wbatch != nil { + C.rocksdb_writebatch_destroy(w.wbatch) + w.wbatch = nil + } +} + +func (w *WriteBatch) Put(key, value []byte) { + w.commitOk = false + + var k, v *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + if len(value) != 0 { + v = (*C.char)(unsafe.Pointer(&value[0])) + } + + lenk := len(key) + lenv := len(value) + + C.rocksdb_writebatch_put(w.wbatch, k, C.size_t(lenk), v, C.size_t(lenv)) +} + +func (w *WriteBatch) Delete(key []byte) { + w.commitOk = false + + C.rocksdb_writebatch_delete(w.wbatch, + (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))) +} + +func (w *WriteBatch) Commit() error { + return w.commit(w.db.writeOpts) +} + +func (w *WriteBatch) SyncCommit() error { + return w.commit(w.db.syncOpts) +} + +func (w *WriteBatch) Rollback() error { + if !w.commitOk { + C.rocksdb_writebatch_clear(w.wbatch) + } + return nil +} + +func (w *WriteBatch) commit(wb *WriteOptions) error { + w.commitOk = true + + var errStr *C.char + C.rocksdb_write_ext(w.db.db, wb.Opt, w.wbatch, &errStr) + if errStr != nil { + w.commitOk = false + return saveError(errStr) + } + return nil +} + +func (w *WriteBatch) Data() []byte { + var vallen C.size_t + value := C.rocksdb_writebatch_data(w.wbatch, &vallen) + + return slice(unsafe.Pointer(value), int(vallen)) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/cache.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/cache.go new file mode 100644 index 000000000000..931998ba4af4 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/cache.go @@ -0,0 +1,20 @@ +// +build rocksdb + +package rocksdb + +// #cgo LDFLAGS: -lrocksdb +// #include +// #include "rocksdb/c.h" +import "C" + +type Cache struct { + Cache *C.rocksdb_cache_t +} + +func NewLRUCache(capacity int) *Cache { + return &Cache{C.rocksdb_cache_create_lru(C.size_t(capacity))} +} + +func (c *Cache) Close() { + C.rocksdb_cache_destroy(c.Cache) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/const.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/const.go new file mode 100644 index 000000000000..f4155bbe2018 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/const.go @@ -0,0 +1,3 @@ +package rocksdb + +const DBName = "rocksdb" diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/db.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/db.go new file mode 100644 index 000000000000..952121b8b365 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/db.go @@ -0,0 +1,346 @@ +// +build rocksdb + +// Package rocksdb is a wrapper for c++ rocksdb +package rocksdb + +/* +#cgo LDFLAGS: -lrocksdb +#include +#include +#include "rocksdb_ext.h" +*/ +import "C" + +import ( + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/store/driver" + "os" + "runtime" + "unsafe" +) + +const defaultFilterBits int = 10 + +type Store struct { +} + +func (s Store) String() string { + return DBName +} + +func (s Store) Open(path string, cfg *config.Config) (driver.IDB, error) { + if err := os.MkdirAll(path, 0755); err != nil { + return nil, err + } + + db := new(DB) + db.path = path + db.cfg = &cfg.RocksDB + + if err := db.open(); err != nil { + return nil, err + } + + return db, nil +} + +func (s Store) Repair(path string, cfg *config.Config) error { + db := new(DB) + db.path = path + db.cfg = &cfg.RocksDB + + err := db.open() + defer db.Close() + + //open ok, do not need repair + if err == nil { + return nil + } + + var errStr *C.char + ldbname := C.CString(path) + 
defer C.free(unsafe.Pointer(ldbname)) + + C.rocksdb_repair_db(db.opts.Opt, ldbname, &errStr) + if errStr != nil { + return saveError(errStr) + } + return nil +} + +type DB struct { + path string + + cfg *config.RocksDBConfig + + db *C.rocksdb_t + + env *Env + + opts *Options + blockOpts *BlockBasedTableOptions + + //for default read and write options + readOpts *ReadOptions + writeOpts *WriteOptions + iteratorOpts *ReadOptions + + syncOpts *WriteOptions + + cache *Cache + + filter *FilterPolicy +} + +func (db *DB) open() error { + db.initOptions(db.cfg) + + var errStr *C.char + ldbname := C.CString(db.path) + defer C.free(unsafe.Pointer(ldbname)) + + db.db = C.rocksdb_open(db.opts.Opt, ldbname, &errStr) + if errStr != nil { + db.db = nil + return saveError(errStr) + } + return nil +} + +func (db *DB) initOptions(cfg *config.RocksDBConfig) { + opts := NewOptions() + blockOpts := NewBlockBasedTableOptions() + + opts.SetCreateIfMissing(true) + + db.env = NewDefaultEnv() + db.env.SetBackgroundThreads(cfg.BackgroundThreads) + db.env.SetHighPriorityBackgroundThreads(cfg.HighPriorityBackgroundThreads) + opts.SetEnv(db.env) + + db.cache = NewLRUCache(cfg.CacheSize) + blockOpts.SetCache(db.cache) + + //we must use bloomfilter + db.filter = NewBloomFilter(defaultFilterBits) + blockOpts.SetFilterPolicy(db.filter) + blockOpts.SetBlockSize(cfg.BlockSize) + opts.SetBlockBasedTableFactory(blockOpts) + + opts.SetCompression(CompressionOpt(cfg.Compression)) + opts.SetWriteBufferSize(cfg.WriteBufferSize) + opts.SetMaxOpenFiles(cfg.MaxOpenFiles) + opts.SetMaxBackgroundCompactions(cfg.MaxBackgroundCompactions) + opts.SetMaxBackgroundFlushes(cfg.MaxBackgroundFlushes) + opts.SetLevel0FileNumCompactionTrigger(cfg.Level0FileNumCompactionTrigger) + opts.SetLevel0SlowdownWritesTrigger(cfg.Level0SlowdownWritesTrigger) + opts.SetLevel0StopWritesTrigger(cfg.Level0StopWritesTrigger) + opts.SetTargetFileSizeBase(cfg.TargetFileSizeBase) + opts.SetTargetFileSizeMultiplier(cfg.TargetFileSizeMultiplier) + opts.SetMaxBytesForLevelBase(cfg.MaxBytesForLevelBase) + opts.SetMaxBytesForLevelMultiplier(cfg.MaxBytesForLevelMultiplier) + opts.DisableDataSync(cfg.DisableDataSync) + opts.SetMinWriteBufferNumberToMerge(cfg.MinWriteBufferNumberToMerge) + opts.DisableAutoCompactions(cfg.DisableAutoCompactions) + opts.EnableStatistics(cfg.EnableStatistics) + opts.UseFsync(cfg.UseFsync) + opts.AllowOsBuffer(cfg.AllowOsBuffer) + opts.SetStatsDumpPeriodSec(cfg.StatsDumpPeriodSec) + + db.opts = opts + db.blockOpts = blockOpts + + db.readOpts = NewReadOptions() + db.writeOpts = NewWriteOptions() + db.writeOpts.DisableWAL(cfg.DisableWAL) + + db.syncOpts = NewWriteOptions() + db.syncOpts.SetSync(true) + db.syncOpts.DisableWAL(cfg.DisableWAL) + + db.iteratorOpts = NewReadOptions() + db.iteratorOpts.SetFillCache(false) +} + +func (db *DB) Close() error { + if db.db != nil { + C.rocksdb_close(db.db) + db.db = nil + } + + if db.filter != nil { + db.filter.Close() + } + + if db.cache != nil { + db.cache.Close() + } + + if db.env != nil { + db.env.Close() + } + + //db.blockOpts.Close() + + db.opts.Close() + + db.readOpts.Close() + db.writeOpts.Close() + db.iteratorOpts.Close() + + return nil +} + +func (db *DB) Put(key, value []byte) error { + return db.put(db.writeOpts, key, value) +} + +func (db *DB) Get(key []byte) ([]byte, error) { + return db.get(db.readOpts, key) +} + +func (db *DB) Delete(key []byte) error { + return db.delete(db.writeOpts, key) +} + +func (db *DB) SyncPut(key []byte, value []byte) error { + return db.put(db.syncOpts, key, 
value) +} + +func (db *DB) SyncDelete(key []byte) error { + return db.delete(db.syncOpts, key) +} + +func (db *DB) NewWriteBatch() driver.IWriteBatch { + wb := &WriteBatch{ + db: db, + wbatch: C.rocksdb_writebatch_create(), + } + + runtime.SetFinalizer(wb, func(w *WriteBatch) { + w.Close() + }) + + return wb +} + +func (db *DB) NewIterator() driver.IIterator { + it := new(Iterator) + + it.it = C.rocksdb_create_iterator(db.db, db.iteratorOpts.Opt) + + return it +} + +func (db *DB) NewSnapshot() (driver.ISnapshot, error) { + snap := &Snapshot{ + db: db, + snap: C.rocksdb_create_snapshot(db.db), + readOpts: NewReadOptions(), + iteratorOpts: NewReadOptions(), + } + snap.readOpts.SetSnapshot(snap) + snap.iteratorOpts.SetSnapshot(snap) + snap.iteratorOpts.SetFillCache(false) + + return snap, nil +} + +func (db *DB) put(wo *WriteOptions, key, value []byte) error { + var errStr *C.char + var k, v *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + if len(value) != 0 { + v = (*C.char)(unsafe.Pointer(&value[0])) + } + + lenk := len(key) + lenv := len(value) + C.rocksdb_put( + db.db, wo.Opt, k, C.size_t(lenk), v, C.size_t(lenv), &errStr) + + if errStr != nil { + return saveError(errStr) + } + return nil +} + +func (db *DB) get(ro *ReadOptions, key []byte) ([]byte, error) { + var errStr *C.char + var vallen C.size_t + var k *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + + value := C.rocksdb_get( + db.db, ro.Opt, k, C.size_t(len(key)), &vallen, &errStr) + + if errStr != nil { + return nil, saveError(errStr) + } + + if value == nil { + return nil, nil + } + + defer C.free(unsafe.Pointer(value)) + return C.GoBytes(unsafe.Pointer(value), C.int(vallen)), nil +} + +func (db *DB) getSlice(ro *ReadOptions, key []byte) (driver.ISlice, error) { + var errStr *C.char + var vallen C.size_t + var k *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + + value := C.rocksdb_get( + db.db, ro.Opt, k, C.size_t(len(key)), &vallen, &errStr) + + if errStr != nil { + return nil, saveError(errStr) + } + + if value == nil { + return nil, nil + } + + return NewCSlice(unsafe.Pointer(value), int(vallen)), nil +} + +func (db *DB) delete(wo *WriteOptions, key []byte) error { + var errStr *C.char + var k *C.char + if len(key) != 0 { + k = (*C.char)(unsafe.Pointer(&key[0])) + } + + C.rocksdb_delete( + db.db, wo.Opt, k, C.size_t(len(key)), &errStr) + + if errStr != nil { + return saveError(errStr) + } + return nil +} + +func (db *DB) Begin() (driver.Tx, error) { + return nil, driver.ErrTxSupport +} + +func (db *DB) Compact() error { + C.rocksdb_compact_range(db.db, nil, 0, nil, 0) + return nil +} + +func (db *DB) GetSlice(key []byte) (driver.ISlice, error) { + return db.getSlice(db.readOpts, key) +} + +func init() { + driver.Register(Store{}) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/env.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/env.go new file mode 100644 index 000000000000..e239c1b6c0e8 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/env.go @@ -0,0 +1,27 @@ +// +build rocksdb + +package rocksdb + +// #cgo LDFLAGS: -lrocksdb +// #include "rocksdb/c.h" +import "C" + +type Env struct { + Env *C.rocksdb_env_t +} + +func NewDefaultEnv() *Env { + return &Env{C.rocksdb_create_default_env()} +} + +func (env *Env) SetHighPriorityBackgroundThreads(n int) { + C.rocksdb_env_set_high_priority_background_threads(env.Env, C.int(n)) +} + +func (env *Env) SetBackgroundThreads(n int) { + 
C.rocksdb_env_set_background_threads(env.Env, C.int(n)) +} + +func (env *Env) Close() { + C.rocksdb_env_destroy(env.Env) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/filterpolicy.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/filterpolicy.go new file mode 100644 index 000000000000..3be4ef6acac7 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/filterpolicy.go @@ -0,0 +1,21 @@ +// +build rocksdb + +package rocksdb + +// #cgo LDFLAGS: -lrocksdb +// #include +// #include "rocksdb/c.h" +import "C" + +type FilterPolicy struct { + Policy *C.rocksdb_filterpolicy_t +} + +func NewBloomFilter(bitsPerKey int) *FilterPolicy { + policy := C.rocksdb_filterpolicy_create_bloom(C.int(bitsPerKey)) + return &FilterPolicy{policy} +} + +func (fp *FilterPolicy) Close() { + C.rocksdb_filterpolicy_destroy(fp.Policy) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/iterator.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/iterator.go new file mode 100644 index 000000000000..046c5e9dfaa4 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/iterator.go @@ -0,0 +1,70 @@ +// +build rocksdb + +package rocksdb + +// #cgo LDFLAGS: -lrocksdb +// #include +// #include "rocksdb/c.h" +// #include "rocksdb_ext.h" +import "C" + +import ( + "unsafe" +) + +type Iterator struct { + it *C.rocksdb_iterator_t + isValid C.uchar +} + +func (it *Iterator) Key() []byte { + var klen C.size_t + kdata := C.rocksdb_iter_key(it.it, &klen) + if kdata == nil { + return nil + } + + return slice(unsafe.Pointer(kdata), int(C.int(klen))) +} + +func (it *Iterator) Value() []byte { + var vlen C.size_t + vdata := C.rocksdb_iter_value(it.it, &vlen) + if vdata == nil { + return nil + } + + return slice(unsafe.Pointer(vdata), int(C.int(vlen))) +} + +func (it *Iterator) Close() error { + if it.it != nil { + C.rocksdb_iter_destroy(it.it) + it.it = nil + } + return nil +} + +func (it *Iterator) Valid() bool { + return ucharToBool(it.isValid) +} + +func (it *Iterator) Next() { + it.isValid = C.rocksdb_iter_next_ext(it.it) +} + +func (it *Iterator) Prev() { + it.isValid = C.rocksdb_iter_prev_ext(it.it) +} + +func (it *Iterator) First() { + it.isValid = C.rocksdb_iter_seek_to_first_ext(it.it) +} + +func (it *Iterator) Last() { + it.isValid = C.rocksdb_iter_seek_to_last_ext(it.it) +} + +func (it *Iterator) Seek(key []byte) { + it.isValid = C.rocksdb_iter_seek_ext(it.it, (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/options.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/options.go new file mode 100644 index 000000000000..27836790fda4 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/options.go @@ -0,0 +1,233 @@ +// +build rocksdb + +package rocksdb + +// #cgo LDFLAGS: -lrocksdb +// #include "rocksdb/c.h" +import "C" + +type CompressionOpt int + +const ( + NoCompression = CompressionOpt(0) + SnappyCompression = CompressionOpt(1) + ZlibCompression = CompressionOpt(2) + Bz2Compression = CompressionOpt(3) + Lz4Compression = CompressionOpt(4) + Lz4hcCompression = CompressionOpt(5) +) + +type Options struct { + Opt *C.rocksdb_options_t +} + +type ReadOptions struct { + Opt *C.rocksdb_readoptions_t +} + +type WriteOptions struct { + Opt *C.rocksdb_writeoptions_t +} + +type BlockBasedTableOptions struct { + Opt *C.rocksdb_block_based_table_options_t +} + +func NewOptions() *Options { + opt := C.rocksdb_options_create() + return &Options{opt} +} + +func NewReadOptions() 
*ReadOptions { + opt := C.rocksdb_readoptions_create() + return &ReadOptions{opt} +} + +func NewWriteOptions() *WriteOptions { + opt := C.rocksdb_writeoptions_create() + return &WriteOptions{opt} +} + +func NewBlockBasedTableOptions() *BlockBasedTableOptions { + opt := C.rocksdb_block_based_options_create() + return &BlockBasedTableOptions{opt} +} + +func (o *Options) Close() { + C.rocksdb_options_destroy(o.Opt) +} + +func (o *Options) IncreaseParallelism(n int) { + C.rocksdb_options_increase_parallelism(o.Opt, C.int(n)) +} + +func (o *Options) OptimizeLevelStyleCompaction(n int) { + C.rocksdb_options_optimize_level_style_compaction(o.Opt, C.uint64_t(n)) +} + +func (o *Options) SetComparator(cmp *C.rocksdb_comparator_t) { + C.rocksdb_options_set_comparator(o.Opt, cmp) +} + +func (o *Options) SetErrorIfExists(error_if_exists bool) { + eie := boolToUchar(error_if_exists) + C.rocksdb_options_set_error_if_exists(o.Opt, eie) +} + +func (o *Options) SetEnv(env *Env) { + C.rocksdb_options_set_env(o.Opt, env.Env) +} + +func (o *Options) SetWriteBufferSize(s int) { + C.rocksdb_options_set_write_buffer_size(o.Opt, C.size_t(s)) +} + +func (o *Options) SetParanoidChecks(pc bool) { + C.rocksdb_options_set_paranoid_checks(o.Opt, boolToUchar(pc)) +} + +func (o *Options) SetMaxOpenFiles(n int) { + C.rocksdb_options_set_max_open_files(o.Opt, C.int(n)) +} + +func (o *Options) SetCompression(t CompressionOpt) { + C.rocksdb_options_set_compression(o.Opt, C.int(t)) +} + +func (o *Options) SetCreateIfMissing(b bool) { + C.rocksdb_options_set_create_if_missing(o.Opt, boolToUchar(b)) +} + +func (o *Options) SetMaxWriteBufferNumber(n int) { + C.rocksdb_options_set_max_write_buffer_number(o.Opt, C.int(n)) +} + +func (o *Options) SetMaxBackgroundCompactions(n int) { + C.rocksdb_options_set_max_background_compactions(o.Opt, C.int(n)) +} + +func (o *Options) SetMaxBackgroundFlushes(n int) { + C.rocksdb_options_set_max_background_flushes(o.Opt, C.int(n)) +} + +func (o *Options) SetNumLevels(n int) { + C.rocksdb_options_set_num_levels(o.Opt, C.int(n)) +} + +func (o *Options) SetLevel0FileNumCompactionTrigger(n int) { + C.rocksdb_options_set_level0_file_num_compaction_trigger(o.Opt, C.int(n)) +} + +func (o *Options) SetLevel0SlowdownWritesTrigger(n int) { + C.rocksdb_options_set_level0_slowdown_writes_trigger(o.Opt, C.int(n)) +} + +func (o *Options) SetLevel0StopWritesTrigger(n int) { + C.rocksdb_options_set_level0_stop_writes_trigger(o.Opt, C.int(n)) +} + +func (o *Options) SetTargetFileSizeBase(n int) { + C.rocksdb_options_set_target_file_size_base(o.Opt, C.uint64_t(uint64(n))) +} + +func (o *Options) SetTargetFileSizeMultiplier(n int) { + C.rocksdb_options_set_target_file_size_multiplier(o.Opt, C.int(n)) +} + +func (o *Options) SetMaxBytesForLevelBase(n int) { + C.rocksdb_options_set_max_bytes_for_level_base(o.Opt, C.uint64_t(uint64(n))) +} + +func (o *Options) SetMaxBytesForLevelMultiplier(n int) { + C.rocksdb_options_set_max_bytes_for_level_multiplier(o.Opt, C.int(n)) +} + +func (o *Options) SetBlockBasedTableFactory(opt *BlockBasedTableOptions) { + C.rocksdb_options_set_block_based_table_factory(o.Opt, opt.Opt) +} + +func (o *Options) SetMinWriteBufferNumberToMerge(n int) { + C.rocksdb_options_set_min_write_buffer_number_to_merge(o.Opt, C.int(n)) +} + +func (o *Options) DisableDataSync(b bool) { + C.rocksdb_options_set_disable_data_sync(o.Opt, boolToInt(b)) +} + +func (o *Options) DisableAutoCompactions(b bool) { + C.rocksdb_options_set_disable_auto_compactions(o.Opt, boolToInt(b)) +} + +func (o *Options) 
UseFsync(b bool) { + C.rocksdb_options_set_use_fsync(o.Opt, boolToInt(b)) +} + +func (o *Options) AllowOsBuffer(b bool) { + C.rocksdb_options_set_allow_os_buffer(o.Opt, boolToUchar(b)) +} + +func (o *Options) EnableStatistics(b bool) { + if b { + C.rocksdb_options_enable_statistics(o.Opt) + } +} + +func (o *Options) SetStatsDumpPeriodSec(n int) { + C.rocksdb_options_set_stats_dump_period_sec(o.Opt, C.uint(n)) +} + +func (o *BlockBasedTableOptions) Close() { + C.rocksdb_block_based_options_destroy(o.Opt) +} + +func (o *BlockBasedTableOptions) SetFilterPolicy(fp *FilterPolicy) { + var policy *C.rocksdb_filterpolicy_t + if fp != nil { + policy = fp.Policy + } + C.rocksdb_block_based_options_set_filter_policy(o.Opt, policy) +} + +func (o *BlockBasedTableOptions) SetBlockSize(s int) { + C.rocksdb_block_based_options_set_block_size(o.Opt, C.size_t(s)) +} + +func (o *BlockBasedTableOptions) SetBlockRestartInterval(n int) { + C.rocksdb_block_based_options_set_block_restart_interval(o.Opt, C.int(n)) +} + +func (o *BlockBasedTableOptions) SetCache(cache *Cache) { + C.rocksdb_block_based_options_set_block_cache(o.Opt, cache.Cache) +} + +func (ro *ReadOptions) Close() { + C.rocksdb_readoptions_destroy(ro.Opt) +} + +func (ro *ReadOptions) SetVerifyChecksums(b bool) { + C.rocksdb_readoptions_set_verify_checksums(ro.Opt, boolToUchar(b)) +} + +func (ro *ReadOptions) SetFillCache(b bool) { + C.rocksdb_readoptions_set_fill_cache(ro.Opt, boolToUchar(b)) +} + +func (ro *ReadOptions) SetSnapshot(snap *Snapshot) { + var s *C.rocksdb_snapshot_t + if snap != nil { + s = snap.snap + } + C.rocksdb_readoptions_set_snapshot(ro.Opt, s) +} + +func (wo *WriteOptions) Close() { + C.rocksdb_writeoptions_destroy(wo.Opt) +} + +func (wo *WriteOptions) SetSync(b bool) { + C.rocksdb_writeoptions_set_sync(wo.Opt, boolToUchar(b)) +} + +func (wo *WriteOptions) DisableWAL(b bool) { + C.rocksdb_writeoptions_disable_WAL(wo.Opt, boolToInt(b)) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.cc b/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.cc new file mode 100644 index 000000000000..39036ab96cc2 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.cc @@ -0,0 +1,44 @@ +// +build rocksdb + +#include "rocksdb_ext.h" + +#include +#include + +extern "C" { + +unsigned char rocksdb_iter_seek_to_first_ext(rocksdb_iterator_t* iter) { + rocksdb_iter_seek_to_first(iter); + return rocksdb_iter_valid(iter); +} + +unsigned char rocksdb_iter_seek_to_last_ext(rocksdb_iterator_t* iter) { + rocksdb_iter_seek_to_last(iter); + return rocksdb_iter_valid(iter); +} + +unsigned char rocksdb_iter_seek_ext(rocksdb_iterator_t* iter, const char* k, size_t klen) { + rocksdb_iter_seek(iter, k, klen); + return rocksdb_iter_valid(iter); +} + +unsigned char rocksdb_iter_next_ext(rocksdb_iterator_t* iter) { + rocksdb_iter_next(iter); + return rocksdb_iter_valid(iter); +} + +unsigned char rocksdb_iter_prev_ext(rocksdb_iterator_t* iter) { + rocksdb_iter_prev(iter); + return rocksdb_iter_valid(iter); +} + +void rocksdb_write_ext(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr) { + rocksdb_write(db, options, batch, errptr); + if(*errptr == NULL) { + rocksdb_writebatch_clear(batch); + } +} + +} \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.h b/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.h new file mode 100644 index 000000000000..11cb65304105 --- /dev/null +++ 
b/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.h @@ -0,0 +1,24 @@ +// +build rocksdb + +#ifndef ROCKSDB_EXT_H +#define ROCKSDB_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "rocksdb/c.h" + +// Below iterator functions like rocksdb iterator but returns valid status for iterator +extern unsigned char rocksdb_iter_seek_to_first_ext(rocksdb_iterator_t*); +extern unsigned char rocksdb_iter_seek_to_last_ext(rocksdb_iterator_t*); +extern unsigned char rocksdb_iter_seek_ext(rocksdb_iterator_t*, const char* k, size_t klen); +extern unsigned char rocksdb_iter_next_ext(rocksdb_iterator_t*); +extern unsigned char rocksdb_iter_prev_ext(rocksdb_iterator_t*); +extern void rocksdb_write_ext(rocksdb_t* db, const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, char** errptr); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/slice.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/slice.go new file mode 100644 index 000000000000..bbaa65bd7652 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/slice.go @@ -0,0 +1,41 @@ +//+build rocksdb + +package rocksdb + +// #cgo LDFLAGS: -lrocksdb +// #include +// #include +import "C" + +import ( + "reflect" + "unsafe" +) + +type CSlice struct { + data unsafe.Pointer + size int +} + +func NewCSlice(p unsafe.Pointer, n int) *CSlice { + return &CSlice{p, n} +} + +func (s *CSlice) Data() []byte { + var value []byte + + sH := (*reflect.SliceHeader)(unsafe.Pointer(&value)) + sH.Cap = int(s.size) + sH.Len = int(s.size) + sH.Data = uintptr(s.data) + + return value +} + +func (s *CSlice) Size() int { + return int(s.size) +} + +func (s *CSlice) Free() { + C.free(s.data) +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/snapshot.go new file mode 100644 index 000000000000..1ced60020027 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/snapshot.go @@ -0,0 +1,39 @@ +// +build rocksdb + +package rocksdb + +// #cgo LDFLAGS: -lrocksdb +// #include "rocksdb/c.h" +import "C" + +import ( + "github.com/siddontang/ledisdb/store/driver" +) + +type Snapshot struct { + db *DB + snap *C.rocksdb_snapshot_t + readOpts *ReadOptions + iteratorOpts *ReadOptions +} + +func (s *Snapshot) Get(key []byte) ([]byte, error) { + return s.db.get(s.readOpts, key) +} + +func (s *Snapshot) GetSlice(key []byte) (driver.ISlice, error) { + return s.db.getSlice(s.readOpts, key) +} + +func (s *Snapshot) NewIterator() driver.IIterator { + it := new(Iterator) + it.it = C.rocksdb_create_iterator(s.db.db, s.db.iteratorOpts.Opt) + return it + +} + +func (s *Snapshot) Close() { + C.rocksdb_release_snapshot(s.db.db, s.snap) + s.iteratorOpts.Close() + s.readOpts.Close() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/util.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/util.go new file mode 100644 index 000000000000..22b73baf4ad4 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/rocksdb/util.go @@ -0,0 +1,54 @@ +// +build rocksdb + +package rocksdb + +// #include +// #include "rocksdb/c.h" +import "C" + +import ( + "fmt" + "reflect" + "unsafe" +) + +func boolToUchar(b bool) C.uchar { + uc := C.uchar(0) + if b { + uc = C.uchar(1) + } + return uc +} + +func ucharToBool(uc C.uchar) bool { + if uc == C.uchar(0) { + return false + } + return true +} + +func boolToInt(b bool) C.int { + uc := C.int(0) + if b { + uc = C.int(1) + } + 
return uc +} + +func saveError(errStr *C.char) error { + if errStr != nil { + gs := C.GoString(errStr) + C.free(unsafe.Pointer(errStr)) + return fmt.Errorf(gs) + } + return nil +} + +func slice(p unsafe.Pointer, n int) []byte { + var b []byte + pbyte := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + pbyte.Data = uintptr(p) + pbyte.Len = n + pbyte.Cap = n + return b +} diff --git a/vendor/github.com/siddontang/ledisdb/store/slice.go b/vendor/github.com/siddontang/ledisdb/store/slice.go new file mode 100644 index 000000000000..b027f4f28b8e --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/slice.go @@ -0,0 +1,9 @@ +package store + +import ( + "github.com/siddontang/ledisdb/store/driver" +) + +type Slice interface { + driver.ISlice +} diff --git a/vendor/github.com/siddontang/ledisdb/store/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/snapshot.go new file mode 100644 index 000000000000..a1c9de9944e6 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/snapshot.go @@ -0,0 +1,48 @@ +package store + +import ( + "github.com/siddontang/ledisdb/store/driver" +) + +type Snapshot struct { + driver.ISnapshot + st *Stat +} + +func (s *Snapshot) NewIterator() *Iterator { + it := new(Iterator) + it.it = s.ISnapshot.NewIterator() + it.st = s.st + + s.st.IterNum.Add(1) + + return it +} + +func (s *Snapshot) Get(key []byte) ([]byte, error) { + v, err := s.ISnapshot.Get(key) + s.st.statGet(v, err) + return v, err +} + +func (s *Snapshot) GetSlice(key []byte) (Slice, error) { + if d, ok := s.ISnapshot.(driver.ISliceGeter); ok { + v, err := d.GetSlice(key) + s.st.statGet(v, err) + return v, err + } else { + v, err := s.Get(key) + if err != nil { + return nil, err + } else if v == nil { + return nil, nil + } else { + return driver.GoSlice(v), nil + } + } +} + +func (s *Snapshot) Close() { + s.st.SnapshotCloseNum.Add(1) + s.ISnapshot.Close() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/stat.go b/vendor/github.com/siddontang/ledisdb/store/stat.go new file mode 100644 index 000000000000..e0a035ab8e18 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/stat.go @@ -0,0 +1,37 @@ +package store + +import ( + "github.com/siddontang/go/sync2" +) + +type Stat struct { + GetNum sync2.AtomicInt64 + GetMissingNum sync2.AtomicInt64 + GetTotalTime sync2.AtomicDuration + PutNum sync2.AtomicInt64 + DeleteNum sync2.AtomicInt64 + IterNum sync2.AtomicInt64 + IterSeekNum sync2.AtomicInt64 + IterCloseNum sync2.AtomicInt64 + SnapshotNum sync2.AtomicInt64 + SnapshotCloseNum sync2.AtomicInt64 + BatchNum sync2.AtomicInt64 + BatchCommitNum sync2.AtomicInt64 + BatchCommitTotalTime sync2.AtomicDuration + TxNum sync2.AtomicInt64 + TxCommitNum sync2.AtomicInt64 + TxCloseNum sync2.AtomicInt64 + CompactNum sync2.AtomicInt64 + CompactTotalTime sync2.AtomicDuration +} + +func (st *Stat) statGet(v interface{}, err error) { + st.GetNum.Add(1) + if v == nil && err == nil { + st.GetMissingNum.Add(1) + } +} + +func (st *Stat) Reset() { + *st = Stat{} +} diff --git a/vendor/github.com/siddontang/ledisdb/store/store.go b/vendor/github.com/siddontang/ledisdb/store/store.go new file mode 100644 index 000000000000..d1be9b012c4d --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/store.go @@ -0,0 +1,63 @@ +package store + +import ( + "fmt" + "github.com/siddontang/ledisdb/config" + "github.com/siddontang/ledisdb/store/driver" + "os" + "path" + + _ "github.com/siddontang/ledisdb/store/boltdb" + _ "github.com/siddontang/ledisdb/store/goleveldb" + _ 
"github.com/siddontang/ledisdb/store/leveldb" + _ "github.com/siddontang/ledisdb/store/mdb" + _ "github.com/siddontang/ledisdb/store/rocksdb" +) + +func getStorePath(cfg *config.Config) string { + if len(cfg.DBPath) > 0 { + return cfg.DBPath + } else { + return path.Join(cfg.DataDir, fmt.Sprintf("%s_data", cfg.DBName)) + } +} + +func Open(cfg *config.Config) (*DB, error) { + s, err := driver.GetStore(cfg) + if err != nil { + return nil, err + } + + path := getStorePath(cfg) + + if err := os.MkdirAll(path, 0755); err != nil { + return nil, err + } + + idb, err := s.Open(path, cfg) + if err != nil { + return nil, err + } + + db := new(DB) + db.db = idb + db.name = s.String() + db.st = &Stat{} + db.cfg = cfg + + return db, nil +} + +func Repair(cfg *config.Config) error { + s, err := driver.GetStore(cfg) + if err != nil { + return err + } + + path := getStorePath(cfg) + + return s.Repair(path, cfg) +} + +func init() { +} diff --git a/vendor/github.com/siddontang/ledisdb/store/tx.go b/vendor/github.com/siddontang/ledisdb/store/tx.go new file mode 100644 index 000000000000..4dbf311fd843 --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/tx.go @@ -0,0 +1,86 @@ +package store + +import ( + "github.com/siddontang/ledisdb/store/driver" +) + +type Tx struct { + tx driver.Tx + st *Stat +} + +func (tx *Tx) NewIterator() *Iterator { + it := new(Iterator) + it.it = tx.tx.NewIterator() + it.st = tx.st + + tx.st.IterNum.Add(1) + + return it +} + +func (tx *Tx) NewWriteBatch() *WriteBatch { + tx.st.BatchNum.Add(1) + + wb := new(WriteBatch) + wb.wb = tx.tx.NewWriteBatch() + wb.st = tx.st + return wb +} + +func (tx *Tx) RangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { + return NewRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) +} + +func (tx *Tx) RevRangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { + return NewRevRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) +} + +//count < 0, unlimit. +// +//offset must >= 0, if < 0, will get nothing. +func (tx *Tx) RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { + return NewRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) +} + +//count < 0, unlimit. +// +//offset must >= 0, if < 0, will get nothing. 
+func (tx *Tx) RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { + return NewRevRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) +} + +func (tx *Tx) Get(key []byte) ([]byte, error) { + v, err := tx.tx.Get(key) + tx.st.statGet(v, err) + return v, err +} + +func (tx *Tx) GetSlice(key []byte) (Slice, error) { + if v, err := tx.Get(key); err != nil { + return nil, err + } else if v == nil { + return nil, nil + } else { + return driver.GoSlice(v), nil + } +} + +func (tx *Tx) Put(key []byte, value []byte) error { + tx.st.PutNum.Add(1) + return tx.tx.Put(key, value) +} + +func (tx *Tx) Delete(key []byte) error { + tx.st.DeleteNum.Add(1) + return tx.tx.Delete(key) +} + +func (tx *Tx) Commit() error { + tx.st.TxCommitNum.Add(1) + return tx.tx.Commit() +} + +func (tx *Tx) Rollback() error { + return tx.tx.Rollback() +} diff --git a/vendor/github.com/siddontang/ledisdb/store/writebatch.go b/vendor/github.com/siddontang/ledisdb/store/writebatch.go new file mode 100644 index 000000000000..c193ae08dbaa --- /dev/null +++ b/vendor/github.com/siddontang/ledisdb/store/writebatch.go @@ -0,0 +1,150 @@ +package store + +import ( + "encoding/binary" + "github.com/siddontang/ledisdb/store/driver" + "github.com/syndtr/goleveldb/leveldb" + "time" +) + +type WriteBatch struct { + wb driver.IWriteBatch + st *Stat + + putNum int64 + deleteNum int64 + db *DB + + data *BatchData +} + +func (wb *WriteBatch) Close() { + wb.wb.Close() +} + +func (wb *WriteBatch) Put(key []byte, value []byte) { + wb.putNum++ + wb.wb.Put(key, value) +} + +func (wb *WriteBatch) Delete(key []byte) { + wb.deleteNum++ + wb.wb.Delete(key) +} + +func (wb *WriteBatch) Commit() error { + wb.st.BatchCommitNum.Add(1) + wb.st.PutNum.Add(wb.putNum) + wb.st.DeleteNum.Add(wb.deleteNum) + wb.putNum = 0 + wb.deleteNum = 0 + + var err error + t := time.Now() + if wb.db == nil || !wb.db.needSyncCommit() { + err = wb.wb.Commit() + } else { + err = wb.wb.SyncCommit() + } + + wb.st.BatchCommitTotalTime.Add(time.Now().Sub(t)) + + return err +} + +func (wb *WriteBatch) Rollback() error { + wb.putNum = 0 + wb.deleteNum = 0 + + return wb.wb.Rollback() +} + +// the data will be undefined after commit or rollback +func (wb *WriteBatch) BatchData() *BatchData { + data := wb.wb.Data() + if wb.data == nil { + wb.data = new(BatchData) + } + + wb.data.Load(data) + return wb.data +} + +func (wb *WriteBatch) Data() []byte { + b := wb.BatchData() + return b.Data() +} + +const BatchDataHeadLen = 12 + +/* + see leveldb batch data format for more information +*/ + +type BatchData struct { + leveldb.Batch +} + +func NewBatchData(data []byte) (*BatchData, error) { + b := new(BatchData) + + if err := b.Load(data); err != nil { + return nil, err + } + + return b, nil +} + +func (d *BatchData) Append(do *BatchData) error { + d1 := d.Dump() + d2 := do.Dump() + + n := d.Len() + do.Len() + + d1 = append(d1, d2[BatchDataHeadLen:]...) 
+ binary.LittleEndian.PutUint32(d1[8:], uint32(n)) + + return d.Load(d1) +} + +func (d *BatchData) Data() []byte { + return d.Dump() +} + +func (d *BatchData) Reset() { + d.Batch.Reset() +} + +type BatchDataReplay interface { + Put(key, value []byte) + Delete(key []byte) +} + +type BatchItem struct { + Key []byte + Value []byte +} + +type batchItems []BatchItem + +func (bs *batchItems) Put(key, value []byte) { + *bs = append(*bs, BatchItem{key, value}) +} + +func (bs *batchItems) Delete(key []byte) { + *bs = append(*bs, BatchItem{key, nil}) +} + +func (d *BatchData) Replay(r BatchDataReplay) error { + return d.Batch.Replay(r) +} + +func (d *BatchData) Items() ([]BatchItem, error) { + is := make(batchItems, 0, d.Len()) + + if err := d.Replay(&is); err != nil { + return nil, err + } + + return []BatchItem(is), nil +} diff --git a/vendor/github.com/syndtr/goleveldb/LICENSE b/vendor/github.com/syndtr/goleveldb/LICENSE new file mode 100644 index 000000000000..4a772d1ab369 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/LICENSE @@ -0,0 +1,24 @@ +Copyright 2012 Suryandaru Triandana +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/batch.go b/vendor/github.com/syndtr/goleveldb/leveldb/batch.go new file mode 100644 index 000000000000..225920002df2 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/batch.go @@ -0,0 +1,349 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "encoding/binary" + "fmt" + "io" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +// ErrBatchCorrupted records reason of batch corruption. This error will be +// wrapped with errors.ErrCorrupted. 
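As an illustration of the BatchData wrapper defined above (it embeds goleveldb's Batch, so Put/Delete are promoted directly), a small sketch that builds a batch in memory and walks its records via Items(); keys and values are placeholders.

package main

import (
	"fmt"

	"github.com/siddontang/ledisdb/store"
)

func main() {
	// BatchData embeds leveldb.Batch; records are appended through the promoted methods.
	var d store.BatchData
	d.Put([]byte("k1"), []byte("v1"))
	d.Delete([]byte("k2"))

	// Items replays the batch into a []BatchItem; a deleted key carries a nil Value.
	items, err := d.Items()
	if err != nil {
		panic(err)
	}
	for _, it := range items {
		fmt.Printf("%q -> %q\n", it.Key, it.Value)
	}
}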
+type ErrBatchCorrupted struct { + Reason string +} + +func (e *ErrBatchCorrupted) Error() string { + return fmt.Sprintf("leveldb: batch corrupted: %s", e.Reason) +} + +func newErrBatchCorrupted(reason string) error { + return errors.NewErrCorrupted(storage.FileDesc{}, &ErrBatchCorrupted{reason}) +} + +const ( + batchHeaderLen = 8 + 4 + batchGrowRec = 3000 + batchBufioSize = 16 +) + +// BatchReplay wraps basic batch operations. +type BatchReplay interface { + Put(key, value []byte) + Delete(key []byte) +} + +type batchIndex struct { + keyType keyType + keyPos, keyLen int + valuePos, valueLen int +} + +func (index batchIndex) k(data []byte) []byte { + return data[index.keyPos : index.keyPos+index.keyLen] +} + +func (index batchIndex) v(data []byte) []byte { + if index.valueLen != 0 { + return data[index.valuePos : index.valuePos+index.valueLen] + } + return nil +} + +func (index batchIndex) kv(data []byte) (key, value []byte) { + return index.k(data), index.v(data) +} + +// Batch is a write batch. +type Batch struct { + data []byte + index []batchIndex + + // internalLen is sums of key/value pair length plus 8-bytes internal key. + internalLen int +} + +func (b *Batch) grow(n int) { + o := len(b.data) + if cap(b.data)-o < n { + div := 1 + if len(b.index) > batchGrowRec { + div = len(b.index) / batchGrowRec + } + ndata := make([]byte, o, o+n+o/div) + copy(ndata, b.data) + b.data = ndata + } +} + +func (b *Batch) appendRec(kt keyType, key, value []byte) { + n := 1 + binary.MaxVarintLen32 + len(key) + if kt == keyTypeVal { + n += binary.MaxVarintLen32 + len(value) + } + b.grow(n) + index := batchIndex{keyType: kt} + o := len(b.data) + data := b.data[:o+n] + data[o] = byte(kt) + o++ + o += binary.PutUvarint(data[o:], uint64(len(key))) + index.keyPos = o + index.keyLen = len(key) + o += copy(data[o:], key) + if kt == keyTypeVal { + o += binary.PutUvarint(data[o:], uint64(len(value))) + index.valuePos = o + index.valueLen = len(value) + o += copy(data[o:], value) + } + b.data = data[:o] + b.index = append(b.index, index) + b.internalLen += index.keyLen + index.valueLen + 8 +} + +// Put appends 'put operation' of the given key/value pair to the batch. +// It is safe to modify the contents of the argument after Put returns but not +// before. +func (b *Batch) Put(key, value []byte) { + b.appendRec(keyTypeVal, key, value) +} + +// Delete appends 'delete operation' of the given key to the batch. +// It is safe to modify the contents of the argument after Delete returns but +// not before. +func (b *Batch) Delete(key []byte) { + b.appendRec(keyTypeDel, key, nil) +} + +// Dump dumps batch contents. The returned slice can be loaded into the +// batch using Load method. +// The returned slice is not its own copy, so the contents should not be +// modified. +func (b *Batch) Dump() []byte { + return b.data +} + +// Load loads given slice into the batch. Previous contents of the batch +// will be discarded. +// The given slice will not be copied and will be used as batch buffer, so +// it is not safe to modify the contents of the slice. +func (b *Batch) Load(data []byte) error { + return b.decode(data, -1) +} + +// Replay replays batch contents. +func (b *Batch) Replay(r BatchReplay) error { + for _, index := range b.index { + switch index.keyType { + case keyTypeVal: + r.Put(index.k(b.data), index.v(b.data)) + case keyTypeDel: + r.Delete(index.k(b.data)) + } + } + return nil +} + +// Len returns number of records in the batch. 
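The Batch type above is goleveldb's public write-batch API. A minimal sketch of building a batch, round-tripping it through Dump/Load, and replaying it through a BatchReplay implementation; printReplay is an illustrative type introduced only for this example.

package main

import (
	"fmt"

	"github.com/syndtr/goleveldb/leveldb"
)

// printReplay satisfies leveldb.BatchReplay and just prints each record.
type printReplay struct{}

func (printReplay) Put(key, value []byte) { fmt.Printf("put %q = %q\n", key, value) }
func (printReplay) Delete(key []byte)     { fmt.Printf("del %q\n", key) }

func main() {
	b := new(leveldb.Batch)
	b.Put([]byte("alpha"), []byte("1"))
	b.Delete([]byte("beta"))

	// Dump returns the encoded records (sharing b's buffer, per the docs above);
	// Load rebuilds a batch from that encoding.
	var copied leveldb.Batch
	if err := copied.Load(b.Dump()); err != nil {
		panic(err)
	}
	// Replay walks the decoded records in order.
	_ = copied.Replay(printReplay{})
}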
+func (b *Batch) Len() int { + return len(b.index) +} + +// Reset resets the batch. +func (b *Batch) Reset() { + b.data = b.data[:0] + b.index = b.index[:0] + b.internalLen = 0 +} + +func (b *Batch) replayInternal(fn func(i int, kt keyType, k, v []byte) error) error { + for i, index := range b.index { + if err := fn(i, index.keyType, index.k(b.data), index.v(b.data)); err != nil { + return err + } + } + return nil +} + +func (b *Batch) append(p *Batch) { + ob := len(b.data) + oi := len(b.index) + b.data = append(b.data, p.data...) + b.index = append(b.index, p.index...) + b.internalLen += p.internalLen + + // Updating index offset. + if ob != 0 { + for ; oi < len(b.index); oi++ { + index := &b.index[oi] + index.keyPos += ob + if index.valueLen != 0 { + index.valuePos += ob + } + } + } +} + +func (b *Batch) decode(data []byte, expectedLen int) error { + b.data = data + b.index = b.index[:0] + b.internalLen = 0 + err := decodeBatch(data, func(i int, index batchIndex) error { + b.index = append(b.index, index) + b.internalLen += index.keyLen + index.valueLen + 8 + return nil + }) + if err != nil { + return err + } + if expectedLen >= 0 && len(b.index) != expectedLen { + return newErrBatchCorrupted(fmt.Sprintf("invalid records length: %d vs %d", expectedLen, len(b.index))) + } + return nil +} + +func (b *Batch) putMem(seq uint64, mdb *memdb.DB) error { + var ik []byte + for i, index := range b.index { + ik = makeInternalKey(ik, index.k(b.data), seq+uint64(i), index.keyType) + if err := mdb.Put(ik, index.v(b.data)); err != nil { + return err + } + } + return nil +} + +func (b *Batch) revertMem(seq uint64, mdb *memdb.DB) error { + var ik []byte + for i, index := range b.index { + ik = makeInternalKey(ik, index.k(b.data), seq+uint64(i), index.keyType) + if err := mdb.Delete(ik); err != nil { + return err + } + } + return nil +} + +func newBatch() interface{} { + return &Batch{} +} + +func decodeBatch(data []byte, fn func(i int, index batchIndex) error) error { + var index batchIndex + for i, o := 0, 0; o < len(data); i++ { + // Key type. + index.keyType = keyType(data[o]) + if index.keyType > keyTypeVal { + return newErrBatchCorrupted(fmt.Sprintf("bad record: invalid type %#x", uint(index.keyType))) + } + o++ + + // Key. + x, n := binary.Uvarint(data[o:]) + o += n + if n <= 0 || o+int(x) > len(data) { + return newErrBatchCorrupted("bad record: invalid key length") + } + index.keyPos = o + index.keyLen = int(x) + o += index.keyLen + + // Value. 
+ if index.keyType == keyTypeVal { + x, n = binary.Uvarint(data[o:]) + o += n + if n <= 0 || o+int(x) > len(data) { + return newErrBatchCorrupted("bad record: invalid value length") + } + index.valuePos = o + index.valueLen = int(x) + o += index.valueLen + } else { + index.valuePos = 0 + index.valueLen = 0 + } + + if err := fn(i, index); err != nil { + return err + } + } + return nil +} + +func decodeBatchToMem(data []byte, expectSeq uint64, mdb *memdb.DB) (seq uint64, batchLen int, err error) { + seq, batchLen, err = decodeBatchHeader(data) + if err != nil { + return 0, 0, err + } + if seq < expectSeq { + return 0, 0, newErrBatchCorrupted("invalid sequence number") + } + data = data[batchHeaderLen:] + var ik []byte + var decodedLen int + err = decodeBatch(data, func(i int, index batchIndex) error { + if i >= batchLen { + return newErrBatchCorrupted("invalid records length") + } + ik = makeInternalKey(ik, index.k(data), seq+uint64(i), index.keyType) + if err := mdb.Put(ik, index.v(data)); err != nil { + return err + } + decodedLen++ + return nil + }) + if err == nil && decodedLen != batchLen { + err = newErrBatchCorrupted(fmt.Sprintf("invalid records length: %d vs %d", batchLen, decodedLen)) + } + return +} + +func encodeBatchHeader(dst []byte, seq uint64, batchLen int) []byte { + dst = ensureBuffer(dst, batchHeaderLen) + binary.LittleEndian.PutUint64(dst, seq) + binary.LittleEndian.PutUint32(dst[8:], uint32(batchLen)) + return dst +} + +func decodeBatchHeader(data []byte) (seq uint64, batchLen int, err error) { + if len(data) < batchHeaderLen { + return 0, 0, newErrBatchCorrupted("too short") + } + + seq = binary.LittleEndian.Uint64(data) + batchLen = int(binary.LittleEndian.Uint32(data[8:])) + if batchLen < 0 { + return 0, 0, newErrBatchCorrupted("invalid records length") + } + return +} + +func batchesLen(batches []*Batch) int { + batchLen := 0 + for _, batch := range batches { + batchLen += batch.Len() + } + return batchLen +} + +func writeBatchesWithHeader(wr io.Writer, batches []*Batch, seq uint64) error { + if _, err := wr.Write(encodeBatchHeader(nil, seq, batchesLen(batches))); err != nil { + return err + } + for _, batch := range batches { + if _, err := wr.Write(batch.data); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/cache/cache.go b/vendor/github.com/syndtr/goleveldb/leveldb/cache/cache.go new file mode 100644 index 000000000000..c36ad323597e --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/cache/cache.go @@ -0,0 +1,704 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package cache provides interface and implementation of a cache algorithms. +package cache + +import ( + "sync" + "sync/atomic" + "unsafe" + + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Cacher provides interface to implements a caching functionality. +// An implementation must be safe for concurrent use. +type Cacher interface { + // Capacity returns cache capacity. + Capacity() int + + // SetCapacity sets cache capacity. + SetCapacity(capacity int) + + // Promote promotes the 'cache node'. + Promote(n *Node) + + // Ban evicts the 'cache node' and prevent subsequent 'promote'. + Ban(n *Node) + + // Evict evicts the 'cache node'. + Evict(n *Node) + + // EvictNS evicts 'cache node' with the given namespace. + EvictNS(ns uint64) + + // EvictAll evicts all 'cache node'. 
+ EvictAll() + + // Close closes the 'cache tree' + Close() error +} + +// Value is a 'cacheable object'. It may implements util.Releaser, if +// so the the Release method will be called once object is released. +type Value interface{} + +// NamespaceGetter provides convenient wrapper for namespace. +type NamespaceGetter struct { + Cache *Cache + NS uint64 +} + +// Get simply calls Cache.Get() method. +func (g *NamespaceGetter) Get(key uint64, setFunc func() (size int, value Value)) *Handle { + return g.Cache.Get(g.NS, key, setFunc) +} + +// The hash tables implementation is based on: +// "Dynamic-Sized Nonblocking Hash Tables", by Yujie Liu, +// Kunlong Zhang, and Michael Spear. +// ACM Symposium on Principles of Distributed Computing, Jul 2014. + +const ( + mInitialSize = 1 << 4 + mOverflowThreshold = 1 << 5 + mOverflowGrowThreshold = 1 << 7 +) + +type mBucket struct { + mu sync.Mutex + node []*Node + frozen bool +} + +func (b *mBucket) freeze() []*Node { + b.mu.Lock() + defer b.mu.Unlock() + if !b.frozen { + b.frozen = true + } + return b.node +} + +func (b *mBucket) get(r *Cache, h *mNode, hash uint32, ns, key uint64, noset bool) (done, added bool, n *Node) { + b.mu.Lock() + + if b.frozen { + b.mu.Unlock() + return + } + + // Scan the node. + for _, n := range b.node { + if n.hash == hash && n.ns == ns && n.key == key { + atomic.AddInt32(&n.ref, 1) + b.mu.Unlock() + return true, false, n + } + } + + // Get only. + if noset { + b.mu.Unlock() + return true, false, nil + } + + // Create node. + n = &Node{ + r: r, + hash: hash, + ns: ns, + key: key, + ref: 1, + } + // Add node to bucket. + b.node = append(b.node, n) + bLen := len(b.node) + b.mu.Unlock() + + // Update counter. + grow := atomic.AddInt32(&r.nodes, 1) >= h.growThreshold + if bLen > mOverflowThreshold { + grow = grow || atomic.AddInt32(&h.overflow, 1) >= mOverflowGrowThreshold + } + + // Grow. + if grow && atomic.CompareAndSwapInt32(&h.resizeInProgess, 0, 1) { + nhLen := len(h.buckets) << 1 + nh := &mNode{ + buckets: make([]unsafe.Pointer, nhLen), + mask: uint32(nhLen) - 1, + pred: unsafe.Pointer(h), + growThreshold: int32(nhLen * mOverflowThreshold), + shrinkThreshold: int32(nhLen >> 1), + } + ok := atomic.CompareAndSwapPointer(&r.mHead, unsafe.Pointer(h), unsafe.Pointer(nh)) + if !ok { + panic("BUG: failed swapping head") + } + go nh.initBuckets() + } + + return true, true, n +} + +func (b *mBucket) delete(r *Cache, h *mNode, hash uint32, ns, key uint64) (done, deleted bool) { + b.mu.Lock() + + if b.frozen { + b.mu.Unlock() + return + } + + // Scan the node. + var ( + n *Node + bLen int + ) + for i := range b.node { + n = b.node[i] + if n.ns == ns && n.key == key { + if atomic.LoadInt32(&n.ref) == 0 { + deleted = true + + // Call releaser. + if n.value != nil { + if r, ok := n.value.(util.Releaser); ok { + r.Release() + } + n.value = nil + } + + // Remove node from bucket. + b.node = append(b.node[:i], b.node[i+1:]...) + bLen = len(b.node) + } + break + } + } + b.mu.Unlock() + + if deleted { + // Call OnDel. + for _, f := range n.onDel { + f() + } + + // Update counter. + atomic.AddInt32(&r.size, int32(n.size)*-1) + shrink := atomic.AddInt32(&r.nodes, -1) < h.shrinkThreshold + if bLen >= mOverflowThreshold { + atomic.AddInt32(&h.overflow, -1) + } + + // Shrink. 
+ if shrink && len(h.buckets) > mInitialSize && atomic.CompareAndSwapInt32(&h.resizeInProgess, 0, 1) { + nhLen := len(h.buckets) >> 1 + nh := &mNode{ + buckets: make([]unsafe.Pointer, nhLen), + mask: uint32(nhLen) - 1, + pred: unsafe.Pointer(h), + growThreshold: int32(nhLen * mOverflowThreshold), + shrinkThreshold: int32(nhLen >> 1), + } + ok := atomic.CompareAndSwapPointer(&r.mHead, unsafe.Pointer(h), unsafe.Pointer(nh)) + if !ok { + panic("BUG: failed swapping head") + } + go nh.initBuckets() + } + } + + return true, deleted +} + +type mNode struct { + buckets []unsafe.Pointer // []*mBucket + mask uint32 + pred unsafe.Pointer // *mNode + resizeInProgess int32 + + overflow int32 + growThreshold int32 + shrinkThreshold int32 +} + +func (n *mNode) initBucket(i uint32) *mBucket { + if b := (*mBucket)(atomic.LoadPointer(&n.buckets[i])); b != nil { + return b + } + + p := (*mNode)(atomic.LoadPointer(&n.pred)) + if p != nil { + var node []*Node + if n.mask > p.mask { + // Grow. + pb := (*mBucket)(atomic.LoadPointer(&p.buckets[i&p.mask])) + if pb == nil { + pb = p.initBucket(i & p.mask) + } + m := pb.freeze() + // Split nodes. + for _, x := range m { + if x.hash&n.mask == i { + node = append(node, x) + } + } + } else { + // Shrink. + pb0 := (*mBucket)(atomic.LoadPointer(&p.buckets[i])) + if pb0 == nil { + pb0 = p.initBucket(i) + } + pb1 := (*mBucket)(atomic.LoadPointer(&p.buckets[i+uint32(len(n.buckets))])) + if pb1 == nil { + pb1 = p.initBucket(i + uint32(len(n.buckets))) + } + m0 := pb0.freeze() + m1 := pb1.freeze() + // Merge nodes. + node = make([]*Node, 0, len(m0)+len(m1)) + node = append(node, m0...) + node = append(node, m1...) + } + b := &mBucket{node: node} + if atomic.CompareAndSwapPointer(&n.buckets[i], nil, unsafe.Pointer(b)) { + if len(node) > mOverflowThreshold { + atomic.AddInt32(&n.overflow, int32(len(node)-mOverflowThreshold)) + } + return b + } + } + + return (*mBucket)(atomic.LoadPointer(&n.buckets[i])) +} + +func (n *mNode) initBuckets() { + for i := range n.buckets { + n.initBucket(uint32(i)) + } + atomic.StorePointer(&n.pred, nil) +} + +// Cache is a 'cache map'. +type Cache struct { + mu sync.RWMutex + mHead unsafe.Pointer // *mNode + nodes int32 + size int32 + cacher Cacher + closed bool +} + +// NewCache creates a new 'cache map'. The cacher is optional and +// may be nil. +func NewCache(cacher Cacher) *Cache { + h := &mNode{ + buckets: make([]unsafe.Pointer, mInitialSize), + mask: mInitialSize - 1, + growThreshold: int32(mInitialSize * mOverflowThreshold), + shrinkThreshold: 0, + } + for i := range h.buckets { + h.buckets[i] = unsafe.Pointer(&mBucket{}) + } + r := &Cache{ + mHead: unsafe.Pointer(h), + cacher: cacher, + } + return r +} + +func (r *Cache) getBucket(hash uint32) (*mNode, *mBucket) { + h := (*mNode)(atomic.LoadPointer(&r.mHead)) + i := hash & h.mask + b := (*mBucket)(atomic.LoadPointer(&h.buckets[i])) + if b == nil { + b = h.initBucket(i) + } + return h, b +} + +func (r *Cache) delete(n *Node) bool { + for { + h, b := r.getBucket(n.hash) + done, deleted := b.delete(r, h, n.hash, n.ns, n.key) + if done { + return deleted + } + } +} + +// Nodes returns number of 'cache node' in the map. +func (r *Cache) Nodes() int { + return int(atomic.LoadInt32(&r.nodes)) +} + +// Size returns sums of 'cache node' size in the map. +func (r *Cache) Size() int { + return int(atomic.LoadInt32(&r.size)) +} + +// Capacity returns cache capacity. 
+func (r *Cache) Capacity() int { + if r.cacher == nil { + return 0 + } + return r.cacher.Capacity() +} + +// SetCapacity sets cache capacity. +func (r *Cache) SetCapacity(capacity int) { + if r.cacher != nil { + r.cacher.SetCapacity(capacity) + } +} + +// Get gets 'cache node' with the given namespace and key. +// If cache node is not found and setFunc is not nil, Get will atomically creates +// the 'cache node' by calling setFunc. Otherwise Get will returns nil. +// +// The returned 'cache handle' should be released after use by calling Release +// method. +func (r *Cache) Get(ns, key uint64, setFunc func() (size int, value Value)) *Handle { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return nil + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, setFunc == nil) + if done { + if n != nil { + n.mu.Lock() + if n.value == nil { + if setFunc == nil { + n.mu.Unlock() + n.unref() + return nil + } + + n.size, n.value = setFunc() + if n.value == nil { + n.size = 0 + n.mu.Unlock() + n.unref() + return nil + } + atomic.AddInt32(&r.size, int32(n.size)) + } + n.mu.Unlock() + if r.cacher != nil { + r.cacher.Promote(n) + } + return &Handle{unsafe.Pointer(n)} + } + + break + } + } + return nil +} + +// Delete removes and ban 'cache node' with the given namespace and key. +// A banned 'cache node' will never inserted into the 'cache tree'. Ban +// only attributed to the particular 'cache node', so when a 'cache node' +// is recreated it will not be banned. +// +// If onDel is not nil, then it will be executed if such 'cache node' +// doesn't exist or once the 'cache node' is released. +// +// Delete return true is such 'cache node' exist. +func (r *Cache) Delete(ns, key uint64, onDel func()) bool { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return false + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, true) + if done { + if n != nil { + if onDel != nil { + n.mu.Lock() + n.onDel = append(n.onDel, onDel) + n.mu.Unlock() + } + if r.cacher != nil { + r.cacher.Ban(n) + } + n.unref() + return true + } + + break + } + } + + if onDel != nil { + onDel() + } + + return false +} + +// Evict evicts 'cache node' with the given namespace and key. This will +// simply call Cacher.Evict. +// +// Evict return true is such 'cache node' exist. +func (r *Cache) Evict(ns, key uint64) bool { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return false + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, true) + if done { + if n != nil { + if r.cacher != nil { + r.cacher.Evict(n) + } + n.unref() + return true + } + + break + } + } + + return false +} + +// EvictNS evicts 'cache node' with the given namespace. This will +// simply call Cacher.EvictNS. +func (r *Cache) EvictNS(ns uint64) { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return + } + + if r.cacher != nil { + r.cacher.EvictNS(ns) + } +} + +// EvictAll evicts all 'cache node'. This will simply call Cacher.EvictAll. +func (r *Cache) EvictAll() { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return + } + + if r.cacher != nil { + r.cacher.EvictAll() + } +} + +// Close closes the 'cache map' and forcefully releases all 'cache node'. 
+func (r *Cache) Close() error { + r.mu.Lock() + if !r.closed { + r.closed = true + + h := (*mNode)(r.mHead) + h.initBuckets() + + for i := range h.buckets { + b := (*mBucket)(h.buckets[i]) + for _, n := range b.node { + // Call releaser. + if n.value != nil { + if r, ok := n.value.(util.Releaser); ok { + r.Release() + } + n.value = nil + } + + // Call OnDel. + for _, f := range n.onDel { + f() + } + n.onDel = nil + } + } + } + r.mu.Unlock() + + // Avoid deadlock. + if r.cacher != nil { + if err := r.cacher.Close(); err != nil { + return err + } + } + return nil +} + +// CloseWeak closes the 'cache map' and evict all 'cache node' from cacher, but +// unlike Close it doesn't forcefully releases 'cache node'. +func (r *Cache) CloseWeak() error { + r.mu.Lock() + if !r.closed { + r.closed = true + } + r.mu.Unlock() + + // Avoid deadlock. + if r.cacher != nil { + r.cacher.EvictAll() + if err := r.cacher.Close(); err != nil { + return err + } + } + return nil +} + +// Node is a 'cache node'. +type Node struct { + r *Cache + + hash uint32 + ns, key uint64 + + mu sync.Mutex + size int + value Value + + ref int32 + onDel []func() + + CacheData unsafe.Pointer +} + +// NS returns this 'cache node' namespace. +func (n *Node) NS() uint64 { + return n.ns +} + +// Key returns this 'cache node' key. +func (n *Node) Key() uint64 { + return n.key +} + +// Size returns this 'cache node' size. +func (n *Node) Size() int { + return n.size +} + +// Value returns this 'cache node' value. +func (n *Node) Value() Value { + return n.value +} + +// Ref returns this 'cache node' ref counter. +func (n *Node) Ref() int32 { + return atomic.LoadInt32(&n.ref) +} + +// GetHandle returns an handle for this 'cache node'. +func (n *Node) GetHandle() *Handle { + if atomic.AddInt32(&n.ref, 1) <= 1 { + panic("BUG: Node.GetHandle on zero ref") + } + return &Handle{unsafe.Pointer(n)} +} + +func (n *Node) unref() { + if atomic.AddInt32(&n.ref, -1) == 0 { + n.r.delete(n) + } +} + +func (n *Node) unrefLocked() { + if atomic.AddInt32(&n.ref, -1) == 0 { + n.r.mu.RLock() + if !n.r.closed { + n.r.delete(n) + } + n.r.mu.RUnlock() + } +} + +// Handle is a 'cache handle' of a 'cache node'. +type Handle struct { + n unsafe.Pointer // *Node +} + +// Value returns the value of the 'cache node'. +func (h *Handle) Value() Value { + n := (*Node)(atomic.LoadPointer(&h.n)) + if n != nil { + return n.value + } + return nil +} + +// Release releases this 'cache handle'. +// It is safe to call release multiple times. +func (h *Handle) Release() { + nPtr := atomic.LoadPointer(&h.n) + if nPtr != nil && atomic.CompareAndSwapPointer(&h.n, nPtr, nil) { + n := (*Node)(nPtr) + n.unrefLocked() + } +} + +func murmur32(ns, key uint64, seed uint32) uint32 { + const ( + m = uint32(0x5bd1e995) + r = 24 + ) + + k1 := uint32(ns >> 32) + k2 := uint32(ns) + k3 := uint32(key >> 32) + k4 := uint32(key) + + k1 *= m + k1 ^= k1 >> r + k1 *= m + + k2 *= m + k2 ^= k2 >> r + k2 *= m + + k3 *= m + k3 ^= k3 >> r + k3 *= m + + k4 *= m + k4 ^= k4 >> r + k4 *= m + + h := seed + + h *= m + h ^= k1 + h *= m + h ^= k2 + h *= m + h ^= k3 + h *= m + h ^= k4 + + h ^= h >> 13 + h *= m + h ^= h >> 15 + + return h +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/cache/lru.go b/vendor/github.com/syndtr/goleveldb/leveldb/cache/lru.go new file mode 100644 index 000000000000..d9a84cde15e8 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/cache/lru.go @@ -0,0 +1,195 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. 
+// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package cache + +import ( + "sync" + "unsafe" +) + +type lruNode struct { + n *Node + h *Handle + ban bool + + next, prev *lruNode +} + +func (n *lruNode) insert(at *lruNode) { + x := at.next + at.next = n + n.prev = at + n.next = x + x.prev = n +} + +func (n *lruNode) remove() { + if n.prev != nil { + n.prev.next = n.next + n.next.prev = n.prev + n.prev = nil + n.next = nil + } else { + panic("BUG: removing removed node") + } +} + +type lru struct { + mu sync.Mutex + capacity int + used int + recent lruNode +} + +func (r *lru) reset() { + r.recent.next = &r.recent + r.recent.prev = &r.recent + r.used = 0 +} + +func (r *lru) Capacity() int { + r.mu.Lock() + defer r.mu.Unlock() + return r.capacity +} + +func (r *lru) SetCapacity(capacity int) { + var evicted []*lruNode + + r.mu.Lock() + r.capacity = capacity + for r.used > r.capacity { + rn := r.recent.prev + if rn == nil { + panic("BUG: invalid LRU used or capacity counter") + } + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) Promote(n *Node) { + var evicted []*lruNode + + r.mu.Lock() + if n.CacheData == nil { + if n.Size() <= r.capacity { + rn := &lruNode{n: n, h: n.GetHandle()} + rn.insert(&r.recent) + n.CacheData = unsafe.Pointer(rn) + r.used += n.Size() + + for r.used > r.capacity { + rn := r.recent.prev + if rn == nil { + panic("BUG: invalid LRU used or capacity counter") + } + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + } + } else { + rn := (*lruNode)(n.CacheData) + if !rn.ban { + rn.remove() + rn.insert(&r.recent) + } + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) Ban(n *Node) { + r.mu.Lock() + if n.CacheData == nil { + n.CacheData = unsafe.Pointer(&lruNode{n: n, ban: true}) + } else { + rn := (*lruNode)(n.CacheData) + if !rn.ban { + rn.remove() + rn.ban = true + r.used -= rn.n.Size() + r.mu.Unlock() + + rn.h.Release() + rn.h = nil + return + } + } + r.mu.Unlock() +} + +func (r *lru) Evict(n *Node) { + r.mu.Lock() + rn := (*lruNode)(n.CacheData) + if rn == nil || rn.ban { + r.mu.Unlock() + return + } + n.CacheData = nil + r.mu.Unlock() + + rn.h.Release() +} + +func (r *lru) EvictNS(ns uint64) { + var evicted []*lruNode + + r.mu.Lock() + for e := r.recent.prev; e != &r.recent; { + rn := e + e = e.prev + if rn.n.NS() == ns { + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) EvictAll() { + r.mu.Lock() + back := r.recent.prev + for rn := back; rn != &r.recent; rn = rn.prev { + rn.n.CacheData = nil + } + r.reset() + r.mu.Unlock() + + for rn := back; rn != &r.recent; rn = rn.prev { + rn.h.Release() + } +} + +func (r *lru) Close() error { + return nil +} + +// NewLRU create a new LRU-cache. +func NewLRU(capacity int) Cacher { + r := &lru{capacity: capacity} + r.reset() + return r +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/comparer.go b/vendor/github.com/syndtr/goleveldb/leveldb/comparer.go new file mode 100644 index 000000000000..448402b82698 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/comparer.go @@ -0,0 +1,67 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. 
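Taken together, cache.NewCache and NewLRU above form the block-cache machinery used elsewhere in goleveldb. A small sketch of the Get/Handle lifecycle follows; the namespace, key, and capacity values are arbitrary.

package main

import (
	"fmt"

	"github.com/syndtr/goleveldb/leveldb/cache"
)

func main() {
	// An LRU cacher with a capacity of 128 "size units" backs the cache map.
	c := cache.NewCache(cache.NewLRU(128))

	// On a miss, setFunc creates the node; the returned handle pins it until Release.
	h := c.Get(1, 42, func() (size int, value cache.Value) {
		return 1, "answer"
	})
	if h != nil {
		fmt.Println(h.Value()) // "answer"
		h.Release()
	}

	// A second Get with a nil setFunc is a pure lookup.
	if h := c.Get(1, 42, nil); h != nil {
		fmt.Println("still cached:", h.Value())
		h.Release()
	}

	_ = c.Close()
}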
+// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/comparer" +) + +type iComparer struct { + ucmp comparer.Comparer +} + +func (icmp *iComparer) uName() string { + return icmp.ucmp.Name() +} + +func (icmp *iComparer) uCompare(a, b []byte) int { + return icmp.ucmp.Compare(a, b) +} + +func (icmp *iComparer) uSeparator(dst, a, b []byte) []byte { + return icmp.ucmp.Separator(dst, a, b) +} + +func (icmp *iComparer) uSuccessor(dst, b []byte) []byte { + return icmp.ucmp.Successor(dst, b) +} + +func (icmp *iComparer) Name() string { + return icmp.uName() +} + +func (icmp *iComparer) Compare(a, b []byte) int { + x := icmp.uCompare(internalKey(a).ukey(), internalKey(b).ukey()) + if x == 0 { + if m, n := internalKey(a).num(), internalKey(b).num(); m > n { + return -1 + } else if m < n { + return 1 + } + } + return x +} + +func (icmp *iComparer) Separator(dst, a, b []byte) []byte { + ua, ub := internalKey(a).ukey(), internalKey(b).ukey() + dst = icmp.uSeparator(dst, ua, ub) + if dst != nil && len(dst) < len(ua) && icmp.uCompare(ua, dst) < 0 { + // Append earliest possible number. + return append(dst, keyMaxNumBytes...) + } + return nil +} + +func (icmp *iComparer) Successor(dst, b []byte) []byte { + ub := internalKey(b).ukey() + dst = icmp.uSuccessor(dst, ub) + if dst != nil && len(dst) < len(ub) && icmp.uCompare(ub, dst) < 0 { + // Append earliest possible number. + return append(dst, keyMaxNumBytes...) + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/comparer/bytes_comparer.go b/vendor/github.com/syndtr/goleveldb/leveldb/comparer/bytes_comparer.go new file mode 100644 index 000000000000..abf9fb65c7a6 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/comparer/bytes_comparer.go @@ -0,0 +1,51 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package comparer + +import "bytes" + +type bytesComparer struct{} + +func (bytesComparer) Compare(a, b []byte) int { + return bytes.Compare(a, b) +} + +func (bytesComparer) Name() string { + return "leveldb.BytewiseComparator" +} + +func (bytesComparer) Separator(dst, a, b []byte) []byte { + i, n := 0, len(a) + if n > len(b) { + n = len(b) + } + for ; i < n && a[i] == b[i]; i++ { + } + if i >= n { + // Do not shorten if one string is a prefix of the other + } else if c := a[i]; c < 0xff && c+1 < b[i] { + dst = append(dst, a[:i+1]...) + dst[len(dst)-1]++ + return dst + } + return nil +} + +func (bytesComparer) Successor(dst, b []byte) []byte { + for i, c := range b { + if c != 0xff { + dst = append(dst, b[:i+1]...) + dst[len(dst)-1]++ + return dst + } + } + return nil +} + +// DefaultComparer are default implementation of the Comparer interface. +// It uses the natural ordering, consistent with bytes.Compare. +var DefaultComparer = bytesComparer{} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/comparer/comparer.go b/vendor/github.com/syndtr/goleveldb/leveldb/comparer/comparer.go new file mode 100644 index 000000000000..2c522db23b90 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/comparer/comparer.go @@ -0,0 +1,57 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
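The comparer code above defines goleveldb's key ordering. A quick sketch of DefaultComparer and its Separator/Successor helpers; the expected outputs in the comments follow directly from the bytesComparer implementation above.

package main

import (
	"fmt"

	"github.com/syndtr/goleveldb/leveldb/comparer"
)

func main() {
	c := comparer.DefaultComparer

	fmt.Println(c.Compare([]byte("abc"), []byte("abd"))) // -1, plain bytes.Compare ordering

	// Separator yields a short key x with "abc" <= x < "abq"; here "abd".
	fmt.Printf("%q\n", c.Separator(nil, []byte("abc"), []byte("abq")))

	// Successor yields a short key x >= "abc"; here "b".
	fmt.Printf("%q\n", c.Successor(nil, []byte("abc")))
}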
+ +// Package comparer provides interface and implementation for ordering +// sets of data. +package comparer + +// BasicComparer is the interface that wraps the basic Compare method. +type BasicComparer interface { + // Compare returns -1, 0, or +1 depending on whether a is 'less than', + // 'equal to' or 'greater than' b. The two arguments can only be 'equal' + // if their contents are exactly equal. Furthermore, the empty slice + // must be 'less than' any non-empty slice. + Compare(a, b []byte) int +} + +// Comparer defines a total ordering over the space of []byte keys: a 'less +// than' relationship. +type Comparer interface { + BasicComparer + + // Name returns name of the comparer. + // + // The Level-DB on-disk format stores the comparer name, and opening a + // database with a different comparer from the one it was created with + // will result in an error. + // + // An implementation to a new name whenever the comparer implementation + // changes in a way that will cause the relative ordering of any two keys + // to change. + // + // Names starting with "leveldb." are reserved and should not be used + // by any users of this package. + Name() string + + // Bellow are advanced functions used to reduce the space requirements + // for internal data structures such as index blocks. + + // Separator appends a sequence of bytes x to dst such that a <= x && x < b, + // where 'less than' is consistent with Compare. An implementation should + // return nil if x equal to a. + // + // Either contents of a or b should not by any means modified. Doing so + // may cause corruption on the internal state. + Separator(dst, a, b []byte) []byte + + // Successor appends a sequence of bytes x to dst such that x >= b, where + // 'less than' is consistent with Compare. An implementation should return + // nil if x equal to b. + // + // Contents of b should not by any means modified. Doing so may cause + // corruption on the internal state. + Successor(dst, b []byte) []byte +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db.go b/vendor/github.com/syndtr/goleveldb/leveldb/db.go new file mode 100644 index 000000000000..b27c38d37e67 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db.go @@ -0,0 +1,1175 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "container/list" + "fmt" + "io" + "os" + "runtime" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/journal" + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/table" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// DB is a LevelDB database. +type DB struct { + // Need 64-bit alignment. + seq uint64 + + // Stats. Need 64-bit alignment. + cWriteDelay int64 // The cumulative duration of write delays + cWriteDelayN int32 // The cumulative number of write delays + inWritePaused int32 // The indicator whether write operation is paused by compaction + aliveSnaps, aliveIters int32 + + // Session. + s *session + + // MemDB. 
+ memMu sync.RWMutex + memPool chan *memdb.DB + mem, frozenMem *memDB + journal *journal.Writer + journalWriter storage.Writer + journalFd storage.FileDesc + frozenJournalFd storage.FileDesc + frozenSeq uint64 + + // Snapshot. + snapsMu sync.Mutex + snapsList *list.List + + // Write. + batchPool sync.Pool + writeMergeC chan writeMerge + writeMergedC chan bool + writeLockC chan struct{} + writeAckC chan error + writeDelay time.Duration + writeDelayN int + tr *Transaction + + // Compaction. + compCommitLk sync.Mutex + tcompCmdC chan cCmd + tcompPauseC chan chan<- struct{} + mcompCmdC chan cCmd + compErrC chan error + compPerErrC chan error + compErrSetC chan error + compWriteLocking bool + compStats cStats + memdbMaxLevel int // For testing. + + // Close. + closeW sync.WaitGroup + closeC chan struct{} + closed uint32 + closer io.Closer +} + +func openDB(s *session) (*DB, error) { + s.log("db@open opening") + start := time.Now() + db := &DB{ + s: s, + // Initial sequence + seq: s.stSeqNum, + // MemDB + memPool: make(chan *memdb.DB, 1), + // Snapshot + snapsList: list.New(), + // Write + batchPool: sync.Pool{New: newBatch}, + writeMergeC: make(chan writeMerge), + writeMergedC: make(chan bool), + writeLockC: make(chan struct{}, 1), + writeAckC: make(chan error), + // Compaction + tcompCmdC: make(chan cCmd), + tcompPauseC: make(chan chan<- struct{}), + mcompCmdC: make(chan cCmd), + compErrC: make(chan error), + compPerErrC: make(chan error), + compErrSetC: make(chan error), + // Close + closeC: make(chan struct{}), + } + + // Read-only mode. + readOnly := s.o.GetReadOnly() + + if readOnly { + // Recover journals (read-only mode). + if err := db.recoverJournalRO(); err != nil { + return nil, err + } + } else { + // Recover journals. + if err := db.recoverJournal(); err != nil { + return nil, err + } + + // Remove any obsolete files. + if err := db.checkAndCleanFiles(); err != nil { + // Close journal. + if db.journal != nil { + db.journal.Close() + db.journalWriter.Close() + } + return nil, err + } + + } + + // Doesn't need to be included in the wait group. + go db.compactionError() + go db.mpoolDrain() + + if readOnly { + db.SetReadOnly() + } else { + db.closeW.Add(2) + go db.tCompaction() + go db.mCompaction() + // go db.jWriter() + } + + s.logf("db@open done T·%v", time.Since(start)) + + runtime.SetFinalizer(db, (*DB).Close) + return db, nil +} + +// Open opens or creates a DB for the given storage. +// The DB will be created if not exist, unless ErrorIfMissing is true. +// Also, if ErrorIfExist is true and the DB exist Open will returns +// os.ErrExist error. +// +// Open will return an error with type of ErrCorrupted if corruption +// detected in the DB. Use errors.IsCorrupted to test whether an error is +// due to corruption. Corrupted DB can be recovered with Recover function. +// +// The returned DB instance is safe for concurrent use. +// The DB must be closed after use, by calling Close method. +func Open(stor storage.Storage, o *opt.Options) (db *DB, err error) { + s, err := newSession(stor, o) + if err != nil { + return + } + defer func() { + if err != nil { + s.close() + s.release() + } + }() + + err = s.recover() + if err != nil { + if !os.IsNotExist(err) || s.o.GetErrorIfMissing() || s.o.GetReadOnly() { + return + } + err = s.create() + if err != nil { + return + } + } else if s.o.GetErrorIfExist() { + err = os.ErrExist + return + } + + return openDB(s) +} + +// OpenFile opens or creates a DB for the given path. 
+// The DB will be created if not exist, unless ErrorIfMissing is true. +// Also, if ErrorIfExist is true and the DB exist OpenFile will returns +// os.ErrExist error. +// +// OpenFile uses standard file-system backed storage implementation as +// described in the leveldb/storage package. +// +// OpenFile will return an error with type of ErrCorrupted if corruption +// detected in the DB. Use errors.IsCorrupted to test whether an error is +// due to corruption. Corrupted DB can be recovered with Recover function. +// +// The returned DB instance is safe for concurrent use. +// The DB must be closed after use, by calling Close method. +func OpenFile(path string, o *opt.Options) (db *DB, err error) { + stor, err := storage.OpenFile(path, o.GetReadOnly()) + if err != nil { + return + } + db, err = Open(stor, o) + if err != nil { + stor.Close() + } else { + db.closer = stor + } + return +} + +// Recover recovers and opens a DB with missing or corrupted manifest files +// for the given storage. It will ignore any manifest files, valid or not. +// The DB must already exist or it will returns an error. +// Also, Recover will ignore ErrorIfMissing and ErrorIfExist options. +// +// The returned DB instance is safe for concurrent use. +// The DB must be closed after use, by calling Close method. +func Recover(stor storage.Storage, o *opt.Options) (db *DB, err error) { + s, err := newSession(stor, o) + if err != nil { + return + } + defer func() { + if err != nil { + s.close() + s.release() + } + }() + + err = recoverTable(s, o) + if err != nil { + return + } + return openDB(s) +} + +// RecoverFile recovers and opens a DB with missing or corrupted manifest files +// for the given path. It will ignore any manifest files, valid or not. +// The DB must already exist or it will returns an error. +// Also, Recover will ignore ErrorIfMissing and ErrorIfExist options. +// +// RecoverFile uses standard file-system backed storage implementation as described +// in the leveldb/storage package. +// +// The returned DB instance is safe for concurrent use. +// The DB must be closed after use, by calling Close method. +func RecoverFile(path string, o *opt.Options) (db *DB, err error) { + stor, err := storage.OpenFile(path, false) + if err != nil { + return + } + db, err = Recover(stor, o) + if err != nil { + stor.Close() + } else { + db.closer = stor + } + return +} + +func recoverTable(s *session, o *opt.Options) error { + o = dupOptions(o) + // Mask StrictReader, lets StrictRecovery doing its job. + o.Strict &= ^opt.StrictReader + + // Get all tables and sort it by file number. + fds, err := s.stor.List(storage.TypeTable) + if err != nil { + return err + } + sortFds(fds) + + var ( + maxSeq uint64 + recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int + + // We will drop corrupted table. + strict = o.GetStrict(opt.StrictRecovery) + noSync = o.GetNoSync() + + rec = &sessionRecord{} + bpool = util.NewBufferPool(o.GetBlockSize() + 5) + ) + buildTable := func(iter iterator.Iterator) (tmpFd storage.FileDesc, size int64, err error) { + tmpFd = s.newTemp() + writer, err := s.stor.Create(tmpFd) + if err != nil { + return + } + defer func() { + writer.Close() + if err != nil { + s.stor.Remove(tmpFd) + tmpFd = storage.FileDesc{} + } + }() + + // Copy entries. 
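For completeness, a minimal sketch of the public entry points defined above: OpenFile returns a *DB, and the usual Put/Get/Close methods (defined elsewhere in this package, not in this hunk) are shown in use. The path is a placeholder.

package main

import (
	"fmt"
	"log"

	"github.com/syndtr/goleveldb/leveldb"
	"github.com/syndtr/goleveldb/leveldb/opt"
)

func main() {
	// With ErrorIfMissing unset, the database is created on first open.
	db, err := leveldb.OpenFile("/tmp/demo-db", &opt.Options{ErrorIfMissing: false})
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if err := db.Put([]byte("k"), []byte("v"), nil); err != nil {
		log.Fatal(err)
	}
	v, err := db.Get([]byte("k"), nil)
	fmt.Println(string(v), err)
}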
+ tw := table.NewWriter(writer, o) + for iter.Next() { + key := iter.Key() + if validInternalKey(key) { + err = tw.Append(key, iter.Value()) + if err != nil { + return + } + } + } + err = iter.Error() + if err != nil && !errors.IsCorrupted(err) { + return + } + err = tw.Close() + if err != nil { + return + } + if !noSync { + err = writer.Sync() + if err != nil { + return + } + } + size = int64(tw.BytesLen()) + return + } + recoverTable := func(fd storage.FileDesc) error { + s.logf("table@recovery recovering @%d", fd.Num) + reader, err := s.stor.Open(fd) + if err != nil { + return err + } + var closed bool + defer func() { + if !closed { + reader.Close() + } + }() + + // Get file size. + size, err := reader.Seek(0, 2) + if err != nil { + return err + } + + var ( + tSeq uint64 + tgoodKey, tcorruptedKey, tcorruptedBlock int + imin, imax []byte + ) + tr, err := table.NewReader(reader, size, fd, nil, bpool, o) + if err != nil { + return err + } + iter := tr.NewIterator(nil, nil) + if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok { + itererr.SetErrorCallback(func(err error) { + if errors.IsCorrupted(err) { + s.logf("table@recovery block corruption @%d %q", fd.Num, err) + tcorruptedBlock++ + } + }) + } + + // Scan the table. + for iter.Next() { + key := iter.Key() + _, seq, _, kerr := parseInternalKey(key) + if kerr != nil { + tcorruptedKey++ + continue + } + tgoodKey++ + if seq > tSeq { + tSeq = seq + } + if imin == nil { + imin = append([]byte{}, key...) + } + imax = append(imax[:0], key...) + } + if err := iter.Error(); err != nil && !errors.IsCorrupted(err) { + iter.Release() + return err + } + iter.Release() + + goodKey += tgoodKey + corruptedKey += tcorruptedKey + corruptedBlock += tcorruptedBlock + + if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) { + droppedTable++ + s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) + return nil + } + + if tgoodKey > 0 { + if tcorruptedKey > 0 || tcorruptedBlock > 0 { + // Rebuild the table. + s.logf("table@recovery rebuilding @%d", fd.Num) + iter := tr.NewIterator(nil, nil) + tmpFd, newSize, err := buildTable(iter) + iter.Release() + if err != nil { + return err + } + closed = true + reader.Close() + if err := s.stor.Rename(tmpFd, fd); err != nil { + return err + } + size = newSize + } + if tSeq > maxSeq { + maxSeq = tSeq + } + recoveredKey += tgoodKey + // Add table to level 0. + rec.addTable(0, fd.Num, size, imin, imax) + s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) + } else { + droppedTable++ + s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", fd.Num, tcorruptedKey, tcorruptedBlock, size) + } + + return nil + } + + // Recover all tables. + if len(fds) > 0 { + s.logf("table@recovery F·%d", len(fds)) + + // Mark file number as used. + s.markFileNum(fds[len(fds)-1].Num) + + for _, fd := range fds { + if err := recoverTable(fd); err != nil { + return err + } + } + + s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(fds), recoveredKey, goodKey, corruptedKey, maxSeq) + } + + // Set sequence number. + rec.setSeqNum(maxSeq) + + // Create new manifest. + if err := s.create(); err != nil { + return err + } + + // Commit. + return s.commit(rec) +} + +func (db *DB) recoverJournal() error { + // Get all journals and sort it by file number. 
+ rawFds, err := db.s.stor.List(storage.TypeJournal) + if err != nil { + return err + } + sortFds(rawFds) + + // Journals that will be recovered. + var fds []storage.FileDesc + for _, fd := range rawFds { + if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum { + fds = append(fds, fd) + } + } + + var ( + ofd storage.FileDesc // Obsolete file. + rec = &sessionRecord{} + ) + + // Recover journals. + if len(fds) > 0 { + db.logf("journal@recovery F·%d", len(fds)) + + // Mark file number as used. + db.s.markFileNum(fds[len(fds)-1].Num) + + var ( + // Options. + strict = db.s.o.GetStrict(opt.StrictJournal) + checksum = db.s.o.GetStrict(opt.StrictJournalChecksum) + writeBuffer = db.s.o.GetWriteBuffer() + + jr *journal.Reader + mdb = memdb.New(db.s.icmp, writeBuffer) + buf = &util.Buffer{} + batchSeq uint64 + batchLen int + ) + + for _, fd := range fds { + db.logf("journal@recovery recovering @%d", fd.Num) + + fr, err := db.s.stor.Open(fd) + if err != nil { + return err + } + + // Create or reset journal reader instance. + if jr == nil { + jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum) + } else { + jr.Reset(fr, dropper{db.s, fd}, strict, checksum) + } + + // Flush memdb and remove obsolete journal file. + if !ofd.Zero() { + if mdb.Len() > 0 { + if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil { + fr.Close() + return err + } + } + + rec.setJournalNum(fd.Num) + rec.setSeqNum(db.seq) + if err := db.s.commit(rec); err != nil { + fr.Close() + return err + } + rec.resetAddedTables() + + db.s.stor.Remove(ofd) + ofd = storage.FileDesc{} + } + + // Replay journal to memdb. + mdb.Reset() + for { + r, err := jr.Next() + if err != nil { + if err == io.EOF { + break + } + + fr.Close() + return errors.SetFd(err, fd) + } + + buf.Reset() + if _, err := buf.ReadFrom(r); err != nil { + if err == io.ErrUnexpectedEOF { + // This is error returned due to corruption, with strict == false. + continue + } + + fr.Close() + return errors.SetFd(err, fd) + } + batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb) + if err != nil { + if !strict && errors.IsCorrupted(err) { + db.s.logf("journal error: %v (skipped)", err) + // We won't apply sequence number as it might be corrupted. + continue + } + + fr.Close() + return errors.SetFd(err, fd) + } + + // Save sequence number. + db.seq = batchSeq + uint64(batchLen) + + // Flush it if large enough. + if mdb.Size() >= writeBuffer { + if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil { + fr.Close() + return err + } + + mdb.Reset() + } + } + + fr.Close() + ofd = fd + } + + // Flush the last memdb. + if mdb.Len() > 0 { + if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil { + return err + } + } + } + + // Create a new journal. + if _, err := db.newMem(0); err != nil { + return err + } + + // Commit. + rec.setJournalNum(db.journalFd.Num) + rec.setSeqNum(db.seq) + if err := db.s.commit(rec); err != nil { + // Close journal on error. + if db.journal != nil { + db.journal.Close() + db.journalWriter.Close() + } + return err + } + + // Remove the last obsolete journal file. + if !ofd.Zero() { + db.s.stor.Remove(ofd) + } + + return nil +} + +func (db *DB) recoverJournalRO() error { + // Get all journals and sort it by file number. + rawFds, err := db.s.stor.List(storage.TypeJournal) + if err != nil { + return err + } + sortFds(rawFds) + + // Journals that will be recovered. 
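+	// In read-only mode the journals are replayed into a single memdb and
+	// nothing is flushed or committed to the manifest; that memdb simply
+	// becomes the effective memdb of the opened DB.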
+ var fds []storage.FileDesc + for _, fd := range rawFds { + if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum { + fds = append(fds, fd) + } + } + + var ( + // Options. + strict = db.s.o.GetStrict(opt.StrictJournal) + checksum = db.s.o.GetStrict(opt.StrictJournalChecksum) + writeBuffer = db.s.o.GetWriteBuffer() + + mdb = memdb.New(db.s.icmp, writeBuffer) + ) + + // Recover journals. + if len(fds) > 0 { + db.logf("journal@recovery RO·Mode F·%d", len(fds)) + + var ( + jr *journal.Reader + buf = &util.Buffer{} + batchSeq uint64 + batchLen int + ) + + for _, fd := range fds { + db.logf("journal@recovery recovering @%d", fd.Num) + + fr, err := db.s.stor.Open(fd) + if err != nil { + return err + } + + // Create or reset journal reader instance. + if jr == nil { + jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum) + } else { + jr.Reset(fr, dropper{db.s, fd}, strict, checksum) + } + + // Replay journal to memdb. + for { + r, err := jr.Next() + if err != nil { + if err == io.EOF { + break + } + + fr.Close() + return errors.SetFd(err, fd) + } + + buf.Reset() + if _, err := buf.ReadFrom(r); err != nil { + if err == io.ErrUnexpectedEOF { + // This is error returned due to corruption, with strict == false. + continue + } + + fr.Close() + return errors.SetFd(err, fd) + } + batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb) + if err != nil { + if !strict && errors.IsCorrupted(err) { + db.s.logf("journal error: %v (skipped)", err) + // We won't apply sequence number as it might be corrupted. + continue + } + + fr.Close() + return errors.SetFd(err, fd) + } + + // Save sequence number. + db.seq = batchSeq + uint64(batchLen) + } + + fr.Close() + } + } + + // Set memDB. + db.mem = &memDB{db: db, DB: mdb, ref: 1} + + return nil +} + +func memGet(mdb *memdb.DB, ikey internalKey, icmp *iComparer) (ok bool, mv []byte, err error) { + mk, mv, err := mdb.Find(ikey) + if err == nil { + ukey, _, kt, kerr := parseInternalKey(mk) + if kerr != nil { + // Shouldn't have had happen. + panic(kerr) + } + if icmp.uCompare(ukey, ikey.ukey()) == 0 { + if kt == keyTypeDel { + return true, nil, ErrNotFound + } + return true, mv, nil + + } + } else if err != ErrNotFound { + return true, nil, err + } + return +} + +func (db *DB) get(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) { + ikey := makeInternalKey(nil, key, seq, keyTypeSeek) + + if auxm != nil { + if ok, mv, me := memGet(auxm, ikey, db.s.icmp); ok { + return append([]byte{}, mv...), me + } + } + + em, fm := db.getMems() + for _, m := range [...]*memDB{em, fm} { + if m == nil { + continue + } + defer m.decref() + + if ok, mv, me := memGet(m.DB, ikey, db.s.icmp); ok { + return append([]byte{}, mv...), me + } + } + + v := db.s.version() + value, cSched, err := v.get(auxt, ikey, ro, false) + v.release() + if cSched { + // Trigger table compaction. 
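+		// compTrigger is non-blocking, so a read never waits for compaction;
+		// it only schedules one when the lookup reports it is needed (cSched).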
+ db.compTrigger(db.tcompCmdC) + } + return +} + +func nilIfNotFound(err error) error { + if err == ErrNotFound { + return nil + } + return err +} + +func (db *DB) has(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err error) { + ikey := makeInternalKey(nil, key, seq, keyTypeSeek) + + if auxm != nil { + if ok, _, me := memGet(auxm, ikey, db.s.icmp); ok { + return me == nil, nilIfNotFound(me) + } + } + + em, fm := db.getMems() + for _, m := range [...]*memDB{em, fm} { + if m == nil { + continue + } + defer m.decref() + + if ok, _, me := memGet(m.DB, ikey, db.s.icmp); ok { + return me == nil, nilIfNotFound(me) + } + } + + v := db.s.version() + _, cSched, err := v.get(auxt, ikey, ro, true) + v.release() + if cSched { + // Trigger table compaction. + db.compTrigger(db.tcompCmdC) + } + if err == nil { + ret = true + } else if err == ErrNotFound { + err = nil + } + return +} + +// Get gets the value for the given key. It returns ErrNotFound if the +// DB does not contains the key. +// +// The returned slice is its own copy, it is safe to modify the contents +// of the returned slice. +// It is safe to modify the contents of the argument after Get returns. +func (db *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + err = db.ok() + if err != nil { + return + } + + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + return db.get(nil, nil, key, se.seq, ro) +} + +// Has returns true if the DB does contains the given key. +// +// It is safe to modify the contents of the argument after Has returns. +func (db *DB) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) { + err = db.ok() + if err != nil { + return + } + + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + return db.has(nil, nil, key, se.seq, ro) +} + +// NewIterator returns an iterator for the latest snapshot of the +// underlying DB. +// The returned iterator is not safe for concurrent use, but it is safe to use +// multiple iterators concurrently, with each in a dedicated goroutine. +// It is also safe to use an iterator concurrently with modifying its +// underlying DB. The resultant key/value pairs are guaranteed to be +// consistent. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// DB. And a nil Range.Limit is treated as a key after all keys in +// the DB. +// +// The iterator must be released after use, by calling Release method. +// +// Also read Iterator documentation of the leveldb/iterator package. +func (db *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + if err := db.ok(); err != nil { + return iterator.NewEmptyIterator(err) + } + + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + // Iterator holds 'version' lock, 'version' is immutable so snapshot + // can be released after iterator created. + return db.newIterator(nil, nil, se.seq, slice, ro) +} + +// GetSnapshot returns a latest snapshot of the underlying DB. A snapshot +// is a frozen snapshot of a DB state at a particular point in time. The +// content of snapshot are guaranteed to be consistent. +// +// The snapshot must be released after use, by calling Release method. +func (db *DB) GetSnapshot() (*Snapshot, error) { + if err := db.ok(); err != nil { + return nil, err + } + + return db.newSnapshot(), nil +} + +// GetProperty returns value of the given property name. 
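+// Values are returned as human-readable strings; querying a name outside the
+// list below returns ErrNotFound. For example (illustrative):
+//
+//	nfiles, _ := db.GetProperty("leveldb.num-files-at-level0")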
+// +// Property names: +// leveldb.num-files-at-level{n} +// Returns the number of files at level 'n'. +// leveldb.stats +// Returns statistics of the underlying DB. +// leveldb.iostats +// Returns statistics of effective disk read and write. +// leveldb.writedelay +// Returns cumulative write delay caused by compaction. +// leveldb.sstables +// Returns sstables list for each level. +// leveldb.blockpool +// Returns block pool stats. +// leveldb.cachedblock +// Returns size of cached block. +// leveldb.openedtables +// Returns number of opened tables. +// leveldb.alivesnaps +// Returns number of alive snapshots. +// leveldb.aliveiters +// Returns number of alive iterators. +func (db *DB) GetProperty(name string) (value string, err error) { + err = db.ok() + if err != nil { + return + } + + const prefix = "leveldb." + if !strings.HasPrefix(name, prefix) { + return "", ErrNotFound + } + p := name[len(prefix):] + + v := db.s.version() + defer v.release() + + numFilesPrefix := "num-files-at-level" + switch { + case strings.HasPrefix(p, numFilesPrefix): + var level uint + var rest string + n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest) + if n != 1 { + err = ErrNotFound + } else { + value = fmt.Sprint(v.tLen(int(level))) + } + case p == "stats": + value = "Compactions\n" + + " Level | Tables | Size(MB) | Time(sec) | Read(MB) | Write(MB)\n" + + "-------+------------+---------------+---------------+---------------+---------------\n" + for level, tables := range v.levels { + duration, read, write := db.compStats.getStat(level) + if len(tables) == 0 && duration == 0 { + continue + } + value += fmt.Sprintf(" %3d | %10d | %13.5f | %13.5f | %13.5f | %13.5f\n", + level, len(tables), float64(tables.size())/1048576.0, duration.Seconds(), + float64(read)/1048576.0, float64(write)/1048576.0) + } + case p == "iostats": + value = fmt.Sprintf("Read(MB):%.5f Write(MB):%.5f", + float64(db.s.stor.reads())/1048576.0, + float64(db.s.stor.writes())/1048576.0) + case p == "writedelay": + writeDelayN, writeDelay := atomic.LoadInt32(&db.cWriteDelayN), time.Duration(atomic.LoadInt64(&db.cWriteDelay)) + paused := atomic.LoadInt32(&db.inWritePaused) == 1 + value = fmt.Sprintf("DelayN:%d Delay:%s Paused:%t", writeDelayN, writeDelay, paused) + case p == "sstables": + for level, tables := range v.levels { + value += fmt.Sprintf("--- level %d ---\n", level) + for _, t := range tables { + value += fmt.Sprintf("%d:%d[%q .. %q]\n", t.fd.Num, t.size, t.imin, t.imax) + } + } + case p == "blockpool": + value = fmt.Sprintf("%v", db.s.tops.bpool) + case p == "cachedblock": + if db.s.tops.bcache != nil { + value = fmt.Sprintf("%d", db.s.tops.bcache.Size()) + } else { + value = "" + } + case p == "openedtables": + value = fmt.Sprintf("%d", db.s.tops.cache.Size()) + case p == "alivesnaps": + value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveSnaps)) + case p == "aliveiters": + value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveIters)) + default: + err = ErrNotFound + } + + return +} + +// DBStats is database statistics. +type DBStats struct { + WriteDelayCount int32 + WriteDelayDuration time.Duration + WritePaused bool + + AliveSnapshots int32 + AliveIterators int32 + + IOWrite uint64 + IORead uint64 + + BlockCacheSize int + OpenedTablesCount int + + LevelSizes []int64 + LevelTablesCounts []int + LevelRead []int64 + LevelWrite []int64 + LevelDurations []time.Duration +} + +// Stats populates s with database statistics. 
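+// The slice fields of s are truncated and reused, so a single DBStats value
+// can be passed repeatedly. Illustrative usage (hypothetical variable names):
+//
+//	var st leveldb.DBStats
+//	if err := db.Stats(&st); err == nil {
+//		fmt.Printf("alive iterators: %d, snapshots: %d\n", st.AliveIterators, st.AliveSnapshots)
+//	}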
+func (db *DB) Stats(s *DBStats) error { + err := db.ok() + if err != nil { + return err + } + + s.IORead = db.s.stor.reads() + s.IOWrite = db.s.stor.writes() + s.WriteDelayCount = atomic.LoadInt32(&db.cWriteDelayN) + s.WriteDelayDuration = time.Duration(atomic.LoadInt64(&db.cWriteDelay)) + s.WritePaused = atomic.LoadInt32(&db.inWritePaused) == 1 + + s.OpenedTablesCount = db.s.tops.cache.Size() + if db.s.tops.bcache != nil { + s.BlockCacheSize = db.s.tops.bcache.Size() + } else { + s.BlockCacheSize = 0 + } + + s.AliveIterators = atomic.LoadInt32(&db.aliveIters) + s.AliveSnapshots = atomic.LoadInt32(&db.aliveSnaps) + + s.LevelDurations = s.LevelDurations[:0] + s.LevelRead = s.LevelRead[:0] + s.LevelWrite = s.LevelWrite[:0] + s.LevelSizes = s.LevelSizes[:0] + s.LevelTablesCounts = s.LevelTablesCounts[:0] + + v := db.s.version() + defer v.release() + + for level, tables := range v.levels { + duration, read, write := db.compStats.getStat(level) + if len(tables) == 0 && duration == 0 { + continue + } + s.LevelDurations = append(s.LevelDurations, duration) + s.LevelRead = append(s.LevelRead, read) + s.LevelWrite = append(s.LevelWrite, write) + s.LevelSizes = append(s.LevelSizes, tables.size()) + s.LevelTablesCounts = append(s.LevelTablesCounts, len(tables)) + } + + return nil +} + +// SizeOf calculates approximate sizes of the given key ranges. +// The length of the returned sizes are equal with the length of the given +// ranges. The returned sizes measure storage space usage, so if the user +// data compresses by a factor of ten, the returned sizes will be one-tenth +// the size of the corresponding user data size. +// The results may not include the sizes of recently written data. +func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) { + if err := db.ok(); err != nil { + return nil, err + } + + v := db.s.version() + defer v.release() + + sizes := make(Sizes, 0, len(ranges)) + for _, r := range ranges { + imin := makeInternalKey(nil, r.Start, keyMaxSeq, keyTypeSeek) + imax := makeInternalKey(nil, r.Limit, keyMaxSeq, keyTypeSeek) + start, err := v.offsetOf(imin) + if err != nil { + return nil, err + } + limit, err := v.offsetOf(imax) + if err != nil { + return nil, err + } + var size int64 + if limit >= start { + size = limit - start + } + sizes = append(sizes, size) + } + + return sizes, nil +} + +// Close closes the DB. This will also releases any outstanding snapshot, +// abort any in-flight compaction and discard open transaction. +// +// It is not safe to close a DB until all outstanding iterators are released. +// It is valid to call Close multiple times. Other methods should not be +// called after the DB has been closed. +func (db *DB) Close() error { + if !db.setClosed() { + return ErrClosed + } + + start := time.Now() + db.log("db@close closing") + + // Clear the finalizer. + runtime.SetFinalizer(db, nil) + + // Get compaction error. + var err error + select { + case err = <-db.compErrC: + if err == ErrReadOnly { + err = nil + } + default: + } + + // Signal all goroutines. + close(db.closeC) + + // Discard open transaction. + if db.tr != nil { + db.tr.Discard() + } + + // Acquire writer lock. + db.writeLockC <- struct{}{} + + // Wait for all gorotines to exit. + db.closeW.Wait() + + // Closes journal. + if db.journal != nil { + db.journal.Close() + db.journalWriter.Close() + db.journal = nil + db.journalWriter = nil + } + + if db.writeDelayN > 0 { + db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) + } + + // Close session. 
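+	// Close the session first, then the storage handle captured by
+	// OpenFile/RecoverFile (db.closer); an earlier compaction error, if any,
+	// takes precedence over the storage close error.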
+ db.s.close() + db.logf("db@close done T·%v", time.Since(start)) + db.s.release() + + if db.closer != nil { + if err1 := db.closer.Close(); err == nil { + err = err1 + } + db.closer = nil + } + + // Clear memdbs. + db.clearMems() + + return err +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go new file mode 100644 index 000000000000..0c1b9a53b8e7 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go @@ -0,0 +1,854 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "sync" + "time" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +var ( + errCompactionTransactExiting = errors.New("leveldb: compaction transact exiting") +) + +type cStat struct { + duration time.Duration + read int64 + write int64 +} + +func (p *cStat) add(n *cStatStaging) { + p.duration += n.duration + p.read += n.read + p.write += n.write +} + +func (p *cStat) get() (duration time.Duration, read, write int64) { + return p.duration, p.read, p.write +} + +type cStatStaging struct { + start time.Time + duration time.Duration + on bool + read int64 + write int64 +} + +func (p *cStatStaging) startTimer() { + if !p.on { + p.start = time.Now() + p.on = true + } +} + +func (p *cStatStaging) stopTimer() { + if p.on { + p.duration += time.Since(p.start) + p.on = false + } +} + +type cStats struct { + lk sync.Mutex + stats []cStat +} + +func (p *cStats) addStat(level int, n *cStatStaging) { + p.lk.Lock() + if level >= len(p.stats) { + newStats := make([]cStat, level+1) + copy(newStats, p.stats) + p.stats = newStats + } + p.stats[level].add(n) + p.lk.Unlock() +} + +func (p *cStats) getStat(level int) (duration time.Duration, read, write int64) { + p.lk.Lock() + defer p.lk.Unlock() + if level < len(p.stats) { + return p.stats[level].get() + } + return +} + +func (db *DB) compactionError() { + var err error +noerr: + // No error. + for { + select { + case err = <-db.compErrSetC: + switch { + case err == nil: + case err == ErrReadOnly, errors.IsCorrupted(err): + goto hasperr + default: + goto haserr + } + case <-db.closeC: + return + } + } +haserr: + // Transient error. + for { + select { + case db.compErrC <- err: + case err = <-db.compErrSetC: + switch { + case err == nil: + goto noerr + case err == ErrReadOnly, errors.IsCorrupted(err): + goto hasperr + default: + } + case <-db.closeC: + return + } + } +hasperr: + // Persistent error. + for { + select { + case db.compErrC <- err: + case db.compPerErrC <- err: + case db.writeLockC <- struct{}{}: + // Hold write lock, so that write won't pass-through. + db.compWriteLocking = true + case <-db.closeC: + if db.compWriteLocking { + // We should release the lock or Close will hang. 
+ <-db.writeLockC + } + return + } + } +} + +type compactionTransactCounter int + +func (cnt *compactionTransactCounter) incr() { + *cnt++ +} + +type compactionTransactInterface interface { + run(cnt *compactionTransactCounter) error + revert() error +} + +func (db *DB) compactionTransact(name string, t compactionTransactInterface) { + defer func() { + if x := recover(); x != nil { + if x == errCompactionTransactExiting { + if err := t.revert(); err != nil { + db.logf("%s revert error %q", name, err) + } + } + panic(x) + } + }() + + const ( + backoffMin = 1 * time.Second + backoffMax = 8 * time.Second + backoffMul = 2 * time.Second + ) + var ( + backoff = backoffMin + backoffT = time.NewTimer(backoff) + lastCnt = compactionTransactCounter(0) + + disableBackoff = db.s.o.GetDisableCompactionBackoff() + ) + for n := 0; ; n++ { + // Check whether the DB is closed. + if db.isClosed() { + db.logf("%s exiting", name) + db.compactionExitTransact() + } else if n > 0 { + db.logf("%s retrying N·%d", name, n) + } + + // Execute. + cnt := compactionTransactCounter(0) + err := t.run(&cnt) + if err != nil { + db.logf("%s error I·%d %q", name, cnt, err) + } + + // Set compaction error status. + select { + case db.compErrSetC <- err: + case perr := <-db.compPerErrC: + if err != nil { + db.logf("%s exiting (persistent error %q)", name, perr) + db.compactionExitTransact() + } + case <-db.closeC: + db.logf("%s exiting", name) + db.compactionExitTransact() + } + if err == nil { + return + } + if errors.IsCorrupted(err) { + db.logf("%s exiting (corruption detected)", name) + db.compactionExitTransact() + } + + if !disableBackoff { + // Reset backoff duration if counter is advancing. + if cnt > lastCnt { + backoff = backoffMin + lastCnt = cnt + } + + // Backoff. + backoffT.Reset(backoff) + if backoff < backoffMax { + backoff *= backoffMul + if backoff > backoffMax { + backoff = backoffMax + } + } + select { + case <-backoffT.C: + case <-db.closeC: + db.logf("%s exiting", name) + db.compactionExitTransact() + } + } + } +} + +type compactionTransactFunc struct { + runFunc func(cnt *compactionTransactCounter) error + revertFunc func() error +} + +func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error { + return t.runFunc(cnt) +} + +func (t *compactionTransactFunc) revert() error { + if t.revertFunc != nil { + return t.revertFunc() + } + return nil +} + +func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) { + db.compactionTransact(name, &compactionTransactFunc{run, revert}) +} + +func (db *DB) compactionExitTransact() { + panic(errCompactionTransactExiting) +} + +func (db *DB) compactionCommit(name string, rec *sessionRecord) { + db.compCommitLk.Lock() + defer db.compCommitLk.Unlock() // Defer is necessary. + db.compactionTransactFunc(name+"@commit", func(cnt *compactionTransactCounter) error { + return db.s.commit(rec) + }, nil) +} + +func (db *DB) memCompaction() { + mdb := db.getFrozenMem() + if mdb == nil { + return + } + defer mdb.decref() + + db.logf("memdb@flush N·%d S·%s", mdb.Len(), shortenb(mdb.Size())) + + // Don't compact empty memdb. + if mdb.Len() == 0 { + db.logf("memdb@flush skipping") + // drop frozen memdb + db.dropFrozenMem() + return + } + + // Pause table compaction. 
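+	// The flush must not run concurrently with a table compaction, so a
+	// resume channel is handed to the compaction goroutine; if a persistent
+	// compaction error is pending, the pause is skipped instead.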
+ resumeC := make(chan struct{}) + select { + case db.tcompPauseC <- (chan<- struct{})(resumeC): + case <-db.compPerErrC: + close(resumeC) + resumeC = nil + case <-db.closeC: + db.compactionExitTransact() + } + + var ( + rec = &sessionRecord{} + stats = &cStatStaging{} + flushLevel int + ) + + // Generate tables. + db.compactionTransactFunc("memdb@flush", func(cnt *compactionTransactCounter) (err error) { + stats.startTimer() + flushLevel, err = db.s.flushMemdb(rec, mdb.DB, db.memdbMaxLevel) + stats.stopTimer() + return + }, func() error { + for _, r := range rec.addedTables { + db.logf("memdb@flush revert @%d", r.num) + if err := db.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: r.num}); err != nil { + return err + } + } + return nil + }) + + rec.setJournalNum(db.journalFd.Num) + rec.setSeqNum(db.frozenSeq) + + // Commit. + stats.startTimer() + db.compactionCommit("memdb", rec) + stats.stopTimer() + + db.logf("memdb@flush committed F·%d T·%v", len(rec.addedTables), stats.duration) + + for _, r := range rec.addedTables { + stats.write += r.size + } + db.compStats.addStat(flushLevel, stats) + + // Drop frozen memdb. + db.dropFrozenMem() + + // Resume table compaction. + if resumeC != nil { + select { + case <-resumeC: + close(resumeC) + case <-db.closeC: + db.compactionExitTransact() + } + } + + // Trigger table compaction. + db.compTrigger(db.tcompCmdC) +} + +type tableCompactionBuilder struct { + db *DB + s *session + c *compaction + rec *sessionRecord + stat0, stat1 *cStatStaging + + snapHasLastUkey bool + snapLastUkey []byte + snapLastSeq uint64 + snapIter int + snapKerrCnt int + snapDropCnt int + + kerrCnt int + dropCnt int + + minSeq uint64 + strict bool + tableSize int + + tw *tWriter +} + +func (b *tableCompactionBuilder) appendKV(key, value []byte) error { + // Create new table if not already. + if b.tw == nil { + // Check for pause event. + if b.db != nil { + select { + case ch := <-b.db.tcompPauseC: + b.db.pauseCompaction(ch) + case <-b.db.closeC: + b.db.compactionExitTransact() + default: + } + } + + // Create new table. + var err error + b.tw, err = b.s.tops.create() + if err != nil { + return err + } + } + + // Write key/value into table. + return b.tw.append(key, value) +} + +func (b *tableCompactionBuilder) needFlush() bool { + return b.tw.tw.BytesLen() >= b.tableSize +} + +func (b *tableCompactionBuilder) flush() error { + t, err := b.tw.finish() + if err != nil { + return err + } + b.rec.addTableFile(b.c.sourceLevel+1, t) + b.stat1.write += t.size + b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.sourceLevel+1, t.fd.Num, b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax) + b.tw = nil + return nil +} + +func (b *tableCompactionBuilder) cleanup() { + if b.tw != nil { + b.tw.drop() + b.tw = nil + } +} + +func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error { + snapResumed := b.snapIter > 0 + hasLastUkey := b.snapHasLastUkey // The key might has zero length, so this is necessary. + lastUkey := append([]byte{}, b.snapLastUkey...) + lastSeq := b.snapLastSeq + b.kerrCnt = b.snapKerrCnt + b.dropCnt = b.snapDropCnt + // Restore compaction state. + b.c.restore() + + defer b.cleanup() + + b.stat1.startTimer() + defer b.stat1.stopTimer() + + iter := b.c.newIterator() + defer iter.Release() + for i := 0; iter.Next(); i++ { + // Incr transact counter. + cnt.incr() + + // Skip until last state. 
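+		// When run is retried after a transient error, snapIter records how
+		// far the previous attempt got, so already-written entries are skipped.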
+ if i < b.snapIter { + continue + } + + resumed := false + if snapResumed { + resumed = true + snapResumed = false + } + + ikey := iter.Key() + ukey, seq, kt, kerr := parseInternalKey(ikey) + + if kerr == nil { + shouldStop := !resumed && b.c.shouldStopBefore(ikey) + + if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 { + // First occurrence of this user key. + + // Only rotate tables if ukey doesn't hop across. + if b.tw != nil && (shouldStop || b.needFlush()) { + if err := b.flush(); err != nil { + return err + } + + // Creates snapshot of the state. + b.c.save() + b.snapHasLastUkey = hasLastUkey + b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...) + b.snapLastSeq = lastSeq + b.snapIter = i + b.snapKerrCnt = b.kerrCnt + b.snapDropCnt = b.dropCnt + } + + hasLastUkey = true + lastUkey = append(lastUkey[:0], ukey...) + lastSeq = keyMaxSeq + } + + switch { + case lastSeq <= b.minSeq: + // Dropped because newer entry for same user key exist + fallthrough // (A) + case kt == keyTypeDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey): + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger seq numbers + // (3) data in layers that are being compacted here and have + // smaller seq numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + lastSeq = seq + b.dropCnt++ + continue + default: + lastSeq = seq + } + } else { + if b.strict { + return kerr + } + + // Don't drop corrupted keys. + hasLastUkey = false + lastUkey = lastUkey[:0] + lastSeq = keyMaxSeq + b.kerrCnt++ + } + + if err := b.appendKV(ikey, iter.Value()); err != nil { + return err + } + } + + if err := iter.Error(); err != nil { + return err + } + + // Finish last table. + if b.tw != nil && !b.tw.empty() { + return b.flush() + } + return nil +} + +func (b *tableCompactionBuilder) revert() error { + for _, at := range b.rec.addedTables { + b.s.logf("table@build revert @%d", at.num) + if err := b.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: at.num}); err != nil { + return err + } + } + return nil +} + +func (db *DB) tableCompaction(c *compaction, noTrivial bool) { + defer c.release() + + rec := &sessionRecord{} + rec.addCompPtr(c.sourceLevel, c.imax) + + if !noTrivial && c.trivial() { + t := c.levels[0][0] + db.logf("table@move L%d@%d -> L%d", c.sourceLevel, t.fd.Num, c.sourceLevel+1) + rec.delTable(c.sourceLevel, t.fd.Num) + rec.addTableFile(c.sourceLevel+1, t) + db.compactionCommit("table-move", rec) + return + } + + var stats [2]cStatStaging + for i, tables := range c.levels { + for _, t := range tables { + stats[i].read += t.size + // Insert deleted tables into record + rec.delTable(c.sourceLevel+i, t.fd.Num) + } + } + sourceSize := int(stats[0].read + stats[1].read) + minSeq := db.minSeq() + db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.sourceLevel, len(c.levels[0]), c.sourceLevel+1, len(c.levels[1]), shortenb(sourceSize), minSeq) + + b := &tableCompactionBuilder{ + db: db, + s: db.s, + c: c, + rec: rec, + stat1: &stats[1], + minSeq: minSeq, + strict: db.s.o.GetStrict(opt.StrictCompaction), + tableSize: db.s.o.GetCompactionTableSize(c.sourceLevel + 1), + } + db.compactionTransact("table@build", b) + + // Commit. 
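+	// Record the added and deleted tables in the manifest; compactionCommit
+	// retries the commit via the compaction transact machinery.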
+ stats[1].startTimer() + db.compactionCommit("table", rec) + stats[1].stopTimer() + + resultSize := int(stats[1].write) + db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration) + + // Save compaction stats + for i := range stats { + db.compStats.addStat(c.sourceLevel+1, &stats[i]) + } +} + +func (db *DB) tableRangeCompaction(level int, umin, umax []byte) error { + db.logf("table@compaction range L%d %q:%q", level, umin, umax) + if level >= 0 { + if c := db.s.getCompactionRange(level, umin, umax, true); c != nil { + db.tableCompaction(c, true) + } + } else { + // Retry until nothing to compact. + for { + compacted := false + + // Scan for maximum level with overlapped tables. + v := db.s.version() + m := 1 + for i := m; i < len(v.levels); i++ { + tables := v.levels[i] + if tables.overlaps(db.s.icmp, umin, umax, false) { + m = i + } + } + v.release() + + for level := 0; level < m; level++ { + if c := db.s.getCompactionRange(level, umin, umax, false); c != nil { + db.tableCompaction(c, true) + compacted = true + } + } + + if !compacted { + break + } + } + } + + return nil +} + +func (db *DB) tableAutoCompaction() { + if c := db.s.pickCompaction(); c != nil { + db.tableCompaction(c, false) + } +} + +func (db *DB) tableNeedCompaction() bool { + v := db.s.version() + defer v.release() + return v.needCompaction() +} + +// resumeWrite returns an indicator whether we should resume write operation if enough level0 files are compacted. +func (db *DB) resumeWrite() bool { + v := db.s.version() + defer v.release() + if v.tLen(0) < db.s.o.GetWriteL0PauseTrigger() { + return true + } + return false +} + +func (db *DB) pauseCompaction(ch chan<- struct{}) { + select { + case ch <- struct{}{}: + case <-db.closeC: + db.compactionExitTransact() + } +} + +type cCmd interface { + ack(err error) +} + +type cAuto struct { + // Note for table compaction, an non-empty ackC represents it's a compaction waiting command. + ackC chan<- error +} + +func (r cAuto) ack(err error) { + if r.ackC != nil { + defer func() { + recover() + }() + r.ackC <- err + } +} + +type cRange struct { + level int + min, max []byte + ackC chan<- error +} + +func (r cRange) ack(err error) { + if r.ackC != nil { + defer func() { + recover() + }() + r.ackC <- err + } +} + +// This will trigger auto compaction but will not wait for it. +func (db *DB) compTrigger(compC chan<- cCmd) { + select { + case compC <- cAuto{}: + default: + } +} + +// This will trigger auto compaction and/or wait for all compaction to be done. +func (db *DB) compTriggerWait(compC chan<- cCmd) (err error) { + ch := make(chan error) + defer close(ch) + // Send cmd. + select { + case compC <- cAuto{ch}: + case err = <-db.compErrC: + return + case <-db.closeC: + return ErrClosed + } + // Wait cmd. + select { + case err = <-ch: + case err = <-db.compErrC: + case <-db.closeC: + return ErrClosed + } + return err +} + +// Send range compaction request. +func (db *DB) compTriggerRange(compC chan<- cCmd, level int, min, max []byte) (err error) { + ch := make(chan error) + defer close(ch) + // Send cmd. + select { + case compC <- cRange{level, min, max, ch}: + case err := <-db.compErrC: + return err + case <-db.closeC: + return ErrClosed + } + // Wait cmd. 
+ select { + case err = <-ch: + case err = <-db.compErrC: + case <-db.closeC: + return ErrClosed + } + return err +} + +func (db *DB) mCompaction() { + var x cCmd + + defer func() { + if x := recover(); x != nil { + if x != errCompactionTransactExiting { + panic(x) + } + } + if x != nil { + x.ack(ErrClosed) + } + db.closeW.Done() + }() + + for { + select { + case x = <-db.mcompCmdC: + switch x.(type) { + case cAuto: + db.memCompaction() + x.ack(nil) + x = nil + default: + panic("leveldb: unknown command") + } + case <-db.closeC: + return + } + } +} + +func (db *DB) tCompaction() { + var ( + x cCmd + waitQ []cCmd + ) + + defer func() { + if x := recover(); x != nil { + if x != errCompactionTransactExiting { + panic(x) + } + } + for i := range waitQ { + waitQ[i].ack(ErrClosed) + waitQ[i] = nil + } + if x != nil { + x.ack(ErrClosed) + } + db.closeW.Done() + }() + + for { + if db.tableNeedCompaction() { + select { + case x = <-db.tcompCmdC: + case ch := <-db.tcompPauseC: + db.pauseCompaction(ch) + continue + case <-db.closeC: + return + default: + } + // Resume write operation as soon as possible. + if len(waitQ) > 0 && db.resumeWrite() { + for i := range waitQ { + waitQ[i].ack(nil) + waitQ[i] = nil + } + waitQ = waitQ[:0] + } + } else { + for i := range waitQ { + waitQ[i].ack(nil) + waitQ[i] = nil + } + waitQ = waitQ[:0] + select { + case x = <-db.tcompCmdC: + case ch := <-db.tcompPauseC: + db.pauseCompaction(ch) + continue + case <-db.closeC: + return + } + } + if x != nil { + switch cmd := x.(type) { + case cAuto: + if cmd.ackC != nil { + // Check the write pause state before caching it. + if db.resumeWrite() { + x.ack(nil) + } else { + waitQ = append(waitQ, x) + } + } + case cRange: + x.ack(db.tableRangeCompaction(cmd.level, cmd.min, cmd.max)) + default: + panic("leveldb: unknown command") + } + x = nil + } + db.tableAutoCompaction() + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_iter.go new file mode 100644 index 000000000000..03c24cdab50d --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_iter.go @@ -0,0 +1,360 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package leveldb + +import ( + "errors" + "math/rand" + "runtime" + "sync" + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +var ( + errInvalidInternalKey = errors.New("leveldb: Iterator: invalid internal key") +) + +type memdbReleaser struct { + once sync.Once + m *memDB +} + +func (mr *memdbReleaser) Release() { + mr.once.Do(func() { + mr.m.decref() + }) +} + +func (db *DB) newRawIterator(auxm *memDB, auxt tFiles, slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader) + em, fm := db.getMems() + v := db.s.version() + + tableIts := v.getIterators(slice, ro) + n := len(tableIts) + len(auxt) + 3 + its := make([]iterator.Iterator, 0, n) + + if auxm != nil { + ami := auxm.NewIterator(slice) + ami.SetReleaser(&memdbReleaser{m: auxm}) + its = append(its, ami) + } + for _, t := range auxt { + its = append(its, v.s.tops.newIterator(t, slice, ro)) + } + + emi := em.NewIterator(slice) + emi.SetReleaser(&memdbReleaser{m: em}) + its = append(its, emi) + if fm != nil { + fmi := fm.NewIterator(slice) + fmi.SetReleaser(&memdbReleaser{m: fm}) + its = append(its, fmi) + } + its = append(its, tableIts...) + mi := iterator.NewMergedIterator(its, db.s.icmp, strict) + mi.SetReleaser(&versionReleaser{v: v}) + return mi +} + +func (db *DB) newIterator(auxm *memDB, auxt tFiles, seq uint64, slice *util.Range, ro *opt.ReadOptions) *dbIter { + var islice *util.Range + if slice != nil { + islice = &util.Range{} + if slice.Start != nil { + islice.Start = makeInternalKey(nil, slice.Start, keyMaxSeq, keyTypeSeek) + } + if slice.Limit != nil { + islice.Limit = makeInternalKey(nil, slice.Limit, keyMaxSeq, keyTypeSeek) + } + } + rawIter := db.newRawIterator(auxm, auxt, islice, ro) + iter := &dbIter{ + db: db, + icmp: db.s.icmp, + iter: rawIter, + seq: seq, + strict: opt.GetStrict(db.s.o.Options, ro, opt.StrictReader), + key: make([]byte, 0), + value: make([]byte, 0), + } + atomic.AddInt32(&db.aliveIters, 1) + runtime.SetFinalizer(iter, (*dbIter).Release) + return iter +} + +func (db *DB) iterSamplingRate() int { + return rand.Intn(2 * db.s.o.GetIteratorSamplingRate()) +} + +type dir int + +const ( + dirReleased dir = iota - 1 + dirSOI + dirEOI + dirBackward + dirForward +) + +// dbIter represent an interator states over a database session. 
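+// It translates internal keys (user key, sequence number, key type) into the
+// user-visible key/value stream: entries newer than the iterator's sequence
+// number are ignored and deletion markers hide older versions of a key.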
+type dbIter struct { + db *DB + icmp *iComparer + iter iterator.Iterator + seq uint64 + strict bool + + smaplingGap int + dir dir + key []byte + value []byte + err error + releaser util.Releaser +} + +func (i *dbIter) sampleSeek() { + ikey := i.iter.Key() + i.smaplingGap -= len(ikey) + len(i.iter.Value()) + for i.smaplingGap < 0 { + i.smaplingGap += i.db.iterSamplingRate() + i.db.sampleSeek(ikey) + } +} + +func (i *dbIter) setErr(err error) { + i.err = err + i.key = nil + i.value = nil +} + +func (i *dbIter) iterErr() { + if err := i.iter.Error(); err != nil { + i.setErr(err) + } +} + +func (i *dbIter) Valid() bool { + return i.err == nil && i.dir > dirEOI +} + +func (i *dbIter) First() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.iter.First() { + i.dir = dirSOI + return i.next() + } + i.dir = dirEOI + i.iterErr() + return false +} + +func (i *dbIter) Last() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.iter.Last() { + return i.prev() + } + i.dir = dirSOI + i.iterErr() + return false +} + +func (i *dbIter) Seek(key []byte) bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + ikey := makeInternalKey(nil, key, i.seq, keyTypeSeek) + if i.iter.Seek(ikey) { + i.dir = dirSOI + return i.next() + } + i.dir = dirEOI + i.iterErr() + return false +} + +func (i *dbIter) next() bool { + for { + if ukey, seq, kt, kerr := parseInternalKey(i.iter.Key()); kerr == nil { + i.sampleSeek() + if seq <= i.seq { + switch kt { + case keyTypeDel: + // Skip deleted key. + i.key = append(i.key[:0], ukey...) + i.dir = dirForward + case keyTypeVal: + if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 { + i.key = append(i.key[:0], ukey...) + i.value = append(i.value[:0], i.iter.Value()...) + i.dir = dirForward + return true + } + } + } + } else if i.strict { + i.setErr(kerr) + break + } + if !i.iter.Next() { + i.dir = dirEOI + i.iterErr() + break + } + } + return false +} + +func (i *dbIter) Next() bool { + if i.dir == dirEOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if !i.iter.Next() || (i.dir == dirBackward && !i.iter.Next()) { + i.dir = dirEOI + i.iterErr() + return false + } + return i.next() +} + +func (i *dbIter) prev() bool { + i.dir = dirBackward + del := true + if i.iter.Valid() { + for { + if ukey, seq, kt, kerr := parseInternalKey(i.iter.Key()); kerr == nil { + i.sampleSeek() + if seq <= i.seq { + if !del && i.icmp.uCompare(ukey, i.key) < 0 { + return true + } + del = (kt == keyTypeDel) + if !del { + i.key = append(i.key[:0], ukey...) + i.value = append(i.value[:0], i.iter.Value()...) 
+ } + } + } else if i.strict { + i.setErr(kerr) + return false + } + if !i.iter.Prev() { + break + } + } + } + if del { + i.dir = dirSOI + i.iterErr() + return false + } + return true +} + +func (i *dbIter) Prev() bool { + if i.dir == dirSOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + switch i.dir { + case dirEOI: + return i.Last() + case dirForward: + for i.iter.Prev() { + if ukey, _, _, kerr := parseInternalKey(i.iter.Key()); kerr == nil { + i.sampleSeek() + if i.icmp.uCompare(ukey, i.key) < 0 { + goto cont + } + } else if i.strict { + i.setErr(kerr) + return false + } + } + i.dir = dirSOI + i.iterErr() + return false + } + +cont: + return i.prev() +} + +func (i *dbIter) Key() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.key +} + +func (i *dbIter) Value() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.value +} + +func (i *dbIter) Release() { + if i.dir != dirReleased { + // Clear the finalizer. + runtime.SetFinalizer(i, nil) + + if i.releaser != nil { + i.releaser.Release() + i.releaser = nil + } + + i.dir = dirReleased + i.key = nil + i.value = nil + i.iter.Release() + i.iter = nil + atomic.AddInt32(&i.db.aliveIters, -1) + i.db = nil + } +} + +func (i *dbIter) SetReleaser(releaser util.Releaser) { + if i.dir == dirReleased { + panic(util.ErrReleased) + } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) + } + i.releaser = releaser +} + +func (i *dbIter) Error() error { + return i.err +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_snapshot.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_snapshot.go new file mode 100644 index 000000000000..2c69d2e531d5 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_snapshot.go @@ -0,0 +1,183 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "container/list" + "fmt" + "runtime" + "sync" + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +type snapshotElement struct { + seq uint64 + ref int + e *list.Element +} + +// Acquires a snapshot, based on latest sequence. +func (db *DB) acquireSnapshot() *snapshotElement { + db.snapsMu.Lock() + defer db.snapsMu.Unlock() + + seq := db.getSeq() + + if e := db.snapsList.Back(); e != nil { + se := e.Value.(*snapshotElement) + if se.seq == seq { + se.ref++ + return se + } else if seq < se.seq { + panic("leveldb: sequence number is not increasing") + } + } + se := &snapshotElement{seq: seq, ref: 1} + se.e = db.snapsList.PushBack(se) + return se +} + +// Releases given snapshot element. +func (db *DB) releaseSnapshot(se *snapshotElement) { + db.snapsMu.Lock() + defer db.snapsMu.Unlock() + + se.ref-- + if se.ref == 0 { + db.snapsList.Remove(se.e) + se.e = nil + } else if se.ref < 0 { + panic("leveldb: Snapshot: negative element reference") + } +} + +// Gets minimum sequence that not being snapshotted. +func (db *DB) minSeq() uint64 { + db.snapsMu.Lock() + defer db.snapsMu.Unlock() + + if e := db.snapsList.Front(); e != nil { + return e.Value.(*snapshotElement).seq + } + + return db.getSeq() +} + +// Snapshot is a DB snapshot. +type Snapshot struct { + db *DB + elem *snapshotElement + mu sync.RWMutex + released bool +} + +// Creates new snapshot object. 
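+// The snapshot pins the current sequence number via acquireSnapshot and
+// registers a finalizer so a leaked snapshot is eventually released.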
+func (db *DB) newSnapshot() *Snapshot { + snap := &Snapshot{ + db: db, + elem: db.acquireSnapshot(), + } + atomic.AddInt32(&db.aliveSnaps, 1) + runtime.SetFinalizer(snap, (*Snapshot).Release) + return snap +} + +func (snap *Snapshot) String() string { + return fmt.Sprintf("leveldb.Snapshot{%d}", snap.elem.seq) +} + +// Get gets the value for the given key. It returns ErrNotFound if +// the DB does not contains the key. +// +// The caller should not modify the contents of the returned slice, but +// it is safe to modify the contents of the argument after Get returns. +func (snap *Snapshot) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + err = snap.db.ok() + if err != nil { + return + } + snap.mu.RLock() + defer snap.mu.RUnlock() + if snap.released { + err = ErrSnapshotReleased + return + } + return snap.db.get(nil, nil, key, snap.elem.seq, ro) +} + +// Has returns true if the DB does contains the given key. +// +// It is safe to modify the contents of the argument after Get returns. +func (snap *Snapshot) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) { + err = snap.db.ok() + if err != nil { + return + } + snap.mu.RLock() + defer snap.mu.RUnlock() + if snap.released { + err = ErrSnapshotReleased + return + } + return snap.db.has(nil, nil, key, snap.elem.seq, ro) +} + +// NewIterator returns an iterator for the snapshot of the underlying DB. +// The returned iterator is not safe for concurrent use, but it is safe to use +// multiple iterators concurrently, with each in a dedicated goroutine. +// It is also safe to use an iterator concurrently with modifying its +// underlying DB. The resultant key/value pairs are guaranteed to be +// consistent. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// DB. And a nil Range.Limit is treated as a key after all keys in +// the DB. +// +// The iterator must be released after use, by calling Release method. +// Releasing the snapshot doesn't mean releasing the iterator too, the +// iterator would be still valid until released. +// +// Also read Iterator documentation of the leveldb/iterator package. +func (snap *Snapshot) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + if err := snap.db.ok(); err != nil { + return iterator.NewEmptyIterator(err) + } + snap.mu.Lock() + defer snap.mu.Unlock() + if snap.released { + return iterator.NewEmptyIterator(ErrSnapshotReleased) + } + // Since iterator already hold version ref, it doesn't need to + // hold snapshot ref. + return snap.db.newIterator(nil, nil, snap.elem.seq, slice, ro) +} + +// Release releases the snapshot. This will not release any returned +// iterators, the iterators would still be valid until released or the +// underlying DB is closed. +// +// Other methods should not be called after the snapshot has been released. +func (snap *Snapshot) Release() { + snap.mu.Lock() + defer snap.mu.Unlock() + + if !snap.released { + // Clear the finalizer. 
+ runtime.SetFinalizer(snap, nil) + + snap.released = true + snap.db.releaseSnapshot(snap.elem) + atomic.AddInt32(&snap.db.aliveSnaps, -1) + snap.db = nil + snap.elem = nil + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_state.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_state.go new file mode 100644 index 000000000000..65e1c54bb41b --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_state.go @@ -0,0 +1,239 @@ +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "errors" + "sync/atomic" + "time" + + "github.com/syndtr/goleveldb/leveldb/journal" + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +var ( + errHasFrozenMem = errors.New("has frozen mem") +) + +type memDB struct { + db *DB + *memdb.DB + ref int32 +} + +func (m *memDB) getref() int32 { + return atomic.LoadInt32(&m.ref) +} + +func (m *memDB) incref() { + atomic.AddInt32(&m.ref, 1) +} + +func (m *memDB) decref() { + if ref := atomic.AddInt32(&m.ref, -1); ref == 0 { + // Only put back memdb with std capacity. + if m.Capacity() == m.db.s.o.GetWriteBuffer() { + m.Reset() + m.db.mpoolPut(m.DB) + } + m.db = nil + m.DB = nil + } else if ref < 0 { + panic("negative memdb ref") + } +} + +// Get latest sequence number. +func (db *DB) getSeq() uint64 { + return atomic.LoadUint64(&db.seq) +} + +// Atomically adds delta to seq. +func (db *DB) addSeq(delta uint64) { + atomic.AddUint64(&db.seq, delta) +} + +func (db *DB) setSeq(seq uint64) { + atomic.StoreUint64(&db.seq, seq) +} + +func (db *DB) sampleSeek(ikey internalKey) { + v := db.s.version() + if v.sampleSeek(ikey) { + // Trigger table compaction. + db.compTrigger(db.tcompCmdC) + } + v.release() +} + +func (db *DB) mpoolPut(mem *memdb.DB) { + if !db.isClosed() { + select { + case db.memPool <- mem: + default: + } + } +} + +func (db *DB) mpoolGet(n int) *memDB { + var mdb *memdb.DB + select { + case mdb = <-db.memPool: + default: + } + if mdb == nil || mdb.Capacity() < n { + mdb = memdb.New(db.s.icmp, maxInt(db.s.o.GetWriteBuffer(), n)) + } + return &memDB{ + db: db, + DB: mdb, + } +} + +func (db *DB) mpoolDrain() { + ticker := time.NewTicker(30 * time.Second) + for { + select { + case <-ticker.C: + select { + case <-db.memPool: + default: + } + case <-db.closeC: + ticker.Stop() + // Make sure the pool is drained. + select { + case <-db.memPool: + case <-time.After(time.Second): + } + close(db.memPool) + return + } + } +} + +// Create new memdb and froze the old one; need external synchronization. +// newMem only called synchronously by the writer. +func (db *DB) newMem(n int) (mem *memDB, err error) { + fd := storage.FileDesc{Type: storage.TypeJournal, Num: db.s.allocFileNum()} + w, err := db.s.stor.Create(fd) + if err != nil { + db.s.reuseFileNum(fd.Num) + return + } + + db.memMu.Lock() + defer db.memMu.Unlock() + + if db.frozenMem != nil { + return nil, errHasFrozenMem + } + + if db.journal == nil { + db.journal = journal.NewWriter(w) + } else { + db.journal.Reset(w) + db.journalWriter.Close() + db.frozenJournalFd = db.journalFd + } + db.journalWriter = w + db.journalFd = fd + db.frozenMem = db.mem + mem = db.mpoolGet(n) + mem.incref() // for self + mem.incref() // for caller + db.mem = mem + // The seq only incremented by the writer. And whoever called newMem + // should hold write lock, so no need additional synchronization here. 
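+	// frozenSeq marks the last sequence number covered by the frozen memdb;
+	// memCompaction later records it in the manifest when the flush commits.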
+ db.frozenSeq = db.seq + return +} + +// Get all memdbs. +func (db *DB) getMems() (e, f *memDB) { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.mem != nil { + db.mem.incref() + } else if !db.isClosed() { + panic("nil effective mem") + } + if db.frozenMem != nil { + db.frozenMem.incref() + } + return db.mem, db.frozenMem +} + +// Get effective memdb. +func (db *DB) getEffectiveMem() *memDB { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.mem != nil { + db.mem.incref() + } else if !db.isClosed() { + panic("nil effective mem") + } + return db.mem +} + +// Check whether we has frozen memdb. +func (db *DB) hasFrozenMem() bool { + db.memMu.RLock() + defer db.memMu.RUnlock() + return db.frozenMem != nil +} + +// Get frozen memdb. +func (db *DB) getFrozenMem() *memDB { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.frozenMem != nil { + db.frozenMem.incref() + } + return db.frozenMem +} + +// Drop frozen memdb; assume that frozen memdb isn't nil. +func (db *DB) dropFrozenMem() { + db.memMu.Lock() + if err := db.s.stor.Remove(db.frozenJournalFd); err != nil { + db.logf("journal@remove removing @%d %q", db.frozenJournalFd.Num, err) + } else { + db.logf("journal@remove removed @%d", db.frozenJournalFd.Num) + } + db.frozenJournalFd = storage.FileDesc{} + db.frozenMem.decref() + db.frozenMem = nil + db.memMu.Unlock() +} + +// Clear mems ptr; used by DB.Close(). +func (db *DB) clearMems() { + db.memMu.Lock() + db.mem = nil + db.frozenMem = nil + db.memMu.Unlock() +} + +// Set closed flag; return true if not already closed. +func (db *DB) setClosed() bool { + return atomic.CompareAndSwapUint32(&db.closed, 0, 1) +} + +// Check whether DB was closed. +func (db *DB) isClosed() bool { + return atomic.LoadUint32(&db.closed) != 0 +} + +// Check read ok status. +func (db *DB) ok() error { + if db.isClosed() { + return ErrClosed + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_transaction.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_transaction.go new file mode 100644 index 000000000000..b8f7e7d21dfa --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_transaction.go @@ -0,0 +1,325 @@ +// Copyright (c) 2016, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "errors" + "sync" + "time" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +var errTransactionDone = errors.New("leveldb: transaction already closed") + +// Transaction is the transaction handle. +type Transaction struct { + db *DB + lk sync.RWMutex + seq uint64 + mem *memDB + tables tFiles + ikScratch []byte + rec sessionRecord + stats cStatStaging + closed bool +} + +// Get gets the value for the given key. It returns ErrNotFound if the +// DB does not contains the key. +// +// The returned slice is its own copy, it is safe to modify the contents +// of the returned slice. +// It is safe to modify the contents of the argument after Get returns. +func (tr *Transaction) Get(key []byte, ro *opt.ReadOptions) ([]byte, error) { + tr.lk.RLock() + defer tr.lk.RUnlock() + if tr.closed { + return nil, errTransactionDone + } + return tr.db.get(tr.mem.DB, tr.tables, key, tr.seq, ro) +} + +// Has returns true if the DB does contains the given key. +// +// It is safe to modify the contents of the argument after Has returns. 
+func (tr *Transaction) Has(key []byte, ro *opt.ReadOptions) (bool, error) { + tr.lk.RLock() + defer tr.lk.RUnlock() + if tr.closed { + return false, errTransactionDone + } + return tr.db.has(tr.mem.DB, tr.tables, key, tr.seq, ro) +} + +// NewIterator returns an iterator for the latest snapshot of the transaction. +// The returned iterator is not safe for concurrent use, but it is safe to use +// multiple iterators concurrently, with each in a dedicated goroutine. +// It is also safe to use an iterator concurrently while writes to the +// transaction. The resultant key/value pairs are guaranteed to be consistent. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// DB. And a nil Range.Limit is treated as a key after all keys in +// the DB. +// +// The iterator must be released after use, by calling Release method. +// +// Also read Iterator documentation of the leveldb/iterator package. +func (tr *Transaction) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + tr.lk.RLock() + defer tr.lk.RUnlock() + if tr.closed { + return iterator.NewEmptyIterator(errTransactionDone) + } + tr.mem.incref() + return tr.db.newIterator(tr.mem, tr.tables, tr.seq, slice, ro) +} + +func (tr *Transaction) flush() error { + // Flush memdb. + if tr.mem.Len() != 0 { + tr.stats.startTimer() + iter := tr.mem.NewIterator(nil) + t, n, err := tr.db.s.tops.createFrom(iter) + iter.Release() + tr.stats.stopTimer() + if err != nil { + return err + } + if tr.mem.getref() == 1 { + tr.mem.Reset() + } else { + tr.mem.decref() + tr.mem = tr.db.mpoolGet(0) + tr.mem.incref() + } + tr.tables = append(tr.tables, t) + tr.rec.addTableFile(0, t) + tr.stats.write += t.size + tr.db.logf("transaction@flush created L0@%d N·%d S·%s %q:%q", t.fd.Num, n, shortenb(int(t.size)), t.imin, t.imax) + } + return nil +} + +func (tr *Transaction) put(kt keyType, key, value []byte) error { + tr.ikScratch = makeInternalKey(tr.ikScratch, key, tr.seq+1, kt) + if tr.mem.Free() < len(tr.ikScratch)+len(value) { + if err := tr.flush(); err != nil { + return err + } + } + if err := tr.mem.Put(tr.ikScratch, value); err != nil { + return err + } + tr.seq++ + return nil +} + +// Put sets the value for the given key. It overwrites any previous value +// for that key; a DB is not a multi-map. +// Please note that the transaction is not compacted until committed, so if you +// writes 10 same keys, then those 10 same keys are in the transaction. +// +// It is safe to modify the contents of the arguments after Put returns. +func (tr *Transaction) Put(key, value []byte, wo *opt.WriteOptions) error { + tr.lk.Lock() + defer tr.lk.Unlock() + if tr.closed { + return errTransactionDone + } + return tr.put(keyTypeVal, key, value) +} + +// Delete deletes the value for the given key. +// Please note that the transaction is not compacted until committed, so if you +// writes 10 same keys, then those 10 same keys are in the transaction. +// +// It is safe to modify the contents of the arguments after Delete returns. +func (tr *Transaction) Delete(key []byte, wo *opt.WriteOptions) error { + tr.lk.Lock() + defer tr.lk.Unlock() + if tr.closed { + return errTransactionDone + } + return tr.put(keyTypeDel, key, nil) +} + +// Write apply the given batch to the transaction. The batch will be applied +// sequentially. 
+// Please note that the transaction is not compacted until committed, so if you +// writes 10 same keys, then those 10 same keys are in the transaction. +// +// It is safe to modify the contents of the arguments after Write returns. +func (tr *Transaction) Write(b *Batch, wo *opt.WriteOptions) error { + if b == nil || b.Len() == 0 { + return nil + } + + tr.lk.Lock() + defer tr.lk.Unlock() + if tr.closed { + return errTransactionDone + } + return b.replayInternal(func(i int, kt keyType, k, v []byte) error { + return tr.put(kt, k, v) + }) +} + +func (tr *Transaction) setDone() { + tr.closed = true + tr.db.tr = nil + tr.mem.decref() + <-tr.db.writeLockC +} + +// Commit commits the transaction. If error is not nil, then the transaction is +// not committed, it can then either be retried or discarded. +// +// Other methods should not be called after transaction has been committed. +func (tr *Transaction) Commit() error { + if err := tr.db.ok(); err != nil { + return err + } + + tr.lk.Lock() + defer tr.lk.Unlock() + if tr.closed { + return errTransactionDone + } + if err := tr.flush(); err != nil { + // Return error, lets user decide either to retry or discard + // transaction. + return err + } + if len(tr.tables) != 0 { + // Committing transaction. + tr.rec.setSeqNum(tr.seq) + tr.db.compCommitLk.Lock() + tr.stats.startTimer() + var cerr error + for retry := 0; retry < 3; retry++ { + cerr = tr.db.s.commit(&tr.rec) + if cerr != nil { + tr.db.logf("transaction@commit error R·%d %q", retry, cerr) + select { + case <-time.After(time.Second): + case <-tr.db.closeC: + tr.db.logf("transaction@commit exiting") + tr.db.compCommitLk.Unlock() + return cerr + } + } else { + // Success. Set db.seq. + tr.db.setSeq(tr.seq) + break + } + } + tr.stats.stopTimer() + if cerr != nil { + // Return error, lets user decide either to retry or discard + // transaction. + return cerr + } + + // Update compaction stats. This is safe as long as we hold compCommitLk. + tr.db.compStats.addStat(0, &tr.stats) + + // Trigger table auto-compaction. + tr.db.compTrigger(tr.db.tcompCmdC) + tr.db.compCommitLk.Unlock() + + // Additionally, wait compaction when certain threshold reached. + // Ignore error, returns error only if transaction can't be committed. + tr.db.waitCompaction() + } + // Only mark as done if transaction committed successfully. + tr.setDone() + return nil +} + +func (tr *Transaction) discard() { + // Discard transaction. + for _, t := range tr.tables { + tr.db.logf("transaction@discard @%d", t.fd.Num) + if err1 := tr.db.s.stor.Remove(t.fd); err1 == nil { + tr.db.s.reuseFileNum(t.fd.Num) + } + } +} + +// Discard discards the transaction. +// +// Other methods should not be called after transaction has been discarded. +func (tr *Transaction) Discard() { + tr.lk.Lock() + if !tr.closed { + tr.discard() + tr.setDone() + } + tr.lk.Unlock() +} + +func (db *DB) waitCompaction() error { + if db.s.tLen(0) >= db.s.o.GetWriteL0PauseTrigger() { + return db.compTriggerWait(db.tcompCmdC) + } + return nil +} + +// OpenTransaction opens an atomic DB transaction. Only one transaction can be +// opened at a time. Subsequent call to Write and OpenTransaction will be blocked +// until in-flight transaction is committed or discarded. +// The returned transaction handle is safe for concurrent use. +// +// Transaction is expensive and can overwhelm compaction, especially if +// transaction size is small. Use with caution. +// +// The transaction must be closed once done, either by committing or discarding +// the transaction. 
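+// For illustration only, a sketch of the intended lifecycle, assuming an open
+// *DB named db (hypothetical key and value, abbreviated error handling):
+//
+//	tr, err := db.OpenTransaction()
+//	if err != nil {
+//		return err
+//	}
+//	if err := tr.Put([]byte("k"), []byte("v"), nil); err != nil {
+//		tr.Discard()
+//		return err
+//	}
+//	return tr.Commit()
+//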
+// Closing the DB will discard open transaction. +func (db *DB) OpenTransaction() (*Transaction, error) { + if err := db.ok(); err != nil { + return nil, err + } + + // The write happen synchronously. + select { + case db.writeLockC <- struct{}{}: + case err := <-db.compPerErrC: + return nil, err + case <-db.closeC: + return nil, ErrClosed + } + + if db.tr != nil { + panic("leveldb: has open transaction") + } + + // Flush current memdb. + if db.mem != nil && db.mem.Len() != 0 { + if _, err := db.rotateMem(0, true); err != nil { + return nil, err + } + } + + // Wait compaction when certain threshold reached. + if err := db.waitCompaction(); err != nil { + return nil, err + } + + tr := &Transaction{ + db: db, + seq: db.seq, + mem: db.mpoolGet(0), + } + tr.mem.incref() + db.tr = tr + return tr, nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_util.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_util.go new file mode 100644 index 000000000000..3f0654894b44 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_util.go @@ -0,0 +1,102 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Reader is the interface that wraps basic Get and NewIterator methods. +// This interface implemented by both DB and Snapshot. +type Reader interface { + Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) + NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator +} + +// Sizes is list of size. +type Sizes []int64 + +// Sum returns sum of the sizes. +func (sizes Sizes) Sum() int64 { + var sum int64 + for _, size := range sizes { + sum += size + } + return sum +} + +// Logging. +func (db *DB) log(v ...interface{}) { db.s.log(v...) } +func (db *DB) logf(format string, v ...interface{}) { db.s.logf(format, v...) } + +// Check and clean files. 
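+// For illustration only, the Reader interface above lets read-only helpers
+// accept either a *DB or a *Snapshot (sketch; the helper name and prefix are
+// hypothetical, and it assumes the leveldb/util package):
+//
+//	func countPrefix(r leveldb.Reader, prefix []byte) (n int) {
+//		it := r.NewIterator(util.BytesPrefix(prefix), nil)
+//		defer it.Release()
+//		for it.Next() {
+//			n++
+//		}
+//		return n
+//	}
+//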
+func (db *DB) checkAndCleanFiles() error { + v := db.s.version() + defer v.release() + + tmap := make(map[int64]bool) + for _, tables := range v.levels { + for _, t := range tables { + tmap[t.fd.Num] = false + } + } + + fds, err := db.s.stor.List(storage.TypeAll) + if err != nil { + return err + } + + var nt int + var rem []storage.FileDesc + for _, fd := range fds { + keep := true + switch fd.Type { + case storage.TypeManifest: + keep = fd.Num >= db.s.manifestFd.Num + case storage.TypeJournal: + if !db.frozenJournalFd.Zero() { + keep = fd.Num >= db.frozenJournalFd.Num + } else { + keep = fd.Num >= db.journalFd.Num + } + case storage.TypeTable: + _, keep = tmap[fd.Num] + if keep { + tmap[fd.Num] = true + nt++ + } + } + + if !keep { + rem = append(rem, fd) + } + } + + if nt != len(tmap) { + var mfds []storage.FileDesc + for num, present := range tmap { + if !present { + mfds = append(mfds, storage.FileDesc{Type: storage.TypeTable, Num: num}) + db.logf("db@janitor table missing @%d", num) + } + } + return errors.NewErrCorrupted(storage.FileDesc{}, &errors.ErrMissingFiles{Fds: mfds}) + } + + db.logf("db@janitor F·%d G·%d", len(fds), len(rem)) + for _, fd := range rem { + db.logf("db@janitor removing %s-%d", fd.Type, fd.Num) + if err := db.s.stor.Remove(fd); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_write.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_write.go new file mode 100644 index 000000000000..db0c1bece1db --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_write.go @@ -0,0 +1,464 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "sync/atomic" + "time" + + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +func (db *DB) writeJournal(batches []*Batch, seq uint64, sync bool) error { + wr, err := db.journal.Next() + if err != nil { + return err + } + if err := writeBatchesWithHeader(wr, batches, seq); err != nil { + return err + } + if err := db.journal.Flush(); err != nil { + return err + } + if sync { + return db.journalWriter.Sync() + } + return nil +} + +func (db *DB) rotateMem(n int, wait bool) (mem *memDB, err error) { + retryLimit := 3 +retry: + // Wait for pending memdb compaction. + err = db.compTriggerWait(db.mcompCmdC) + if err != nil { + return + } + retryLimit-- + + // Create new memdb and journal. + mem, err = db.newMem(n) + if err != nil { + if err == errHasFrozenMem { + if retryLimit <= 0 { + panic("BUG: still has frozen memdb") + } + goto retry + } + return + } + + // Schedule memdb compaction. 
+ if wait { + err = db.compTriggerWait(db.mcompCmdC) + } else { + db.compTrigger(db.mcompCmdC) + } + return +} + +func (db *DB) flush(n int) (mdb *memDB, mdbFree int, err error) { + delayed := false + slowdownTrigger := db.s.o.GetWriteL0SlowdownTrigger() + pauseTrigger := db.s.o.GetWriteL0PauseTrigger() + flush := func() (retry bool) { + mdb = db.getEffectiveMem() + if mdb == nil { + err = ErrClosed + return false + } + defer func() { + if retry { + mdb.decref() + mdb = nil + } + }() + tLen := db.s.tLen(0) + mdbFree = mdb.Free() + switch { + case tLen >= slowdownTrigger && !delayed: + delayed = true + time.Sleep(time.Millisecond) + case mdbFree >= n: + return false + case tLen >= pauseTrigger: + delayed = true + // Set the write paused flag explicitly. + atomic.StoreInt32(&db.inWritePaused, 1) + err = db.compTriggerWait(db.tcompCmdC) + // Unset the write paused flag. + atomic.StoreInt32(&db.inWritePaused, 0) + if err != nil { + return false + } + default: + // Allow memdb to grow if it has no entry. + if mdb.Len() == 0 { + mdbFree = n + } else { + mdb.decref() + mdb, err = db.rotateMem(n, false) + if err == nil { + mdbFree = mdb.Free() + } else { + mdbFree = 0 + } + } + return false + } + return true + } + start := time.Now() + for flush() { + } + if delayed { + db.writeDelay += time.Since(start) + db.writeDelayN++ + } else if db.writeDelayN > 0 { + db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) + atomic.AddInt32(&db.cWriteDelayN, int32(db.writeDelayN)) + atomic.AddInt64(&db.cWriteDelay, int64(db.writeDelay)) + db.writeDelay = 0 + db.writeDelayN = 0 + } + return +} + +type writeMerge struct { + sync bool + batch *Batch + keyType keyType + key, value []byte +} + +func (db *DB) unlockWrite(overflow bool, merged int, err error) { + for i := 0; i < merged; i++ { + db.writeAckC <- err + } + if overflow { + // Pass lock to the next write (that failed to merge). + db.writeMergedC <- false + } else { + // Release lock. + <-db.writeLockC + } +} + +// ourBatch is batch that we can modify. +func (db *DB) writeLocked(batch, ourBatch *Batch, merge, sync bool) error { + // Try to flush memdb. This method would also trying to throttle writes + // if it is too fast and compaction cannot catch-up. + mdb, mdbFree, err := db.flush(batch.internalLen) + if err != nil { + db.unlockWrite(false, 0, err) + return err + } + defer mdb.decref() + + var ( + overflow bool + merged int + batches = []*Batch{batch} + ) + + if merge { + // Merge limit. + var mergeLimit int + if batch.internalLen > 128<<10 { + mergeLimit = (1 << 20) - batch.internalLen + } else { + mergeLimit = 128 << 10 + } + mergeCap := mdbFree - batch.internalLen + if mergeLimit > mergeCap { + mergeLimit = mergeCap + } + + merge: + for mergeLimit > 0 { + select { + case incoming := <-db.writeMergeC: + if incoming.batch != nil { + // Merge batch. + if incoming.batch.internalLen > mergeLimit { + overflow = true + break merge + } + batches = append(batches, incoming.batch) + mergeLimit -= incoming.batch.internalLen + } else { + // Merge put. + internalLen := len(incoming.key) + len(incoming.value) + 8 + if internalLen > mergeLimit { + overflow = true + break merge + } + if ourBatch == nil { + ourBatch = db.batchPool.Get().(*Batch) + ourBatch.Reset() + batches = append(batches, ourBatch) + } + // We can use same batch since concurrent write doesn't + // guarantee write order. 
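+				// Note: internalLen above charges len(key)+len(value) plus an
+				// 8-byte trailer for the internal key (sequence number and key
+				// type), mirroring the accounting done by Batch.appendRec.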
+ ourBatch.appendRec(incoming.keyType, incoming.key, incoming.value) + mergeLimit -= internalLen + } + sync = sync || incoming.sync + merged++ + db.writeMergedC <- true + + default: + break merge + } + } + } + + // Release ourBatch if any. + if ourBatch != nil { + defer db.batchPool.Put(ourBatch) + } + + // Seq number. + seq := db.seq + 1 + + // Write journal. + if err := db.writeJournal(batches, seq, sync); err != nil { + db.unlockWrite(overflow, merged, err) + return err + } + + // Put batches. + for _, batch := range batches { + if err := batch.putMem(seq, mdb.DB); err != nil { + panic(err) + } + seq += uint64(batch.Len()) + } + + // Incr seq number. + db.addSeq(uint64(batchesLen(batches))) + + // Rotate memdb if it's reach the threshold. + if batch.internalLen >= mdbFree { + db.rotateMem(0, false) + } + + db.unlockWrite(overflow, merged, nil) + return nil +} + +// Write apply the given batch to the DB. The batch records will be applied +// sequentially. Write might be used concurrently, when used concurrently and +// batch is small enough, write will try to merge the batches. Set NoWriteMerge +// option to true to disable write merge. +// +// It is safe to modify the contents of the arguments after Write returns but +// not before. Write will not modify content of the batch. +func (db *DB) Write(batch *Batch, wo *opt.WriteOptions) error { + if err := db.ok(); err != nil || batch == nil || batch.Len() == 0 { + return err + } + + // If the batch size is larger than write buffer, it may justified to write + // using transaction instead. Using transaction the batch will be written + // into tables directly, skipping the journaling. + if batch.internalLen > db.s.o.GetWriteBuffer() && !db.s.o.GetDisableLargeBatchTransaction() { + tr, err := db.OpenTransaction() + if err != nil { + return err + } + if err := tr.Write(batch, wo); err != nil { + tr.Discard() + return err + } + return tr.Commit() + } + + merge := !wo.GetNoWriteMerge() && !db.s.o.GetNoWriteMerge() + sync := wo.GetSync() && !db.s.o.GetNoSync() + + // Acquire write lock. + if merge { + select { + case db.writeMergeC <- writeMerge{sync: sync, batch: batch}: + if <-db.writeMergedC { + // Write is merged. + return <-db.writeAckC + } + // Write is not merged, the write lock is handed to us. Continue. + case db.writeLockC <- struct{}{}: + // Write lock acquired. + case err := <-db.compPerErrC: + // Compaction error. + return err + case <-db.closeC: + // Closed + return ErrClosed + } + } else { + select { + case db.writeLockC <- struct{}{}: + // Write lock acquired. + case err := <-db.compPerErrC: + // Compaction error. + return err + case <-db.closeC: + // Closed + return ErrClosed + } + } + + return db.writeLocked(batch, nil, merge, sync) +} + +func (db *DB) putRec(kt keyType, key, value []byte, wo *opt.WriteOptions) error { + if err := db.ok(); err != nil { + return err + } + + merge := !wo.GetNoWriteMerge() && !db.s.o.GetNoWriteMerge() + sync := wo.GetSync() && !db.s.o.GetNoSync() + + // Acquire write lock. + if merge { + select { + case db.writeMergeC <- writeMerge{sync: sync, keyType: kt, key: key, value: value}: + if <-db.writeMergedC { + // Write is merged. + return <-db.writeAckC + } + // Write is not merged, the write lock is handed to us. Continue. + case db.writeLockC <- struct{}{}: + // Write lock acquired. + case err := <-db.compPerErrC: + // Compaction error. + return err + case <-db.closeC: + // Closed + return ErrClosed + } + } else { + select { + case db.writeLockC <- struct{}{}: + // Write lock acquired. 
+ case err := <-db.compPerErrC: + // Compaction error. + return err + case <-db.closeC: + // Closed + return ErrClosed + } + } + + batch := db.batchPool.Get().(*Batch) + batch.Reset() + batch.appendRec(kt, key, value) + return db.writeLocked(batch, batch, merge, sync) +} + +// Put sets the value for the given key. It overwrites any previous value +// for that key; a DB is not a multi-map. Write merge also applies for Put, see +// Write. +// +// It is safe to modify the contents of the arguments after Put returns but not +// before. +func (db *DB) Put(key, value []byte, wo *opt.WriteOptions) error { + return db.putRec(keyTypeVal, key, value, wo) +} + +// Delete deletes the value for the given key. Delete will not returns error if +// key doesn't exist. Write merge also applies for Delete, see Write. +// +// It is safe to modify the contents of the arguments after Delete returns but +// not before. +func (db *DB) Delete(key []byte, wo *opt.WriteOptions) error { + return db.putRec(keyTypeDel, key, nil, wo) +} + +func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool { + iter := mem.NewIterator(nil) + defer iter.Release() + return (max == nil || (iter.First() && icmp.uCompare(max, internalKey(iter.Key()).ukey()) >= 0)) && + (min == nil || (iter.Last() && icmp.uCompare(min, internalKey(iter.Key()).ukey()) <= 0)) +} + +// CompactRange compacts the underlying DB for the given key range. +// In particular, deleted and overwritten versions are discarded, +// and the data is rearranged to reduce the cost of operations +// needed to access the data. This operation should typically only +// be invoked by users who understand the underlying implementation. +// +// A nil Range.Start is treated as a key before all keys in the DB. +// And a nil Range.Limit is treated as a key after all keys in the DB. +// Therefore if both is nil then it will compact entire DB. +func (db *DB) CompactRange(r util.Range) error { + if err := db.ok(); err != nil { + return err + } + + // Lock writer. + select { + case db.writeLockC <- struct{}{}: + case err := <-db.compPerErrC: + return err + case <-db.closeC: + return ErrClosed + } + + // Check for overlaps in memdb. + mdb := db.getEffectiveMem() + if mdb == nil { + return ErrClosed + } + defer mdb.decref() + if isMemOverlaps(db.s.icmp, mdb.DB, r.Start, r.Limit) { + // Memdb compaction. + if _, err := db.rotateMem(0, false); err != nil { + <-db.writeLockC + return err + } + <-db.writeLockC + if err := db.compTriggerWait(db.mcompCmdC); err != nil { + return err + } + } else { + <-db.writeLockC + } + + // Table compaction. + return db.compTriggerRange(db.tcompCmdC, -1, r.Start, r.Limit) +} + +// SetReadOnly makes DB read-only. It will stay read-only until reopened. +func (db *DB) SetReadOnly() error { + if err := db.ok(); err != nil { + return err + } + + // Lock writer. + select { + case db.writeLockC <- struct{}{}: + db.compWriteLocking = true + case err := <-db.compPerErrC: + return err + case <-db.closeC: + return ErrClosed + } + + // Set compaction read-only. + select { + case db.compErrSetC <- ErrReadOnly: + case perr := <-db.compPerErrC: + return perr + case <-db.closeC: + return ErrClosed + } + + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/doc.go b/vendor/github.com/syndtr/goleveldb/leveldb/doc.go new file mode 100644 index 000000000000..be768e5739f7 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/doc.go @@ -0,0 +1,92 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. 
+// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package leveldb provides implementation of LevelDB key/value database. +// +// Create or open a database: +// +// // The returned DB instance is safe for concurrent use. Which mean that all +// // DB's methods may be called concurrently from multiple goroutine. +// db, err := leveldb.OpenFile("path/to/db", nil) +// ... +// defer db.Close() +// ... +// +// Read or modify the database content: +// +// // Remember that the contents of the returned slice should not be modified. +// data, err := db.Get([]byte("key"), nil) +// ... +// err = db.Put([]byte("key"), []byte("value"), nil) +// ... +// err = db.Delete([]byte("key"), nil) +// ... +// +// Iterate over database content: +// +// iter := db.NewIterator(nil, nil) +// for iter.Next() { +// // Remember that the contents of the returned slice should not be modified, and +// // only valid until the next call to Next. +// key := iter.Key() +// value := iter.Value() +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// +// Iterate over subset of database content with a particular prefix: +// iter := db.NewIterator(util.BytesPrefix([]byte("foo-")), nil) +// for iter.Next() { +// // Use key/value. +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// +// Seek-then-Iterate: +// +// iter := db.NewIterator(nil, nil) +// for ok := iter.Seek(key); ok; ok = iter.Next() { +// // Use key/value. +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// +// Iterate over subset of database content: +// +// iter := db.NewIterator(&util.Range{Start: []byte("foo"), Limit: []byte("xoo")}, nil) +// for iter.Next() { +// // Use key/value. +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// +// Batch writes: +// +// batch := new(leveldb.Batch) +// batch.Put([]byte("foo"), []byte("value")) +// batch.Put([]byte("bar"), []byte("another value")) +// batch.Delete([]byte("baz")) +// err = db.Write(batch, nil) +// ... +// +// Use bloom filter: +// +// o := &opt.Options{ +// Filter: filter.NewBloomFilter(10), +// } +// db, err := leveldb.OpenFile("path/to/db", o) +// ... +// defer db.Close() +// ... +package leveldb diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/errors.go b/vendor/github.com/syndtr/goleveldb/leveldb/errors.go new file mode 100644 index 000000000000..de2649812c20 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/errors.go @@ -0,0 +1,20 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/errors" +) + +// Common errors. +var ( + ErrNotFound = errors.ErrNotFound + ErrReadOnly = errors.New("leveldb: read-only mode") + ErrSnapshotReleased = errors.New("leveldb: snapshot released") + ErrIterReleased = errors.New("leveldb: iterator released") + ErrClosed = errors.New("leveldb: closed") +) diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/errors/errors.go b/vendor/github.com/syndtr/goleveldb/leveldb/errors/errors.go new file mode 100644 index 000000000000..8d6146b6f5c6 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/errors/errors.go @@ -0,0 +1,78 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
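+// For illustration only, callers typically use the exported ErrNotFound value
+// to distinguish a missing key from a real failure (sketch, hypothetical key):
+//
+//	value, err := db.Get([]byte("key"), nil)
+//	switch {
+//	case err == leveldb.ErrNotFound:
+//		// Key does not exist.
+//	case err != nil:
+//		// Real read error.
+//	default:
+//		_ = value
+//	}
+//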
+ +// Package errors provides common error types used throughout leveldb. +package errors + +import ( + "errors" + "fmt" + + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Common errors. +var ( + ErrNotFound = New("leveldb: not found") + ErrReleased = util.ErrReleased + ErrHasReleaser = util.ErrHasReleaser +) + +// New returns an error that formats as the given text. +func New(text string) error { + return errors.New(text) +} + +// ErrCorrupted is the type that wraps errors that indicate corruption in +// the database. +type ErrCorrupted struct { + Fd storage.FileDesc + Err error +} + +func (e *ErrCorrupted) Error() string { + if !e.Fd.Zero() { + return fmt.Sprintf("%v [file=%v]", e.Err, e.Fd) + } + return e.Err.Error() +} + +// NewErrCorrupted creates new ErrCorrupted error. +func NewErrCorrupted(fd storage.FileDesc, err error) error { + return &ErrCorrupted{fd, err} +} + +// IsCorrupted returns a boolean indicating whether the error is indicating +// a corruption. +func IsCorrupted(err error) bool { + switch err.(type) { + case *ErrCorrupted: + return true + case *storage.ErrCorrupted: + return true + } + return false +} + +// ErrMissingFiles is the type that indicating a corruption due to missing +// files. ErrMissingFiles always wrapped with ErrCorrupted. +type ErrMissingFiles struct { + Fds []storage.FileDesc +} + +func (e *ErrMissingFiles) Error() string { return "file missing" } + +// SetFd sets 'file info' of the given error with the given file. +// Currently only ErrCorrupted is supported, otherwise will do nothing. +func SetFd(err error, fd storage.FileDesc) error { + switch x := err.(type) { + case *ErrCorrupted: + x.Fd = fd + return x + } + return err +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/filter.go b/vendor/github.com/syndtr/goleveldb/leveldb/filter.go new file mode 100644 index 000000000000..e961e420d3c4 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/filter.go @@ -0,0 +1,31 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/filter" +) + +type iFilter struct { + filter.Filter +} + +func (f iFilter) Contains(filter, key []byte) bool { + return f.Filter.Contains(filter, internalKey(key).ukey()) +} + +func (f iFilter) NewGenerator() filter.FilterGenerator { + return iFilterGenerator{f.Filter.NewGenerator()} +} + +type iFilterGenerator struct { + filter.FilterGenerator +} + +func (g iFilterGenerator) Add(key []byte) { + g.FilterGenerator.Add(internalKey(key).ukey()) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/filter/bloom.go b/vendor/github.com/syndtr/goleveldb/leveldb/filter/bloom.go new file mode 100644 index 000000000000..bab0e99705f5 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/filter/bloom.go @@ -0,0 +1,116 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package filter + +import ( + "github.com/syndtr/goleveldb/leveldb/util" +) + +func bloomHash(key []byte) uint32 { + return util.Hash(key, 0xbc9f1d34) +} + +type bloomFilter int + +// The bloom filter serializes its parameters and is backward compatible +// with respect to them. Therefor, its parameters are not added to its +// name. 
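+// For example, with 10 bits per key (NewBloomFilter(10), defined below) the
+// generator derives k = 10*69/100 = 6 hash probes (0.69 ~= ln 2), which gives
+// a false-positive rate of roughly 1% on typical key sets.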
+func (bloomFilter) Name() string { + return "leveldb.BuiltinBloomFilter" +} + +func (f bloomFilter) Contains(filter, key []byte) bool { + nBytes := len(filter) - 1 + if nBytes < 1 { + return false + } + nBits := uint32(nBytes * 8) + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + k := filter[nBytes] + if k > 30 { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true + } + + kh := bloomHash(key) + delta := (kh >> 17) | (kh << 15) // Rotate right 17 bits + for j := uint8(0); j < k; j++ { + bitpos := kh % nBits + if (uint32(filter[bitpos/8]) & (1 << (bitpos % 8))) == 0 { + return false + } + kh += delta + } + return true +} + +func (f bloomFilter) NewGenerator() FilterGenerator { + // Round down to reduce probing cost a little bit. + k := uint8(f * 69 / 100) // 0.69 =~ ln(2) + if k < 1 { + k = 1 + } else if k > 30 { + k = 30 + } + return &bloomFilterGenerator{ + n: int(f), + k: k, + } +} + +type bloomFilterGenerator struct { + n int + k uint8 + + keyHashes []uint32 +} + +func (g *bloomFilterGenerator) Add(key []byte) { + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. + g.keyHashes = append(g.keyHashes, bloomHash(key)) +} + +func (g *bloomFilterGenerator) Generate(b Buffer) { + // Compute bloom filter size (in both bits and bytes) + nBits := uint32(len(g.keyHashes) * g.n) + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if nBits < 64 { + nBits = 64 + } + nBytes := (nBits + 7) / 8 + nBits = nBytes * 8 + + dest := b.Alloc(int(nBytes) + 1) + dest[nBytes] = g.k + for _, kh := range g.keyHashes { + delta := (kh >> 17) | (kh << 15) // Rotate right 17 bits + for j := uint8(0); j < g.k; j++ { + bitpos := kh % nBits + dest[bitpos/8] |= (1 << (bitpos % 8)) + kh += delta + } + } + + g.keyHashes = g.keyHashes[:0] +} + +// NewBloomFilter creates a new initialized bloom filter for given +// bitsPerKey. +// +// Since bitsPerKey is persisted individually for each bloom filter +// serialization, bloom filters are backwards compatible with respect to +// changing bitsPerKey. This means that no big performance penalty will +// be experienced when changing the parameter. See documentation for +// opt.Options.Filter for more information. +func NewBloomFilter(bitsPerKey int) Filter { + return bloomFilter(bitsPerKey) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/filter/filter.go b/vendor/github.com/syndtr/goleveldb/leveldb/filter/filter.go new file mode 100644 index 000000000000..7a925c5a869e --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/filter/filter.go @@ -0,0 +1,60 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package filter provides interface and implementation of probabilistic +// data structure. +// +// The filter is resposible for creating small filter from a set of keys. +// These filter will then used to test whether a key is a member of the set. +// In many cases, a filter can cut down the number of disk seeks from a +// handful to a single disk seek per DB.Get call. +package filter + +// Buffer is the interface that wraps basic Alloc, Write and WriteByte methods. +type Buffer interface { + // Alloc allocs n bytes of slice from the buffer. This also advancing + // write offset. 
+ Alloc(n int) []byte + + // Write appends the contents of p to the buffer. + Write(p []byte) (n int, err error) + + // WriteByte appends the byte c to the buffer. + WriteByte(c byte) error +} + +// Filter is the filter. +type Filter interface { + // Name returns the name of this policy. + // + // Note that if the filter encoding changes in an incompatible way, + // the name returned by this method must be changed. Otherwise, old + // incompatible filters may be passed to methods of this type. + Name() string + + // NewGenerator creates a new filter generator. + NewGenerator() FilterGenerator + + // Contains returns true if the filter contains the given key. + // + // The filter are filters generated by the filter generator. + Contains(filter, key []byte) bool +} + +// FilterGenerator is the filter generator. +type FilterGenerator interface { + // Add adds a key to the filter generator. + // + // The key may become invalid after call to this method end, therefor + // key must be copied if implementation require keeping key for later + // use. The key should not modified directly, doing so may cause + // undefined results. + Add(key []byte) + + // Generate generates filters based on keys passed so far. After call + // to Generate the filter generator maybe resetted, depends on implementation. + Generate(b Buffer) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go new file mode 100644 index 000000000000..a23ab05f70fe --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go @@ -0,0 +1,184 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package iterator + +import ( + "github.com/syndtr/goleveldb/leveldb/util" +) + +// BasicArray is the interface that wraps basic Len and Search method. +type BasicArray interface { + // Len returns length of the array. + Len() int + + // Search finds smallest index that point to a key that is greater + // than or equal to the given key. + Search(key []byte) int +} + +// Array is the interface that wraps BasicArray and basic Index method. +type Array interface { + BasicArray + + // Index returns key/value pair with index of i. + Index(i int) (key, value []byte) +} + +// Array is the interface that wraps BasicArray and basic Get method. +type ArrayIndexer interface { + BasicArray + + // Get returns a new data iterator with index of i. 
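+	// For illustration only, Array can be satisfied by a pair of parallel,
+	// sorted slices (sketch; the type name is hypothetical and it assumes the
+	// standard bytes and sort packages):
+	//
+	//	type kvArray struct{ keys, values [][]byte }
+	//
+	//	func (a kvArray) Len() int { return len(a.keys) }
+	//	func (a kvArray) Search(k []byte) int {
+	//		return sort.Search(len(a.keys), func(i int) bool {
+	//			return bytes.Compare(a.keys[i], k) >= 0
+	//		})
+	//	}
+	//	func (a kvArray) Index(i int) (key, value []byte) {
+	//		return a.keys[i], a.values[i]
+	//	}
+	//
+	//	it := iterator.NewArrayIterator(kvArray{keys, values})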
+ Get(i int) Iterator +} + +type basicArrayIterator struct { + util.BasicReleaser + array BasicArray + pos int + err error +} + +func (i *basicArrayIterator) Valid() bool { + return i.pos >= 0 && i.pos < i.array.Len() && !i.Released() +} + +func (i *basicArrayIterator) First() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + if i.array.Len() == 0 { + i.pos = -1 + return false + } + i.pos = 0 + return true +} + +func (i *basicArrayIterator) Last() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + n := i.array.Len() + if n == 0 { + i.pos = 0 + return false + } + i.pos = n - 1 + return true +} + +func (i *basicArrayIterator) Seek(key []byte) bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + n := i.array.Len() + if n == 0 { + i.pos = 0 + return false + } + i.pos = i.array.Search(key) + if i.pos >= n { + return false + } + return true +} + +func (i *basicArrayIterator) Next() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.pos++ + if n := i.array.Len(); i.pos >= n { + i.pos = n + return false + } + return true +} + +func (i *basicArrayIterator) Prev() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.pos-- + if i.pos < 0 { + i.pos = -1 + return false + } + return true +} + +func (i *basicArrayIterator) Error() error { return i.err } + +type arrayIterator struct { + basicArrayIterator + array Array + pos int + key, value []byte +} + +func (i *arrayIterator) updateKV() { + if i.pos == i.basicArrayIterator.pos { + return + } + i.pos = i.basicArrayIterator.pos + if i.Valid() { + i.key, i.value = i.array.Index(i.pos) + } else { + i.key = nil + i.value = nil + } +} + +func (i *arrayIterator) Key() []byte { + i.updateKV() + return i.key +} + +func (i *arrayIterator) Value() []byte { + i.updateKV() + return i.value +} + +type arrayIteratorIndexer struct { + basicArrayIterator + array ArrayIndexer +} + +func (i *arrayIteratorIndexer) Get() Iterator { + if i.Valid() { + return i.array.Get(i.basicArrayIterator.pos) + } + return nil +} + +// NewArrayIterator returns an iterator from the given array. +func NewArrayIterator(array Array) Iterator { + return &arrayIterator{ + basicArrayIterator: basicArrayIterator{array: array, pos: -1}, + array: array, + pos: -1, + } +} + +// NewArrayIndexer returns an index iterator from the given array. +func NewArrayIndexer(array ArrayIndexer) IteratorIndexer { + return &arrayIteratorIndexer{ + basicArrayIterator: basicArrayIterator{array: array, pos: -1}, + array: array, + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go new file mode 100644 index 000000000000..939adbb9332b --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go @@ -0,0 +1,242 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package iterator + +import ( + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// IteratorIndexer is the interface that wraps CommonIterator and basic Get +// method. IteratorIndexer provides index for indexed iterator. +type IteratorIndexer interface { + CommonIterator + + // Get returns a new data iterator for the current position, or nil if + // done. 
+ Get() Iterator +} + +type indexedIterator struct { + util.BasicReleaser + index IteratorIndexer + strict bool + + data Iterator + err error + errf func(err error) + closed bool +} + +func (i *indexedIterator) setData() { + if i.data != nil { + i.data.Release() + } + i.data = i.index.Get() +} + +func (i *indexedIterator) clearData() { + if i.data != nil { + i.data.Release() + } + i.data = nil +} + +func (i *indexedIterator) indexErr() { + if err := i.index.Error(); err != nil { + if i.errf != nil { + i.errf(err) + } + i.err = err + } +} + +func (i *indexedIterator) dataErr() bool { + if err := i.data.Error(); err != nil { + if i.errf != nil { + i.errf(err) + } + if i.strict || !errors.IsCorrupted(err) { + i.err = err + return true + } + } + return false +} + +func (i *indexedIterator) Valid() bool { + return i.data != nil && i.data.Valid() +} + +func (i *indexedIterator) First() bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + if !i.index.First() { + i.indexErr() + i.clearData() + return false + } + i.setData() + return i.Next() +} + +func (i *indexedIterator) Last() bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + if !i.index.Last() { + i.indexErr() + i.clearData() + return false + } + i.setData() + if !i.data.Last() { + if i.dataErr() { + return false + } + i.clearData() + return i.Prev() + } + return true +} + +func (i *indexedIterator) Seek(key []byte) bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + if !i.index.Seek(key) { + i.indexErr() + i.clearData() + return false + } + i.setData() + if !i.data.Seek(key) { + if i.dataErr() { + return false + } + i.clearData() + return i.Next() + } + return true +} + +func (i *indexedIterator) Next() bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + switch { + case i.data != nil && !i.data.Next(): + if i.dataErr() { + return false + } + i.clearData() + fallthrough + case i.data == nil: + if !i.index.Next() { + i.indexErr() + return false + } + i.setData() + return i.Next() + } + return true +} + +func (i *indexedIterator) Prev() bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + switch { + case i.data != nil && !i.data.Prev(): + if i.dataErr() { + return false + } + i.clearData() + fallthrough + case i.data == nil: + if !i.index.Prev() { + i.indexErr() + return false + } + i.setData() + if !i.data.Last() { + if i.dataErr() { + return false + } + i.clearData() + return i.Prev() + } + } + return true +} + +func (i *indexedIterator) Key() []byte { + if i.data == nil { + return nil + } + return i.data.Key() +} + +func (i *indexedIterator) Value() []byte { + if i.data == nil { + return nil + } + return i.data.Value() +} + +func (i *indexedIterator) Release() { + i.clearData() + i.index.Release() + i.BasicReleaser.Release() +} + +func (i *indexedIterator) Error() error { + if i.err != nil { + return i.err + } + if err := i.index.Error(); err != nil { + return err + } + return nil +} + +func (i *indexedIterator) SetErrorCallback(f func(err error)) { + i.errf = f +} + +// NewIndexedIterator returns an 'indexed iterator'. An index is iterator +// that returns another iterator, a 'data iterator'. A 'data iterator' is the +// iterator that contains actual key/value pairs. 
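+// For example, the table reader in this library is built this way: its index
+// iterator walks an sstable's index block, and Get opens the data-block
+// iterator for the selected index entry.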
+// +// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true) +// won't be ignored and will halt 'indexed iterator', otherwise the iterator will +// continue to the next 'data iterator'. Corruption on 'index iterator' will not be +// ignored and will halt the iterator. +func NewIndexedIterator(index IteratorIndexer, strict bool) Iterator { + return &indexedIterator{index: index, strict: strict} +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/iterator/iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/iter.go new file mode 100644 index 000000000000..96fb0f6859ca --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/iter.go @@ -0,0 +1,132 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package iterator provides interface and implementation to traverse over +// contents of a database. +package iterator + +import ( + "errors" + + "github.com/syndtr/goleveldb/leveldb/util" +) + +var ( + ErrIterReleased = errors.New("leveldb/iterator: iterator released") +) + +// IteratorSeeker is the interface that wraps the 'seeks method'. +type IteratorSeeker interface { + // First moves the iterator to the first key/value pair. If the iterator + // only contains one key/value pair then First and Last would moves + // to the same key/value pair. + // It returns whether such pair exist. + First() bool + + // Last moves the iterator to the last key/value pair. If the iterator + // only contains one key/value pair then First and Last would moves + // to the same key/value pair. + // It returns whether such pair exist. + Last() bool + + // Seek moves the iterator to the first key/value pair whose key is greater + // than or equal to the given key. + // It returns whether such pair exist. + // + // It is safe to modify the contents of the argument after Seek returns. + Seek(key []byte) bool + + // Next moves the iterator to the next key/value pair. + // It returns false if the iterator is exhausted. + Next() bool + + // Prev moves the iterator to the previous key/value pair. + // It returns false if the iterator is exhausted. + Prev() bool +} + +// CommonIterator is the interface that wraps common iterator methods. +type CommonIterator interface { + IteratorSeeker + + // util.Releaser is the interface that wraps basic Release method. + // When called Release will releases any resources associated with the + // iterator. + util.Releaser + + // util.ReleaseSetter is the interface that wraps the basic SetReleaser + // method. + util.ReleaseSetter + + // TODO: Remove this when ready. + Valid() bool + + // Error returns any accumulated error. Exhausting all the key/value pairs + // is not considered to be an error. + Error() error +} + +// Iterator iterates over a DB's key/value pairs in key order. +// +// When encounter an error any 'seeks method' will return false and will +// yield no key/value pairs. The error can be queried by calling the Error +// method. Calling Release is still necessary. +// +// An iterator must be released after use, but it is not necessary to read +// an iterator until exhaustion. +// Also, an iterator is not necessarily safe for concurrent use, but it is +// safe to use multiple iterators concurrently, with each in a dedicated +// goroutine. +type Iterator interface { + CommonIterator + + // Key returns the key of the current key/value pair, or nil if done. 
+ // The caller should not modify the contents of the returned slice, and + // its contents may change on the next call to any 'seeks method'. + Key() []byte + + // Value returns the value of the current key/value pair, or nil if done. + // The caller should not modify the contents of the returned slice, and + // its contents may change on the next call to any 'seeks method'. + Value() []byte +} + +// ErrorCallbackSetter is the interface that wraps basic SetErrorCallback +// method. +// +// ErrorCallbackSetter implemented by indexed and merged iterator. +type ErrorCallbackSetter interface { + // SetErrorCallback allows set an error callback of the corresponding + // iterator. Use nil to clear the callback. + SetErrorCallback(f func(err error)) +} + +type emptyIterator struct { + util.BasicReleaser + err error +} + +func (i *emptyIterator) rErr() { + if i.err == nil && i.Released() { + i.err = ErrIterReleased + } +} + +func (*emptyIterator) Valid() bool { return false } +func (i *emptyIterator) First() bool { i.rErr(); return false } +func (i *emptyIterator) Last() bool { i.rErr(); return false } +func (i *emptyIterator) Seek(key []byte) bool { i.rErr(); return false } +func (i *emptyIterator) Next() bool { i.rErr(); return false } +func (i *emptyIterator) Prev() bool { i.rErr(); return false } +func (*emptyIterator) Key() []byte { return nil } +func (*emptyIterator) Value() []byte { return nil } +func (i *emptyIterator) Error() error { return i.err } + +// NewEmptyIterator creates an empty iterator. The err parameter can be +// nil, but if not nil the given err will be returned by Error method. +func NewEmptyIterator(err error) Iterator { + return &emptyIterator{err: err} +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go new file mode 100644 index 000000000000..1a7e29df8fbd --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go @@ -0,0 +1,304 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package iterator + +import ( + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/util" +) + +type dir int + +const ( + dirReleased dir = iota - 1 + dirSOI + dirEOI + dirBackward + dirForward +) + +type mergedIterator struct { + cmp comparer.Comparer + iters []Iterator + strict bool + + keys [][]byte + index int + dir dir + err error + errf func(err error) + releaser util.Releaser +} + +func assertKey(key []byte) []byte { + if key == nil { + panic("leveldb/iterator: nil key") + } + return key +} + +func (i *mergedIterator) iterErr(iter Iterator) bool { + if err := iter.Error(); err != nil { + if i.errf != nil { + i.errf(err) + } + if i.strict || !errors.IsCorrupted(err) { + i.err = err + return true + } + } + return false +} + +func (i *mergedIterator) Valid() bool { + return i.err == nil && i.dir > dirEOI +} + +func (i *mergedIterator) First() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + for x, iter := range i.iters { + switch { + case iter.First(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + } + i.dir = dirSOI + return i.next() +} + +func (i *mergedIterator) Last() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + for x, iter := range i.iters { + switch { + case iter.Last(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + } + i.dir = dirEOI + return i.prev() +} + +func (i *mergedIterator) Seek(key []byte) bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + for x, iter := range i.iters { + switch { + case iter.Seek(key): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + } + i.dir = dirSOI + return i.next() +} + +func (i *mergedIterator) next() bool { + var key []byte + if i.dir == dirForward { + key = i.keys[i.index] + } + for x, tkey := range i.keys { + if tkey != nil && (key == nil || i.cmp.Compare(tkey, key) < 0) { + key = tkey + i.index = x + } + } + if key == nil { + i.dir = dirEOI + return false + } + i.dir = dirForward + return true +} + +func (i *mergedIterator) Next() bool { + if i.dir == dirEOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + switch i.dir { + case dirSOI: + return i.First() + case dirBackward: + key := append([]byte{}, i.keys[i.index]...) 
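+		// Direction change (backward -> forward): re-seek every source
+		// iterator to the saved current key, then advance past it.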
+ if !i.Seek(key) { + return false + } + return i.Next() + } + + x := i.index + iter := i.iters[x] + switch { + case iter.Next(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + return i.next() +} + +func (i *mergedIterator) prev() bool { + var key []byte + if i.dir == dirBackward { + key = i.keys[i.index] + } + for x, tkey := range i.keys { + if tkey != nil && (key == nil || i.cmp.Compare(tkey, key) > 0) { + key = tkey + i.index = x + } + } + if key == nil { + i.dir = dirSOI + return false + } + i.dir = dirBackward + return true +} + +func (i *mergedIterator) Prev() bool { + if i.dir == dirSOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + switch i.dir { + case dirEOI: + return i.Last() + case dirForward: + key := append([]byte{}, i.keys[i.index]...) + for x, iter := range i.iters { + if x == i.index { + continue + } + seek := iter.Seek(key) + switch { + case seek && iter.Prev(), !seek && iter.Last(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + } + } + + x := i.index + iter := i.iters[x] + switch { + case iter.Prev(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + return i.prev() +} + +func (i *mergedIterator) Key() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.keys[i.index] +} + +func (i *mergedIterator) Value() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.iters[i.index].Value() +} + +func (i *mergedIterator) Release() { + if i.dir != dirReleased { + i.dir = dirReleased + for _, iter := range i.iters { + iter.Release() + } + i.iters = nil + i.keys = nil + if i.releaser != nil { + i.releaser.Release() + i.releaser = nil + } + } +} + +func (i *mergedIterator) SetReleaser(releaser util.Releaser) { + if i.dir == dirReleased { + panic(util.ErrReleased) + } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) + } + i.releaser = releaser +} + +func (i *mergedIterator) Error() error { + return i.err +} + +func (i *mergedIterator) SetErrorCallback(f func(err error)) { + i.errf = f +} + +// NewMergedIterator returns an iterator that merges its input. Walking the +// resultant iterator will return all key/value pairs of all input iterators +// in strictly increasing key order, as defined by cmp. +// The input's key ranges may overlap, but there are assumed to be no duplicate +// keys: if iters[i] contains a key k then iters[j] will not contain that key k. +// None of the iters may be nil. +// +// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true) +// won't be ignored and will halt 'merged iterator', otherwise the iterator will +// continue to the next 'input iterator'. +func NewMergedIterator(iters []Iterator, cmp comparer.Comparer, strict bool) Iterator { + return &mergedIterator{ + iters: iters, + cmp: cmp, + strict: strict, + keys: make([][]byte, len(iters)), + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/journal/journal.go b/vendor/github.com/syndtr/goleveldb/leveldb/journal/journal.go new file mode 100644 index 000000000000..d094c3d0f8a5 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/journal/journal.go @@ -0,0 +1,524 @@ +// Copyright 2011 The LevelDB-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// Taken from: https://code.google.com/p/leveldb-go/source/browse/leveldb/record/record.go?r=1d5ccbe03246da926391ee12d1c6caae054ff4b0 +// License, authors and contributors informations can be found at bellow URLs respectively: +// https://code.google.com/p/leveldb-go/source/browse/LICENSE +// https://code.google.com/p/leveldb-go/source/browse/AUTHORS +// https://code.google.com/p/leveldb-go/source/browse/CONTRIBUTORS + +// Package journal reads and writes sequences of journals. Each journal is a stream +// of bytes that completes before the next journal starts. +// +// When reading, call Next to obtain an io.Reader for the next journal. Next will +// return io.EOF when there are no more journals. It is valid to call Next +// without reading the current journal to exhaustion. +// +// When writing, call Next to obtain an io.Writer for the next journal. Calling +// Next finishes the current journal. Call Close to finish the final journal. +// +// Optionally, call Flush to finish the current journal and flush the underlying +// writer without starting a new journal. To start a new journal after flushing, +// call Next. +// +// Neither Readers or Writers are safe to use concurrently. +// +// Example code: +// func read(r io.Reader) ([]string, error) { +// var ss []string +// journals := journal.NewReader(r, nil, true, true) +// for { +// j, err := journals.Next() +// if err == io.EOF { +// break +// } +// if err != nil { +// return nil, err +// } +// s, err := ioutil.ReadAll(j) +// if err != nil { +// return nil, err +// } +// ss = append(ss, string(s)) +// } +// return ss, nil +// } +// +// func write(w io.Writer, ss []string) error { +// journals := journal.NewWriter(w) +// for _, s := range ss { +// j, err := journals.Next() +// if err != nil { +// return err +// } +// if _, err := j.Write([]byte(s)), err != nil { +// return err +// } +// } +// return journals.Close() +// } +// +// The wire format is that the stream is divided into 32KiB blocks, and each +// block contains a number of tightly packed chunks. Chunks cannot cross block +// boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a +// block must be zero. +// +// A journal maps to one or more chunks. Each chunk has a 7 byte header (a 4 +// byte checksum, a 2 byte little-endian uint16 length, and a 1 byte chunk type) +// followed by a payload. The checksum is over the chunk type and the payload. +// +// There are four chunk types: whether the chunk is the full journal, or the +// first, middle or last chunk of a multi-chunk journal. A multi-chunk journal +// has one first chunk, zero or more middle chunks, and one last chunk. +// +// The wire format allows for limited recovery in the face of data corruption: +// on a format error (such as a checksum mismatch), the reader moves to the +// next block and looks for the next full or first chunk. +package journal + +import ( + "encoding/binary" + "fmt" + "io" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// These constants are part of the wire format and should not be changed. +const ( + fullChunkType = 1 + firstChunkType = 2 + middleChunkType = 3 + lastChunkType = 4 +) + +const ( + blockSize = 32 * 1024 + headerSize = 7 +) + +type flusher interface { + Flush() error +} + +// ErrCorrupted is the error type that generated by corrupted block or chunk. 
+type ErrCorrupted struct { + Size int + Reason string +} + +func (e *ErrCorrupted) Error() string { + return fmt.Sprintf("leveldb/journal: block/chunk corrupted: %s (%d bytes)", e.Reason, e.Size) +} + +// Dropper is the interface that wrap simple Drop method. The Drop +// method will be called when the journal reader dropping a block or chunk. +type Dropper interface { + Drop(err error) +} + +// Reader reads journals from an underlying io.Reader. +type Reader struct { + // r is the underlying reader. + r io.Reader + // the dropper. + dropper Dropper + // strict flag. + strict bool + // checksum flag. + checksum bool + // seq is the sequence number of the current journal. + seq int + // buf[i:j] is the unread portion of the current chunk's payload. + // The low bound, i, excludes the chunk header. + i, j int + // n is the number of bytes of buf that are valid. Once reading has started, + // only the final block can have n < blockSize. + n int + // last is whether the current chunk is the last chunk of the journal. + last bool + // err is any accumulated error. + err error + // buf is the buffer. + buf [blockSize]byte +} + +// NewReader returns a new reader. The dropper may be nil, and if +// strict is true then corrupted or invalid chunk will halt the journal +// reader entirely. +func NewReader(r io.Reader, dropper Dropper, strict, checksum bool) *Reader { + return &Reader{ + r: r, + dropper: dropper, + strict: strict, + checksum: checksum, + last: true, + } +} + +var errSkip = errors.New("leveldb/journal: skipped") + +func (r *Reader) corrupt(n int, reason string, skip bool) error { + if r.dropper != nil { + r.dropper.Drop(&ErrCorrupted{n, reason}) + } + if r.strict && !skip { + r.err = errors.NewErrCorrupted(storage.FileDesc{}, &ErrCorrupted{n, reason}) + return r.err + } + return errSkip +} + +// nextChunk sets r.buf[r.i:r.j] to hold the next chunk's payload, reading the +// next block into the buffer if necessary. +func (r *Reader) nextChunk(first bool) error { + for { + if r.j+headerSize <= r.n { + checksum := binary.LittleEndian.Uint32(r.buf[r.j+0 : r.j+4]) + length := binary.LittleEndian.Uint16(r.buf[r.j+4 : r.j+6]) + chunkType := r.buf[r.j+6] + unprocBlock := r.n - r.j + if checksum == 0 && length == 0 && chunkType == 0 { + // Drop entire block. + r.i = r.n + r.j = r.n + return r.corrupt(unprocBlock, "zero header", false) + } + if chunkType < fullChunkType || chunkType > lastChunkType { + // Drop entire block. + r.i = r.n + r.j = r.n + return r.corrupt(unprocBlock, fmt.Sprintf("invalid chunk type %#x", chunkType), false) + } + r.i = r.j + headerSize + r.j = r.j + headerSize + int(length) + if r.j > r.n { + // Drop entire block. + r.i = r.n + r.j = r.n + return r.corrupt(unprocBlock, "chunk length overflows block", false) + } else if r.checksum && checksum != util.NewCRC(r.buf[r.i-1:r.j]).Value() { + // Drop entire block. + r.i = r.n + r.j = r.n + return r.corrupt(unprocBlock, "checksum mismatch", false) + } + if first && chunkType != fullChunkType && chunkType != firstChunkType { + chunkLength := (r.j - r.i) + headerSize + r.i = r.j + // Report the error, but skip it. + return r.corrupt(chunkLength, "orphan chunk", true) + } + r.last = chunkType == fullChunkType || chunkType == lastChunkType + return nil + } + + // The last block. + if r.n < blockSize && r.n > 0 { + if !first { + return r.corrupt(0, "missing chunk part", false) + } + r.err = io.EOF + return r.err + } + + // Read block. 
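+		// A short read of the final block yields io.ErrUnexpectedEOF, which is
+		// tolerated below; the partial block (n < blockSize) is then consumed
+		// on the next pass through the loop.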
+ n, err := io.ReadFull(r.r, r.buf[:]) + if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { + return err + } + if n == 0 { + if !first { + return r.corrupt(0, "missing chunk part", false) + } + r.err = io.EOF + return r.err + } + r.i, r.j, r.n = 0, 0, n + } +} + +// Next returns a reader for the next journal. It returns io.EOF if there are no +// more journals. The reader returned becomes stale after the next Next call, +// and should no longer be used. If strict is false, the reader will returns +// io.ErrUnexpectedEOF error when found corrupted journal. +func (r *Reader) Next() (io.Reader, error) { + r.seq++ + if r.err != nil { + return nil, r.err + } + r.i = r.j + for { + if err := r.nextChunk(true); err == nil { + break + } else if err != errSkip { + return nil, err + } + } + return &singleReader{r, r.seq, nil}, nil +} + +// Reset resets the journal reader, allows reuse of the journal reader. Reset returns +// last accumulated error. +func (r *Reader) Reset(reader io.Reader, dropper Dropper, strict, checksum bool) error { + r.seq++ + err := r.err + r.r = reader + r.dropper = dropper + r.strict = strict + r.checksum = checksum + r.i = 0 + r.j = 0 + r.n = 0 + r.last = true + r.err = nil + return err +} + +type singleReader struct { + r *Reader + seq int + err error +} + +func (x *singleReader) Read(p []byte) (int, error) { + r := x.r + if r.seq != x.seq { + return 0, errors.New("leveldb/journal: stale reader") + } + if x.err != nil { + return 0, x.err + } + if r.err != nil { + return 0, r.err + } + for r.i == r.j { + if r.last { + return 0, io.EOF + } + x.err = r.nextChunk(false) + if x.err != nil { + if x.err == errSkip { + x.err = io.ErrUnexpectedEOF + } + return 0, x.err + } + } + n := copy(p, r.buf[r.i:r.j]) + r.i += n + return n, nil +} + +func (x *singleReader) ReadByte() (byte, error) { + r := x.r + if r.seq != x.seq { + return 0, errors.New("leveldb/journal: stale reader") + } + if x.err != nil { + return 0, x.err + } + if r.err != nil { + return 0, r.err + } + for r.i == r.j { + if r.last { + return 0, io.EOF + } + x.err = r.nextChunk(false) + if x.err != nil { + if x.err == errSkip { + x.err = io.ErrUnexpectedEOF + } + return 0, x.err + } + } + c := r.buf[r.i] + r.i++ + return c, nil +} + +// Writer writes journals to an underlying io.Writer. +type Writer struct { + // w is the underlying writer. + w io.Writer + // seq is the sequence number of the current journal. + seq int + // f is w as a flusher. + f flusher + // buf[i:j] is the bytes that will become the current chunk. + // The low bound, i, includes the chunk header. + i, j int + // buf[:written] has already been written to w. + // written is zero unless Flush has been called. + written int + // first is whether the current chunk is the first chunk of the journal. + first bool + // pending is whether a chunk is buffered but not yet written. + pending bool + // err is any accumulated error. + err error + // buf is the buffer. + buf [blockSize]byte +} + +// NewWriter returns a new Writer. +func NewWriter(w io.Writer) *Writer { + f, _ := w.(flusher) + return &Writer{ + w: w, + f: f, + } +} + +// fillHeader fills in the header for the pending chunk. 
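+// The layout written here mirrors what the Reader parses: bytes 0-3 hold the
+// little-endian CRC over the chunk type and payload (w.buf[w.i+6:w.j]),
+// bytes 4-5 hold the little-endian payload length, and byte 6 holds the
+// chunk type.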
+func (w *Writer) fillHeader(last bool) { + if w.i+headerSize > w.j || w.j > blockSize { + panic("leveldb/journal: bad writer state") + } + if last { + if w.first { + w.buf[w.i+6] = fullChunkType + } else { + w.buf[w.i+6] = lastChunkType + } + } else { + if w.first { + w.buf[w.i+6] = firstChunkType + } else { + w.buf[w.i+6] = middleChunkType + } + } + binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], util.NewCRC(w.buf[w.i+6:w.j]).Value()) + binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-headerSize)) +} + +// writeBlock writes the buffered block to the underlying writer, and reserves +// space for the next chunk's header. +func (w *Writer) writeBlock() { + _, w.err = w.w.Write(w.buf[w.written:]) + w.i = 0 + w.j = headerSize + w.written = 0 +} + +// writePending finishes the current journal and writes the buffer to the +// underlying writer. +func (w *Writer) writePending() { + if w.err != nil { + return + } + if w.pending { + w.fillHeader(true) + w.pending = false + } + _, w.err = w.w.Write(w.buf[w.written:w.j]) + w.written = w.j +} + +// Close finishes the current journal and closes the writer. +func (w *Writer) Close() error { + w.seq++ + w.writePending() + if w.err != nil { + return w.err + } + w.err = errors.New("leveldb/journal: closed Writer") + return nil +} + +// Flush finishes the current journal, writes to the underlying writer, and +// flushes it if that writer implements interface{ Flush() error }. +func (w *Writer) Flush() error { + w.seq++ + w.writePending() + if w.err != nil { + return w.err + } + if w.f != nil { + w.err = w.f.Flush() + return w.err + } + return nil +} + +// Reset resets the journal writer, allows reuse of the journal writer. Reset +// will also closes the journal writer if not already. +func (w *Writer) Reset(writer io.Writer) (err error) { + w.seq++ + if w.err == nil { + w.writePending() + err = w.err + } + w.w = writer + w.f, _ = writer.(flusher) + w.i = 0 + w.j = 0 + w.written = 0 + w.first = false + w.pending = false + w.err = nil + return +} + +// Next returns a writer for the next journal. The writer returned becomes stale +// after the next Close, Flush or Next call, and should no longer be used. +func (w *Writer) Next() (io.Writer, error) { + w.seq++ + if w.err != nil { + return nil, w.err + } + if w.pending { + w.fillHeader(true) + } + w.i = w.j + w.j = w.j + headerSize + // Check if there is room in the block for the header. + if w.j > blockSize { + // Fill in the rest of the block with zeroes. + for k := w.i; k < blockSize; k++ { + w.buf[k] = 0 + } + w.writeBlock() + if w.err != nil { + return nil, w.err + } + } + w.first = true + w.pending = true + return singleWriter{w, w.seq}, nil +} + +type singleWriter struct { + w *Writer + seq int +} + +func (x singleWriter) Write(p []byte) (int, error) { + w := x.w + if w.seq != x.seq { + return 0, errors.New("leveldb/journal: stale writer") + } + if w.err != nil { + return 0, w.err + } + n0 := len(p) + for len(p) > 0 { + // Write a block, if it is full. + if w.j == blockSize { + w.fillHeader(false) + w.writeBlock() + if w.err != nil { + return 0, w.err + } + w.first = false + } + // Copy bytes into the buffer. 
+ n := copy(w.buf[w.j:], p) + w.j += n + p = p[n:] + } + return n0, nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/key.go b/vendor/github.com/syndtr/goleveldb/leveldb/key.go new file mode 100644 index 000000000000..ad8f51ec85de --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/key.go @@ -0,0 +1,143 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "encoding/binary" + "fmt" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +// ErrInternalKeyCorrupted records internal key corruption. +type ErrInternalKeyCorrupted struct { + Ikey []byte + Reason string +} + +func (e *ErrInternalKeyCorrupted) Error() string { + return fmt.Sprintf("leveldb: internal key %q corrupted: %s", e.Ikey, e.Reason) +} + +func newErrInternalKeyCorrupted(ikey []byte, reason string) error { + return errors.NewErrCorrupted(storage.FileDesc{}, &ErrInternalKeyCorrupted{append([]byte{}, ikey...), reason}) +} + +type keyType uint + +func (kt keyType) String() string { + switch kt { + case keyTypeDel: + return "d" + case keyTypeVal: + return "v" + } + return fmt.Sprintf("", uint(kt)) +} + +// Value types encoded as the last component of internal keys. +// Don't modify; this value are saved to disk. +const ( + keyTypeDel = keyType(0) + keyTypeVal = keyType(1) +) + +// keyTypeSeek defines the keyType that should be passed when constructing an +// internal key for seeking to a particular sequence number (since we +// sort sequence numbers in decreasing order and the value type is +// embedded as the low 8 bits in the sequence number in internal keys, +// we need to use the highest-numbered ValueType, not the lowest). +const keyTypeSeek = keyTypeVal + +const ( + // Maximum value possible for sequence number; the 8-bits are + // used by value type, so its can packed together in single + // 64-bit integer. + keyMaxSeq = (uint64(1) << 56) - 1 + // Maximum value possible for packed sequence number and type. + keyMaxNum = (keyMaxSeq << 8) | uint64(keyTypeSeek) +) + +// Maximum number encoded in bytes. 
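+// (Editor's illustration, not part of the upstream source: an internal key is
+// the user key followed by eight extra bytes holding (seq << 8) | keyType,
+// encoded little-endian. For example, user key "foo" at sequence number 5
+// with keyTypeVal packs the trailing uint64 as 5<<8 | 1 = 0x501, and
+// parseInternalKey later recovers seq = num >> 8 and kt = num & 0xff.
+// keyMaxNumBytes below is the largest such packed number, encoded once at
+// init time for reuse.)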
+var keyMaxNumBytes = make([]byte, 8) + +func init() { + binary.LittleEndian.PutUint64(keyMaxNumBytes, keyMaxNum) +} + +type internalKey []byte + +func makeInternalKey(dst, ukey []byte, seq uint64, kt keyType) internalKey { + if seq > keyMaxSeq { + panic("leveldb: invalid sequence number") + } else if kt > keyTypeVal { + panic("leveldb: invalid type") + } + + dst = ensureBuffer(dst, len(ukey)+8) + copy(dst, ukey) + binary.LittleEndian.PutUint64(dst[len(ukey):], (seq<<8)|uint64(kt)) + return internalKey(dst) +} + +func parseInternalKey(ik []byte) (ukey []byte, seq uint64, kt keyType, err error) { + if len(ik) < 8 { + return nil, 0, 0, newErrInternalKeyCorrupted(ik, "invalid length") + } + num := binary.LittleEndian.Uint64(ik[len(ik)-8:]) + seq, kt = uint64(num>>8), keyType(num&0xff) + if kt > keyTypeVal { + return nil, 0, 0, newErrInternalKeyCorrupted(ik, "invalid type") + } + ukey = ik[:len(ik)-8] + return +} + +func validInternalKey(ik []byte) bool { + _, _, _, err := parseInternalKey(ik) + return err == nil +} + +func (ik internalKey) assert() { + if ik == nil { + panic("leveldb: nil internalKey") + } + if len(ik) < 8 { + panic(fmt.Sprintf("leveldb: internal key %q, len=%d: invalid length", []byte(ik), len(ik))) + } +} + +func (ik internalKey) ukey() []byte { + ik.assert() + return ik[:len(ik)-8] +} + +func (ik internalKey) num() uint64 { + ik.assert() + return binary.LittleEndian.Uint64(ik[len(ik)-8:]) +} + +func (ik internalKey) parseNum() (seq uint64, kt keyType) { + num := ik.num() + seq, kt = uint64(num>>8), keyType(num&0xff) + if kt > keyTypeVal { + panic(fmt.Sprintf("leveldb: internal key %q, len=%d: invalid type %#x", []byte(ik), len(ik), kt)) + } + return +} + +func (ik internalKey) String() string { + if ik == nil { + return "" + } + + if ukey, seq, kt, err := parseInternalKey(ik); err == nil { + return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq) + } + return fmt.Sprintf("", []byte(ik)) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go b/vendor/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go new file mode 100644 index 000000000000..b661c08a93e2 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go @@ -0,0 +1,475 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package memdb provides in-memory key/value database implementation. +package memdb + +import ( + "math/rand" + "sync" + + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Common errors. 
+var ( + ErrNotFound = errors.ErrNotFound + ErrIterReleased = errors.New("leveldb/memdb: iterator released") +) + +const tMaxHeight = 12 + +type dbIter struct { + util.BasicReleaser + p *DB + slice *util.Range + node int + forward bool + key, value []byte + err error +} + +func (i *dbIter) fill(checkStart, checkLimit bool) bool { + if i.node != 0 { + n := i.p.nodeData[i.node] + m := n + i.p.nodeData[i.node+nKey] + i.key = i.p.kvData[n:m] + if i.slice != nil { + switch { + case checkLimit && i.slice.Limit != nil && i.p.cmp.Compare(i.key, i.slice.Limit) >= 0: + fallthrough + case checkStart && i.slice.Start != nil && i.p.cmp.Compare(i.key, i.slice.Start) < 0: + i.node = 0 + goto bail + } + } + i.value = i.p.kvData[m : m+i.p.nodeData[i.node+nVal]] + return true + } +bail: + i.key = nil + i.value = nil + return false +} + +func (i *dbIter) Valid() bool { + return i.node != 0 +} + +func (i *dbIter) First() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.forward = true + i.p.mu.RLock() + defer i.p.mu.RUnlock() + if i.slice != nil && i.slice.Start != nil { + i.node, _ = i.p.findGE(i.slice.Start, false) + } else { + i.node = i.p.nodeData[nNext] + } + return i.fill(false, true) +} + +func (i *dbIter) Last() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.forward = false + i.p.mu.RLock() + defer i.p.mu.RUnlock() + if i.slice != nil && i.slice.Limit != nil { + i.node = i.p.findLT(i.slice.Limit) + } else { + i.node = i.p.findLast() + } + return i.fill(true, false) +} + +func (i *dbIter) Seek(key []byte) bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.forward = true + i.p.mu.RLock() + defer i.p.mu.RUnlock() + if i.slice != nil && i.slice.Start != nil && i.p.cmp.Compare(key, i.slice.Start) < 0 { + key = i.slice.Start + } + i.node, _ = i.p.findGE(key, false) + return i.fill(false, true) +} + +func (i *dbIter) Next() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + if i.node == 0 { + if !i.forward { + return i.First() + } + return false + } + i.forward = true + i.p.mu.RLock() + defer i.p.mu.RUnlock() + i.node = i.p.nodeData[i.node+nNext] + return i.fill(false, true) +} + +func (i *dbIter) Prev() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + if i.node == 0 { + if i.forward { + return i.Last() + } + return false + } + i.forward = false + i.p.mu.RLock() + defer i.p.mu.RUnlock() + i.node = i.p.findLT(i.key) + return i.fill(true, false) +} + +func (i *dbIter) Key() []byte { + return i.key +} + +func (i *dbIter) Value() []byte { + return i.value +} + +func (i *dbIter) Error() error { return i.err } + +func (i *dbIter) Release() { + if !i.Released() { + i.p = nil + i.node = 0 + i.key = nil + i.value = nil + i.BasicReleaser.Release() + } +} + +const ( + nKV = iota + nKey + nVal + nHeight + nNext +) + +// DB is an in-memory key/value database. +type DB struct { + cmp comparer.BasicComparer + rnd *rand.Rand + + mu sync.RWMutex + kvData []byte + // Node data: + // [0] : KV offset + // [1] : Key length + // [2] : Value length + // [3] : Height + // [3..height] : Next nodes + nodeData []int + prevNode [tMaxHeight]int + maxHeight int + n int + kvSize int +} + +func (p *DB) randHeight() (h int) { + const branching = 4 + h = 1 + for h < tMaxHeight && p.rnd.Int()%branching == 0 { + h++ + } + return +} + +// Must hold RW-lock if prev == true, as it use shared prevNode slice. 
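+// (Editor's illustration of the skip-list layout documented on DB above: a
+// node stored at offset i in nodeData occupies 4+h ints, where h is its
+// tower height:
+//
+//	nodeData[i+nKV]     offset of the key/value bytes in kvData
+//	nodeData[i+nKey]    key length
+//	nodeData[i+nVal]    value length
+//	nodeData[i+nHeight] tower height h
+//	nodeData[i+nNext+k] offset of the next node at level k, for k < h
+//
+// findGE below walks these next pointers from the highest level down. With
+// branching = 4 in randHeight, a node reaches height h or more with
+// probability 4^-(h-1), which keeps the expected search cost logarithmic in
+// the number of entries.)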
+func (p *DB) findGE(key []byte, prev bool) (int, bool) { + node := 0 + h := p.maxHeight - 1 + for { + next := p.nodeData[node+nNext+h] + cmp := 1 + if next != 0 { + o := p.nodeData[next] + cmp = p.cmp.Compare(p.kvData[o:o+p.nodeData[next+nKey]], key) + } + if cmp < 0 { + // Keep searching in this list + node = next + } else { + if prev { + p.prevNode[h] = node + } else if cmp == 0 { + return next, true + } + if h == 0 { + return next, cmp == 0 + } + h-- + } + } +} + +func (p *DB) findLT(key []byte) int { + node := 0 + h := p.maxHeight - 1 + for { + next := p.nodeData[node+nNext+h] + o := p.nodeData[next] + if next == 0 || p.cmp.Compare(p.kvData[o:o+p.nodeData[next+nKey]], key) >= 0 { + if h == 0 { + break + } + h-- + } else { + node = next + } + } + return node +} + +func (p *DB) findLast() int { + node := 0 + h := p.maxHeight - 1 + for { + next := p.nodeData[node+nNext+h] + if next == 0 { + if h == 0 { + break + } + h-- + } else { + node = next + } + } + return node +} + +// Put sets the value for the given key. It overwrites any previous value +// for that key; a DB is not a multi-map. +// +// It is safe to modify the contents of the arguments after Put returns. +func (p *DB) Put(key []byte, value []byte) error { + p.mu.Lock() + defer p.mu.Unlock() + + if node, exact := p.findGE(key, true); exact { + kvOffset := len(p.kvData) + p.kvData = append(p.kvData, key...) + p.kvData = append(p.kvData, value...) + p.nodeData[node] = kvOffset + m := p.nodeData[node+nVal] + p.nodeData[node+nVal] = len(value) + p.kvSize += len(value) - m + return nil + } + + h := p.randHeight() + if h > p.maxHeight { + for i := p.maxHeight; i < h; i++ { + p.prevNode[i] = 0 + } + p.maxHeight = h + } + + kvOffset := len(p.kvData) + p.kvData = append(p.kvData, key...) + p.kvData = append(p.kvData, value...) + // Node + node := len(p.nodeData) + p.nodeData = append(p.nodeData, kvOffset, len(key), len(value), h) + for i, n := range p.prevNode[:h] { + m := n + nNext + i + p.nodeData = append(p.nodeData, p.nodeData[m]) + p.nodeData[m] = node + } + + p.kvSize += len(key) + len(value) + p.n++ + return nil +} + +// Delete deletes the value for the given key. It returns ErrNotFound if +// the DB does not contain the key. +// +// It is safe to modify the contents of the arguments after Delete returns. +func (p *DB) Delete(key []byte) error { + p.mu.Lock() + defer p.mu.Unlock() + + node, exact := p.findGE(key, true) + if !exact { + return ErrNotFound + } + + h := p.nodeData[node+nHeight] + for i, n := range p.prevNode[:h] { + m := n + nNext + i + p.nodeData[m] = p.nodeData[p.nodeData[m]+nNext+i] + } + + p.kvSize -= p.nodeData[node+nKey] + p.nodeData[node+nVal] + p.n-- + return nil +} + +// Contains returns true if the given key are in the DB. +// +// It is safe to modify the contents of the arguments after Contains returns. +func (p *DB) Contains(key []byte) bool { + p.mu.RLock() + _, exact := p.findGE(key, false) + p.mu.RUnlock() + return exact +} + +// Get gets the value for the given key. It returns error.ErrNotFound if the +// DB does not contain the key. +// +// The caller should not modify the contents of the returned slice, but +// it is safe to modify the contents of the argument after Get returns. 
+func (p *DB) Get(key []byte) (value []byte, err error) { + p.mu.RLock() + if node, exact := p.findGE(key, false); exact { + o := p.nodeData[node] + p.nodeData[node+nKey] + value = p.kvData[o : o+p.nodeData[node+nVal]] + } else { + err = ErrNotFound + } + p.mu.RUnlock() + return +} + +// Find finds key/value pair whose key is greater than or equal to the +// given key. It returns ErrNotFound if the table doesn't contain +// such pair. +// +// The caller should not modify the contents of the returned slice, but +// it is safe to modify the contents of the argument after Find returns. +func (p *DB) Find(key []byte) (rkey, value []byte, err error) { + p.mu.RLock() + if node, _ := p.findGE(key, false); node != 0 { + n := p.nodeData[node] + m := n + p.nodeData[node+nKey] + rkey = p.kvData[n:m] + value = p.kvData[m : m+p.nodeData[node+nVal]] + } else { + err = ErrNotFound + } + p.mu.RUnlock() + return +} + +// NewIterator returns an iterator of the DB. +// The returned iterator is not safe for concurrent use, but it is safe to use +// multiple iterators concurrently, with each in a dedicated goroutine. +// It is also safe to use an iterator concurrently with modifying its +// underlying DB. However, the resultant key/value pairs are not guaranteed +// to be a consistent snapshot of the DB at a particular point in time. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// DB. And a nil Range.Limit is treated as a key after all keys in +// the DB. +// +// The iterator must be released after use, by calling Release method. +// +// Also read Iterator documentation of the leveldb/iterator package. +func (p *DB) NewIterator(slice *util.Range) iterator.Iterator { + return &dbIter{p: p, slice: slice} +} + +// Capacity returns keys/values buffer capacity. +func (p *DB) Capacity() int { + p.mu.RLock() + defer p.mu.RUnlock() + return cap(p.kvData) +} + +// Size returns sum of keys and values length. Note that deleted +// key/value will not be accounted for, but it will still consume +// the buffer, since the buffer is append only. +func (p *DB) Size() int { + p.mu.RLock() + defer p.mu.RUnlock() + return p.kvSize +} + +// Free returns keys/values free buffer before need to grow. +func (p *DB) Free() int { + p.mu.RLock() + defer p.mu.RUnlock() + return cap(p.kvData) - len(p.kvData) +} + +// Len returns the number of entries in the DB. +func (p *DB) Len() int { + p.mu.RLock() + defer p.mu.RUnlock() + return p.n +} + +// Reset resets the DB to initial empty state. Allows reuse the buffer. +func (p *DB) Reset() { + p.mu.Lock() + p.rnd = rand.New(rand.NewSource(0xdeadbeef)) + p.maxHeight = 1 + p.n = 0 + p.kvSize = 0 + p.kvData = p.kvData[:0] + p.nodeData = p.nodeData[:nNext+tMaxHeight] + p.nodeData[nKV] = 0 + p.nodeData[nKey] = 0 + p.nodeData[nVal] = 0 + p.nodeData[nHeight] = tMaxHeight + for n := 0; n < tMaxHeight; n++ { + p.nodeData[nNext+n] = 0 + p.prevNode[n] = 0 + } + p.mu.Unlock() +} + +// New creates a new initialized in-memory key/value DB. The capacity +// is the initial key/value buffer capacity. The capacity is advisory, +// not enforced. +// +// This DB is append-only, deleting an entry would remove entry node but not +// reclaim KV buffer. +// +// The returned DB instance is safe for concurrent use. 
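+//
+// Example code (editor's sketch; comparer.DefaultComparer comes from the
+// leveldb/comparer package imported above):
+//
+//	mdb := memdb.New(comparer.DefaultComparer, 4096)
+//	mdb.Put([]byte("a"), []byte("1"))
+//	mdb.Put([]byte("a"), []byte("2")) // overwrite; the old bytes stay in the buffer
+//	v, _ := mdb.Get([]byte("a"))      // v == []byte("2")
+//
+//	it := mdb.NewIterator(nil)
+//	for it.Next() {
+//		// use it.Key() and it.Value()
+//	}
+//	it.Release()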
+func New(cmp comparer.BasicComparer, capacity int) *DB { + p := &DB{ + cmp: cmp, + rnd: rand.New(rand.NewSource(0xdeadbeef)), + maxHeight: 1, + kvData: make([]byte, 0, capacity), + nodeData: make([]int, 4+tMaxHeight), + } + p.nodeData[nHeight] = tMaxHeight + return p +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/opt/options.go b/vendor/github.com/syndtr/goleveldb/leveldb/opt/options.go new file mode 100644 index 000000000000..528b16423321 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/opt/options.go @@ -0,0 +1,697 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package opt provides sets of options used by LevelDB. +package opt + +import ( + "math" + + "github.com/syndtr/goleveldb/leveldb/cache" + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/filter" +) + +const ( + KiB = 1024 + MiB = KiB * 1024 + GiB = MiB * 1024 +) + +var ( + DefaultBlockCacher = LRUCacher + DefaultBlockCacheCapacity = 8 * MiB + DefaultBlockRestartInterval = 16 + DefaultBlockSize = 4 * KiB + DefaultCompactionExpandLimitFactor = 25 + DefaultCompactionGPOverlapsFactor = 10 + DefaultCompactionL0Trigger = 4 + DefaultCompactionSourceLimitFactor = 1 + DefaultCompactionTableSize = 2 * MiB + DefaultCompactionTableSizeMultiplier = 1.0 + DefaultCompactionTotalSize = 10 * MiB + DefaultCompactionTotalSizeMultiplier = 10.0 + DefaultCompressionType = SnappyCompression + DefaultIteratorSamplingRate = 1 * MiB + DefaultOpenFilesCacher = LRUCacher + DefaultOpenFilesCacheCapacity = 500 + DefaultWriteBuffer = 4 * MiB + DefaultWriteL0PauseTrigger = 12 + DefaultWriteL0SlowdownTrigger = 8 +) + +// Cacher is a caching algorithm. +type Cacher interface { + New(capacity int) cache.Cacher +} + +type CacherFunc struct { + NewFunc func(capacity int) cache.Cacher +} + +func (f *CacherFunc) New(capacity int) cache.Cacher { + if f.NewFunc != nil { + return f.NewFunc(capacity) + } + return nil +} + +func noCacher(int) cache.Cacher { return nil } + +var ( + // LRUCacher is the LRU-cache algorithm. + LRUCacher = &CacherFunc{cache.NewLRU} + + // NoCacher is the value to disable caching algorithm. + NoCacher = &CacherFunc{} +) + +// Compression is the 'sorted table' block compression algorithm to use. +type Compression uint + +func (c Compression) String() string { + switch c { + case DefaultCompression: + return "default" + case NoCompression: + return "none" + case SnappyCompression: + return "snappy" + } + return "invalid" +} + +const ( + DefaultCompression Compression = iota + NoCompression + SnappyCompression + nCompression +) + +// Strict is the DB 'strict level'. +type Strict uint + +const ( + // If present then a corrupted or invalid chunk or block in manifest + // journal will cause an error instead of being dropped. + // This will prevent database with corrupted manifest to be opened. + StrictManifest Strict = 1 << iota + + // If present then journal chunk checksum will be verified. + StrictJournalChecksum + + // If present then a corrupted or invalid chunk or block in journal + // will cause an error instead of being dropped. + // This will prevent database with corrupted journal to be opened. + StrictJournal + + // If present then 'sorted table' block checksum will be verified. + // This has effect on both 'read operation' and compaction. + StrictBlockChecksum + + // If present then a corrupted 'sorted table' will fails compaction. 
+ // The database will enter read-only mode. + StrictCompaction + + // If present then a corrupted 'sorted table' will halts 'read operation'. + StrictReader + + // If present then leveldb.Recover will drop corrupted 'sorted table'. + StrictRecovery + + // This only applicable for ReadOptions, if present then this ReadOptions + // 'strict level' will override global ones. + StrictOverride + + // StrictAll enables all strict flags. + StrictAll = StrictManifest | StrictJournalChecksum | StrictJournal | StrictBlockChecksum | StrictCompaction | StrictReader | StrictRecovery + + // DefaultStrict is the default strict flags. Specify any strict flags + // will override default strict flags as whole (i.e. not OR'ed). + DefaultStrict = StrictJournalChecksum | StrictBlockChecksum | StrictCompaction | StrictReader + + // NoStrict disables all strict flags. Override default strict flags. + NoStrict = ^StrictAll +) + +// Options holds the optional parameters for the DB at large. +type Options struct { + // AltFilters defines one or more 'alternative filters'. + // 'alternative filters' will be used during reads if a filter block + // does not match with the 'effective filter'. + // + // The default value is nil + AltFilters []filter.Filter + + // BlockCacher provides cache algorithm for LevelDB 'sorted table' block caching. + // Specify NoCacher to disable caching algorithm. + // + // The default value is LRUCacher. + BlockCacher Cacher + + // BlockCacheCapacity defines the capacity of the 'sorted table' block caching. + // Use -1 for zero, this has same effect as specifying NoCacher to BlockCacher. + // + // The default value is 8MiB. + BlockCacheCapacity int + + // BlockCacheEvictRemoved allows enable forced-eviction on cached block belonging + // to removed 'sorted table'. + // + // The default if false. + BlockCacheEvictRemoved bool + + // BlockRestartInterval is the number of keys between restart points for + // delta encoding of keys. + // + // The default value is 16. + BlockRestartInterval int + + // BlockSize is the minimum uncompressed size in bytes of each 'sorted table' + // block. + // + // The default value is 4KiB. + BlockSize int + + // CompactionExpandLimitFactor limits compaction size after expanded. + // This will be multiplied by table size limit at compaction target level. + // + // The default value is 25. + CompactionExpandLimitFactor int + + // CompactionGPOverlapsFactor limits overlaps in grandparent (Level + 2) that a + // single 'sorted table' generates. + // This will be multiplied by table size limit at grandparent level. + // + // The default value is 10. + CompactionGPOverlapsFactor int + + // CompactionL0Trigger defines number of 'sorted table' at level-0 that will + // trigger compaction. + // + // The default value is 4. + CompactionL0Trigger int + + // CompactionSourceLimitFactor limits compaction source size. This doesn't apply to + // level-0. + // This will be multiplied by table size limit at compaction target level. + // + // The default value is 1. + CompactionSourceLimitFactor int + + // CompactionTableSize limits size of 'sorted table' that compaction generates. + // The limits for each level will be calculated as: + // CompactionTableSize * (CompactionTableSizeMultiplier ^ Level) + // The multiplier for each level can also fine-tuned using CompactionTableSizeMultiplierPerLevel. + // + // The default value is 2MiB. + CompactionTableSize int + + // CompactionTableSizeMultiplier defines multiplier for CompactionTableSize. + // + // The default value is 1. 
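+	//
+	// For example (editorial illustration): with the defaults the per-table
+	// limit is 2 MiB at every level, while a multiplier of 2.0 would give
+	// 2 MiB at level 0, 4 MiB at level 1, 8 MiB at level 2, and so on,
+	// following CompactionTableSize * (multiplier ^ level).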
+ CompactionTableSizeMultiplier float64 + + // CompactionTableSizeMultiplierPerLevel defines per-level multiplier for + // CompactionTableSize. + // Use zero to skip a level. + // + // The default value is nil. + CompactionTableSizeMultiplierPerLevel []float64 + + // CompactionTotalSize limits total size of 'sorted table' for each level. + // The limits for each level will be calculated as: + // CompactionTotalSize * (CompactionTotalSizeMultiplier ^ Level) + // The multiplier for each level can also fine-tuned using + // CompactionTotalSizeMultiplierPerLevel. + // + // The default value is 10MiB. + CompactionTotalSize int + + // CompactionTotalSizeMultiplier defines multiplier for CompactionTotalSize. + // + // The default value is 10. + CompactionTotalSizeMultiplier float64 + + // CompactionTotalSizeMultiplierPerLevel defines per-level multiplier for + // CompactionTotalSize. + // Use zero to skip a level. + // + // The default value is nil. + CompactionTotalSizeMultiplierPerLevel []float64 + + // Comparer defines a total ordering over the space of []byte keys: a 'less + // than' relationship. The same comparison algorithm must be used for reads + // and writes over the lifetime of the DB. + // + // The default value uses the same ordering as bytes.Compare. + Comparer comparer.Comparer + + // Compression defines the 'sorted table' block compression to use. + // + // The default value (DefaultCompression) uses snappy compression. + Compression Compression + + // DisableBufferPool allows disable use of util.BufferPool functionality. + // + // The default value is false. + DisableBufferPool bool + + // DisableBlockCache allows disable use of cache.Cache functionality on + // 'sorted table' block. + // + // The default value is false. + DisableBlockCache bool + + // DisableCompactionBackoff allows disable compaction retry backoff. + // + // The default value is false. + DisableCompactionBackoff bool + + // DisableLargeBatchTransaction allows disabling switch-to-transaction mode + // on large batch write. If enable batch writes large than WriteBuffer will + // use transaction. + // + // The default is false. + DisableLargeBatchTransaction bool + + // ErrorIfExist defines whether an error should returned if the DB already + // exist. + // + // The default value is false. + ErrorIfExist bool + + // ErrorIfMissing defines whether an error should returned if the DB is + // missing. If false then the database will be created if missing, otherwise + // an error will be returned. + // + // The default value is false. + ErrorIfMissing bool + + // Filter defines an 'effective filter' to use. An 'effective filter' + // if defined will be used to generate per-table filter block. + // The filter name will be stored on disk. + // During reads LevelDB will try to find matching filter from + // 'effective filter' and 'alternative filters'. + // + // Filter can be changed after a DB has been created. It is recommended + // to put old filter to the 'alternative filters' to mitigate lack of + // filter during transition period. + // + // A filter is used to reduce disk reads when looking for a specific key. + // + // The default value is nil. + Filter filter.Filter + + // IteratorSamplingRate defines approximate gap (in bytes) between read + // sampling of an iterator. The samples will be used to determine when + // compaction should be triggered. + // + // The default is 1MiB. + IteratorSamplingRate int + + // NoSync allows completely disable fsync. + // + // The default is false. 
+ NoSync bool + + // NoWriteMerge allows disabling write merge. + // + // The default is false. + NoWriteMerge bool + + // OpenFilesCacher provides cache algorithm for open files caching. + // Specify NoCacher to disable caching algorithm. + // + // The default value is LRUCacher. + OpenFilesCacher Cacher + + // OpenFilesCacheCapacity defines the capacity of the open files caching. + // Use -1 for zero, this has same effect as specifying NoCacher to OpenFilesCacher. + // + // The default value is 500. + OpenFilesCacheCapacity int + + // If true then opens DB in read-only mode. + // + // The default value is false. + ReadOnly bool + + // Strict defines the DB strict level. + Strict Strict + + // WriteBuffer defines maximum size of a 'memdb' before flushed to + // 'sorted table'. 'memdb' is an in-memory DB backed by an on-disk + // unsorted journal. + // + // LevelDB may held up to two 'memdb' at the same time. + // + // The default value is 4MiB. + WriteBuffer int + + // WriteL0StopTrigger defines number of 'sorted table' at level-0 that will + // pause write. + // + // The default value is 12. + WriteL0PauseTrigger int + + // WriteL0SlowdownTrigger defines number of 'sorted table' at level-0 that + // will trigger write slowdown. + // + // The default value is 8. + WriteL0SlowdownTrigger int +} + +func (o *Options) GetAltFilters() []filter.Filter { + if o == nil { + return nil + } + return o.AltFilters +} + +func (o *Options) GetBlockCacher() Cacher { + if o == nil || o.BlockCacher == nil { + return DefaultBlockCacher + } else if o.BlockCacher == NoCacher { + return nil + } + return o.BlockCacher +} + +func (o *Options) GetBlockCacheCapacity() int { + if o == nil || o.BlockCacheCapacity == 0 { + return DefaultBlockCacheCapacity + } else if o.BlockCacheCapacity < 0 { + return 0 + } + return o.BlockCacheCapacity +} + +func (o *Options) GetBlockCacheEvictRemoved() bool { + if o == nil { + return false + } + return o.BlockCacheEvictRemoved +} + +func (o *Options) GetBlockRestartInterval() int { + if o == nil || o.BlockRestartInterval <= 0 { + return DefaultBlockRestartInterval + } + return o.BlockRestartInterval +} + +func (o *Options) GetBlockSize() int { + if o == nil || o.BlockSize <= 0 { + return DefaultBlockSize + } + return o.BlockSize +} + +func (o *Options) GetCompactionExpandLimit(level int) int { + factor := DefaultCompactionExpandLimitFactor + if o != nil && o.CompactionExpandLimitFactor > 0 { + factor = o.CompactionExpandLimitFactor + } + return o.GetCompactionTableSize(level+1) * factor +} + +func (o *Options) GetCompactionGPOverlaps(level int) int { + factor := DefaultCompactionGPOverlapsFactor + if o != nil && o.CompactionGPOverlapsFactor > 0 { + factor = o.CompactionGPOverlapsFactor + } + return o.GetCompactionTableSize(level+2) * factor +} + +func (o *Options) GetCompactionL0Trigger() int { + if o == nil || o.CompactionL0Trigger == 0 { + return DefaultCompactionL0Trigger + } + return o.CompactionL0Trigger +} + +func (o *Options) GetCompactionSourceLimit(level int) int { + factor := DefaultCompactionSourceLimitFactor + if o != nil && o.CompactionSourceLimitFactor > 0 { + factor = o.CompactionSourceLimitFactor + } + return o.GetCompactionTableSize(level+1) * factor +} + +func (o *Options) GetCompactionTableSize(level int) int { + var ( + base = DefaultCompactionTableSize + mult float64 + ) + if o != nil { + if o.CompactionTableSize > 0 { + base = o.CompactionTableSize + } + if level < len(o.CompactionTableSizeMultiplierPerLevel) && o.CompactionTableSizeMultiplierPerLevel[level] 
> 0 { + mult = o.CompactionTableSizeMultiplierPerLevel[level] + } else if o.CompactionTableSizeMultiplier > 0 { + mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level)) + } + } + if mult == 0 { + mult = math.Pow(DefaultCompactionTableSizeMultiplier, float64(level)) + } + return int(float64(base) * mult) +} + +func (o *Options) GetCompactionTotalSize(level int) int64 { + var ( + base = DefaultCompactionTotalSize + mult float64 + ) + if o != nil { + if o.CompactionTotalSize > 0 { + base = o.CompactionTotalSize + } + if level < len(o.CompactionTotalSizeMultiplierPerLevel) && o.CompactionTotalSizeMultiplierPerLevel[level] > 0 { + mult = o.CompactionTotalSizeMultiplierPerLevel[level] + } else if o.CompactionTotalSizeMultiplier > 0 { + mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level)) + } + } + if mult == 0 { + mult = math.Pow(DefaultCompactionTotalSizeMultiplier, float64(level)) + } + return int64(float64(base) * mult) +} + +func (o *Options) GetComparer() comparer.Comparer { + if o == nil || o.Comparer == nil { + return comparer.DefaultComparer + } + return o.Comparer +} + +func (o *Options) GetCompression() Compression { + if o == nil || o.Compression <= DefaultCompression || o.Compression >= nCompression { + return DefaultCompressionType + } + return o.Compression +} + +func (o *Options) GetDisableBufferPool() bool { + if o == nil { + return false + } + return o.DisableBufferPool +} + +func (o *Options) GetDisableBlockCache() bool { + if o == nil { + return false + } + return o.DisableBlockCache +} + +func (o *Options) GetDisableCompactionBackoff() bool { + if o == nil { + return false + } + return o.DisableCompactionBackoff +} + +func (o *Options) GetDisableLargeBatchTransaction() bool { + if o == nil { + return false + } + return o.DisableLargeBatchTransaction +} + +func (o *Options) GetErrorIfExist() bool { + if o == nil { + return false + } + return o.ErrorIfExist +} + +func (o *Options) GetErrorIfMissing() bool { + if o == nil { + return false + } + return o.ErrorIfMissing +} + +func (o *Options) GetFilter() filter.Filter { + if o == nil { + return nil + } + return o.Filter +} + +func (o *Options) GetIteratorSamplingRate() int { + if o == nil || o.IteratorSamplingRate <= 0 { + return DefaultIteratorSamplingRate + } + return o.IteratorSamplingRate +} + +func (o *Options) GetNoSync() bool { + if o == nil { + return false + } + return o.NoSync +} + +func (o *Options) GetNoWriteMerge() bool { + if o == nil { + return false + } + return o.NoWriteMerge +} + +func (o *Options) GetOpenFilesCacher() Cacher { + if o == nil || o.OpenFilesCacher == nil { + return DefaultOpenFilesCacher + } + if o.OpenFilesCacher == NoCacher { + return nil + } + return o.OpenFilesCacher +} + +func (o *Options) GetOpenFilesCacheCapacity() int { + if o == nil || o.OpenFilesCacheCapacity == 0 { + return DefaultOpenFilesCacheCapacity + } else if o.OpenFilesCacheCapacity < 0 { + return 0 + } + return o.OpenFilesCacheCapacity +} + +func (o *Options) GetReadOnly() bool { + if o == nil { + return false + } + return o.ReadOnly +} + +func (o *Options) GetStrict(strict Strict) bool { + if o == nil || o.Strict == 0 { + return DefaultStrict&strict != 0 + } + return o.Strict&strict != 0 +} + +func (o *Options) GetWriteBuffer() int { + if o == nil || o.WriteBuffer <= 0 { + return DefaultWriteBuffer + } + return o.WriteBuffer +} + +func (o *Options) GetWriteL0PauseTrigger() int { + if o == nil || o.WriteL0PauseTrigger == 0 { + return DefaultWriteL0PauseTrigger + } + return o.WriteL0PauseTrigger +} + +func 
(o *Options) GetWriteL0SlowdownTrigger() int { + if o == nil || o.WriteL0SlowdownTrigger == 0 { + return DefaultWriteL0SlowdownTrigger + } + return o.WriteL0SlowdownTrigger +} + +// ReadOptions holds the optional parameters for 'read operation'. The +// 'read operation' includes Get, Find and NewIterator. +type ReadOptions struct { + // DontFillCache defines whether block reads for this 'read operation' + // should be cached. If false then the block will be cached. This does + // not affects already cached block. + // + // The default value is false. + DontFillCache bool + + // Strict will be OR'ed with global DB 'strict level' unless StrictOverride + // is present. Currently only StrictReader that has effect here. + Strict Strict +} + +func (ro *ReadOptions) GetDontFillCache() bool { + if ro == nil { + return false + } + return ro.DontFillCache +} + +func (ro *ReadOptions) GetStrict(strict Strict) bool { + if ro == nil { + return false + } + return ro.Strict&strict != 0 +} + +// WriteOptions holds the optional parameters for 'write operation'. The +// 'write operation' includes Write, Put and Delete. +type WriteOptions struct { + // NoWriteMerge allows disabling write merge. + // + // The default is false. + NoWriteMerge bool + + // Sync is whether to sync underlying writes from the OS buffer cache + // through to actual disk, if applicable. Setting Sync can result in + // slower writes. + // + // If false, and the machine crashes, then some recent writes may be lost. + // Note that if it is just the process that crashes (and the machine does + // not) then no writes will be lost. + // + // In other words, Sync being false has the same semantics as a write + // system call. Sync being true means write followed by fsync. + // + // The default value is false. + Sync bool +} + +func (wo *WriteOptions) GetNoWriteMerge() bool { + if wo == nil { + return false + } + return wo.NoWriteMerge +} + +func (wo *WriteOptions) GetSync() bool { + if wo == nil { + return false + } + return wo.Sync +} + +func GetStrict(o *Options, ro *ReadOptions, strict Strict) bool { + if ro.GetStrict(StrictOverride) { + return ro.GetStrict(strict) + } else { + return o.GetStrict(strict) || ro.GetStrict(strict) + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/options.go b/vendor/github.com/syndtr/goleveldb/leveldb/options.go new file mode 100644 index 000000000000..b072b1ac4c78 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/options.go @@ -0,0 +1,107 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/opt" +) + +func dupOptions(o *opt.Options) *opt.Options { + newo := &opt.Options{} + if o != nil { + *newo = *o + } + if newo.Strict == 0 { + newo.Strict = opt.DefaultStrict + } + return newo +} + +func (s *session) setOptions(o *opt.Options) { + no := dupOptions(o) + // Alternative filters. + if filters := o.GetAltFilters(); len(filters) > 0 { + no.AltFilters = make([]filter.Filter, len(filters)) + for i, filter := range filters { + no.AltFilters[i] = &iFilter{filter} + } + } + // Comparer. + s.icmp = &iComparer{o.GetComparer()} + no.Comparer = s.icmp + // Filter. 
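+	//
+	// Example (editor's illustration of how callers typically fill opt.Options;
+	// filter.NewBloomFilter is from the leveldb/filter package):
+	//
+	//	o := &opt.Options{
+	//		BlockCacheCapacity:     16 * opt.MiB,
+	//		WriteBuffer:            8 * opt.MiB,
+	//		OpenFilesCacheCapacity: 200,
+	//		Compression:            opt.SnappyCompression,
+	//		Filter:                 filter.NewBloomFilter(10),
+	//	}
+	//
+	// Unset fields fall back to the defaults through the getters in the opt
+	// package, so setOptions only normalizes what was actually provided.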
+ if filter := o.GetFilter(); filter != nil { + no.Filter = &iFilter{filter} + } + + s.o = &cachedOptions{Options: no} + s.o.cache() +} + +const optCachedLevel = 7 + +type cachedOptions struct { + *opt.Options + + compactionExpandLimit []int + compactionGPOverlaps []int + compactionSourceLimit []int + compactionTableSize []int + compactionTotalSize []int64 +} + +func (co *cachedOptions) cache() { + co.compactionExpandLimit = make([]int, optCachedLevel) + co.compactionGPOverlaps = make([]int, optCachedLevel) + co.compactionSourceLimit = make([]int, optCachedLevel) + co.compactionTableSize = make([]int, optCachedLevel) + co.compactionTotalSize = make([]int64, optCachedLevel) + + for level := 0; level < optCachedLevel; level++ { + co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level) + co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level) + co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level) + co.compactionTableSize[level] = co.Options.GetCompactionTableSize(level) + co.compactionTotalSize[level] = co.Options.GetCompactionTotalSize(level) + } +} + +func (co *cachedOptions) GetCompactionExpandLimit(level int) int { + if level < optCachedLevel { + return co.compactionExpandLimit[level] + } + return co.Options.GetCompactionExpandLimit(level) +} + +func (co *cachedOptions) GetCompactionGPOverlaps(level int) int { + if level < optCachedLevel { + return co.compactionGPOverlaps[level] + } + return co.Options.GetCompactionGPOverlaps(level) +} + +func (co *cachedOptions) GetCompactionSourceLimit(level int) int { + if level < optCachedLevel { + return co.compactionSourceLimit[level] + } + return co.Options.GetCompactionSourceLimit(level) +} + +func (co *cachedOptions) GetCompactionTableSize(level int) int { + if level < optCachedLevel { + return co.compactionTableSize[level] + } + return co.Options.GetCompactionTableSize(level) +} + +func (co *cachedOptions) GetCompactionTotalSize(level int) int64 { + if level < optCachedLevel { + return co.compactionTotalSize[level] + } + return co.Options.GetCompactionTotalSize(level) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/session.go b/vendor/github.com/syndtr/goleveldb/leveldb/session.go new file mode 100644 index 000000000000..3f391f934622 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/session.go @@ -0,0 +1,210 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "io" + "os" + "sync" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/journal" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +// ErrManifestCorrupted records manifest corruption. This error will be +// wrapped with errors.ErrCorrupted. +type ErrManifestCorrupted struct { + Field string + Reason string +} + +func (e *ErrManifestCorrupted) Error() string { + return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", e.Field, e.Reason) +} + +func newErrManifestCorrupted(fd storage.FileDesc, field, reason string) error { + return errors.NewErrCorrupted(fd, &ErrManifestCorrupted{field, reason}) +} + +// session represent a persistent database session. +type session struct { + // Need 64-bit alignment. 
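+	// (On 32-bit platforms sync/atomic's 64-bit operations require the operands
+	// to be 64-bit aligned, which Go only guarantees for the first word of an
+	// allocated struct; keeping these fields at the top preserves that.)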
+ stNextFileNum int64 // current unused file number + stJournalNum int64 // current journal file number; need external synchronization + stPrevJournalNum int64 // prev journal file number; no longer used; for compatibility with older version of leveldb + stTempFileNum int64 + stSeqNum uint64 // last mem compacted seq; need external synchronization + + stor *iStorage + storLock storage.Locker + o *cachedOptions + icmp *iComparer + tops *tOps + fileRef map[int64]int + + manifest *journal.Writer + manifestWriter storage.Writer + manifestFd storage.FileDesc + + stCompPtrs []internalKey // compaction pointers; need external synchronization + stVersion *version // current version + vmu sync.Mutex +} + +// Creates new initialized session instance. +func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) { + if stor == nil { + return nil, os.ErrInvalid + } + storLock, err := stor.Lock() + if err != nil { + return + } + s = &session{ + stor: newIStorage(stor), + storLock: storLock, + fileRef: make(map[int64]int), + } + s.setOptions(o) + s.tops = newTableOps(s) + s.setVersion(newVersion(s)) + s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock Ke·KeyError D·DroppedEntry L·Level Q·SeqNum T·TimeElapsed") + return +} + +// Close session. +func (s *session) close() { + s.tops.close() + if s.manifest != nil { + s.manifest.Close() + } + if s.manifestWriter != nil { + s.manifestWriter.Close() + } + s.manifest = nil + s.manifestWriter = nil + s.setVersion(&version{s: s, closing: true}) +} + +// Release session lock. +func (s *session) release() { + s.storLock.Unlock() +} + +// Create a new database session; need external synchronization. +func (s *session) create() error { + // create manifest + return s.newManifest(nil, nil) +} + +// Recover a database session; need external synchronization. +func (s *session) recover() (err error) { + defer func() { + if os.IsNotExist(err) { + // Don't return os.ErrNotExist if the underlying storage contains + // other files that belong to LevelDB. So the DB won't get trashed. + if fds, _ := s.stor.List(storage.TypeAll); len(fds) > 0 { + err = &errors.ErrCorrupted{Fd: storage.FileDesc{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}} + } + } + }() + + fd, err := s.stor.GetMeta() + if err != nil { + return + } + + reader, err := s.stor.Open(fd) + if err != nil { + return + } + defer reader.Close() + + var ( + // Options. 
+ strict = s.o.GetStrict(opt.StrictManifest) + + jr = journal.NewReader(reader, dropper{s, fd}, strict, true) + rec = &sessionRecord{} + staging = s.stVersion.newStaging() + ) + for { + var r io.Reader + r, err = jr.Next() + if err != nil { + if err == io.EOF { + err = nil + break + } + return errors.SetFd(err, fd) + } + + err = rec.decode(r) + if err == nil { + // save compact pointers + for _, r := range rec.compPtrs { + s.setCompPtr(r.level, internalKey(r.ikey)) + } + // commit record to version staging + staging.commit(rec) + } else { + err = errors.SetFd(err, fd) + if strict || !errors.IsCorrupted(err) { + return + } + s.logf("manifest error: %v (skipped)", errors.SetFd(err, fd)) + } + rec.resetCompPtrs() + rec.resetAddedTables() + rec.resetDeletedTables() + } + + switch { + case !rec.has(recComparer): + return newErrManifestCorrupted(fd, "comparer", "missing") + case rec.comparer != s.icmp.uName(): + return newErrManifestCorrupted(fd, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer)) + case !rec.has(recNextFileNum): + return newErrManifestCorrupted(fd, "next-file-num", "missing") + case !rec.has(recJournalNum): + return newErrManifestCorrupted(fd, "journal-file-num", "missing") + case !rec.has(recSeqNum): + return newErrManifestCorrupted(fd, "seq-num", "missing") + } + + s.manifestFd = fd + s.setVersion(staging.finish()) + s.setNextFileNum(rec.nextFileNum) + s.recordCommited(rec) + return nil +} + +// Commit session; need external synchronization. +func (s *session) commit(r *sessionRecord) (err error) { + v := s.version() + defer v.release() + + // spawn new version based on current version + nv := v.spawn(r) + + if s.manifest == nil { + // manifest journal writer not yet created, create one + err = s.newManifest(r, nv) + } else { + err = s.flushManifest(r) + } + + // finally, apply new version if no error rise + if err == nil { + s.setVersion(nv) + } + + return +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/session_compaction.go b/vendor/github.com/syndtr/goleveldb/leveldb/session_compaction.go new file mode 100644 index 000000000000..089cd00b26d5 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/session_compaction.go @@ -0,0 +1,302 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/opt" +) + +func (s *session) pickMemdbLevel(umin, umax []byte, maxLevel int) int { + v := s.version() + defer v.release() + return v.pickMemdbLevel(umin, umax, maxLevel) +} + +func (s *session) flushMemdb(rec *sessionRecord, mdb *memdb.DB, maxLevel int) (int, error) { + // Create sorted table. + iter := mdb.NewIterator(nil) + defer iter.Release() + t, n, err := s.tops.createFrom(iter) + if err != nil { + return 0, err + } + + // Pick level other than zero can cause compaction issue with large + // bulk insert and delete on strictly incrementing key-space. The + // problem is that the small deletion markers trapped at lower level, + // while key/value entries keep growing at higher level. Since the + // key-space is strictly incrementing it will not overlaps with + // higher level, thus maximum possible level is always picked, while + // overlapping deletion marker pushed into lower level. 
+ // See: https://github.com/syndtr/goleveldb/issues/127. + flushLevel := s.pickMemdbLevel(t.imin.ukey(), t.imax.ukey(), maxLevel) + rec.addTableFile(flushLevel, t) + + s.logf("memdb@flush created L%d@%d N·%d S·%s %q:%q", flushLevel, t.fd.Num, n, shortenb(int(t.size)), t.imin, t.imax) + return flushLevel, nil +} + +// Pick a compaction based on current state; need external synchronization. +func (s *session) pickCompaction() *compaction { + v := s.version() + + var sourceLevel int + var t0 tFiles + if v.cScore >= 1 { + sourceLevel = v.cLevel + cptr := s.getCompPtr(sourceLevel) + tables := v.levels[sourceLevel] + for _, t := range tables { + if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 { + t0 = append(t0, t) + break + } + } + if len(t0) == 0 { + t0 = append(t0, tables[0]) + } + } else { + if p := atomic.LoadPointer(&v.cSeek); p != nil { + ts := (*tSet)(p) + sourceLevel = ts.level + t0 = append(t0, ts.table) + } else { + v.release() + return nil + } + } + + return newCompaction(s, v, sourceLevel, t0) +} + +// Create compaction from given level and range; need external synchronization. +func (s *session) getCompactionRange(sourceLevel int, umin, umax []byte, noLimit bool) *compaction { + v := s.version() + + if sourceLevel >= len(v.levels) { + v.release() + return nil + } + + t0 := v.levels[sourceLevel].getOverlaps(nil, s.icmp, umin, umax, sourceLevel == 0) + if len(t0) == 0 { + v.release() + return nil + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if !noLimit && sourceLevel > 0 { + limit := int64(v.s.o.GetCompactionSourceLimit(sourceLevel)) + total := int64(0) + for i, t := range t0 { + total += t.size + if total >= limit { + s.logf("table@compaction limiting F·%d -> F·%d", len(t0), i+1) + t0 = t0[:i+1] + break + } + } + } + + return newCompaction(s, v, sourceLevel, t0) +} + +func newCompaction(s *session, v *version, sourceLevel int, t0 tFiles) *compaction { + c := &compaction{ + s: s, + v: v, + sourceLevel: sourceLevel, + levels: [2]tFiles{t0, nil}, + maxGPOverlaps: int64(s.o.GetCompactionGPOverlaps(sourceLevel)), + tPtrs: make([]int, len(v.levels)), + } + c.expand() + c.save() + return c +} + +// compaction represent a compaction state. +type compaction struct { + s *session + v *version + + sourceLevel int + levels [2]tFiles + maxGPOverlaps int64 + + gp tFiles + gpi int + seenKey bool + gpOverlappedBytes int64 + imin, imax internalKey + tPtrs []int + released bool + + snapGPI int + snapSeenKey bool + snapGPOverlappedBytes int64 + snapTPtrs []int +} + +func (c *compaction) save() { + c.snapGPI = c.gpi + c.snapSeenKey = c.seenKey + c.snapGPOverlappedBytes = c.gpOverlappedBytes + c.snapTPtrs = append(c.snapTPtrs[:0], c.tPtrs...) +} + +func (c *compaction) restore() { + c.gpi = c.snapGPI + c.seenKey = c.snapSeenKey + c.gpOverlappedBytes = c.snapGPOverlappedBytes + c.tPtrs = append(c.tPtrs[:0], c.snapTPtrs...) +} + +func (c *compaction) release() { + if !c.released { + c.released = true + c.v.release() + } +} + +// Expand compacted tables; need external synchronization. 
+func (c *compaction) expand() { + limit := int64(c.s.o.GetCompactionExpandLimit(c.sourceLevel)) + vt0 := c.v.levels[c.sourceLevel] + vt1 := tFiles{} + if level := c.sourceLevel + 1; level < len(c.v.levels) { + vt1 = c.v.levels[level] + } + + t0, t1 := c.levels[0], c.levels[1] + imin, imax := t0.getRange(c.s.icmp) + // We expand t0 here just incase ukey hop across tables. + t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.sourceLevel == 0) + if len(t0) != len(c.levels[0]) { + imin, imax = t0.getRange(c.s.icmp) + } + t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false) + // Get entire range covered by compaction. + amin, amax := append(t0, t1...).getRange(c.s.icmp) + + // See if we can grow the number of inputs in "sourceLevel" without + // changing the number of "sourceLevel+1" files we pick up. + if len(t1) > 0 { + exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.sourceLevel == 0) + if len(exp0) > len(t0) && t1.size()+exp0.size() < limit { + xmin, xmax := exp0.getRange(c.s.icmp) + exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false) + if len(exp1) == len(t1) { + c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)", + c.sourceLevel, c.sourceLevel+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())), + len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size()))) + imin, imax = xmin, xmax + t0, t1 = exp0, exp1 + amin, amax = append(t0, t1...).getRange(c.s.icmp) + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == sourceLevel+1; grandparent == sourceLevel+2) + if level := c.sourceLevel + 2; level < len(c.v.levels) { + c.gp = c.v.levels[level].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false) + } + + c.levels[0], c.levels[1] = t0, t1 + c.imin, c.imax = imin, imax +} + +// Check whether compaction is trivial. +func (c *compaction) trivial() bool { + return len(c.levels[0]) == 1 && len(c.levels[1]) == 0 && c.gp.size() <= c.maxGPOverlaps +} + +func (c *compaction) baseLevelForKey(ukey []byte) bool { + for level := c.sourceLevel + 2; level < len(c.v.levels); level++ { + tables := c.v.levels[level] + for c.tPtrs[level] < len(tables) { + t := tables[c.tPtrs[level]] + if c.s.icmp.uCompare(ukey, t.imax.ukey()) <= 0 { + // We've advanced far enough. + if c.s.icmp.uCompare(ukey, t.imin.ukey()) >= 0 { + // Key falls in this file's range, so definitely not base level. + return false + } + break + } + c.tPtrs[level]++ + } + } + return true +} + +func (c *compaction) shouldStopBefore(ikey internalKey) bool { + for ; c.gpi < len(c.gp); c.gpi++ { + gp := c.gp[c.gpi] + if c.s.icmp.Compare(ikey, gp.imax) <= 0 { + break + } + if c.seenKey { + c.gpOverlappedBytes += gp.size + } + } + c.seenKey = true + + if c.gpOverlappedBytes > c.maxGPOverlaps { + // Too much overlap for current output; start new output. + c.gpOverlappedBytes = 0 + return true + } + return false +} + +// Creates an iterator. +func (c *compaction) newIterator() iterator.Iterator { + // Creates iterator slice. + icap := len(c.levels) + if c.sourceLevel == 0 { + // Special case for level-0. + icap = len(c.levels[0]) + 1 + } + its := make([]iterator.Iterator, 0, icap) + + // Options. 
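+	// DontFillCache keeps blocks read during compaction out of the block cache,
+	// and StrictOverride makes this per-read strict level replace the global
+	// one; StrictReader is then added only when StrictCompaction is configured
+	// (editorial note, derived from the opt flag documentation).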
+ ro := &opt.ReadOptions{ + DontFillCache: true, + Strict: opt.StrictOverride, + } + strict := c.s.o.GetStrict(opt.StrictCompaction) + if strict { + ro.Strict |= opt.StrictReader + } + + for i, tables := range c.levels { + if len(tables) == 0 { + continue + } + + // Level-0 is not sorted and may overlaps each other. + if c.sourceLevel+i == 0 { + for _, t := range tables { + its = append(its, c.s.tops.newIterator(t, nil, ro)) + } + } else { + it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict) + its = append(its, it) + } + } + + return iterator.NewMergedIterator(its, c.s.icmp, strict) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/session_record.go b/vendor/github.com/syndtr/goleveldb/leveldb/session_record.go new file mode 100644 index 000000000000..854e1aa6f9b4 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/session_record.go @@ -0,0 +1,323 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "bufio" + "encoding/binary" + "io" + "strings" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +type byteReader interface { + io.Reader + io.ByteReader +} + +// These numbers are written to disk and should not be changed. +const ( + recComparer = 1 + recJournalNum = 2 + recNextFileNum = 3 + recSeqNum = 4 + recCompPtr = 5 + recDelTable = 6 + recAddTable = 7 + // 8 was used for large value refs + recPrevJournalNum = 9 +) + +type cpRecord struct { + level int + ikey internalKey +} + +type atRecord struct { + level int + num int64 + size int64 + imin internalKey + imax internalKey +} + +type dtRecord struct { + level int + num int64 +} + +type sessionRecord struct { + hasRec int + comparer string + journalNum int64 + prevJournalNum int64 + nextFileNum int64 + seqNum uint64 + compPtrs []cpRecord + addedTables []atRecord + deletedTables []dtRecord + + scratch [binary.MaxVarintLen64]byte + err error +} + +func (p *sessionRecord) has(rec int) bool { + return p.hasRec&(1< +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/journal" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +// Logging. + +type dropper struct { + s *session + fd storage.FileDesc +} + +func (d dropper) Drop(err error) { + if e, ok := err.(*journal.ErrCorrupted); ok { + d.s.logf("journal@drop %s-%d S·%s %q", d.fd.Type, d.fd.Num, shortenb(e.Size), e.Reason) + } else { + d.s.logf("journal@drop %s-%d %q", d.fd.Type, d.fd.Num, err) + } +} + +func (s *session) log(v ...interface{}) { s.stor.Log(fmt.Sprint(v...)) } +func (s *session) logf(format string, v ...interface{}) { s.stor.Log(fmt.Sprintf(format, v...)) } + +// File utils. + +func (s *session) newTemp() storage.FileDesc { + num := atomic.AddInt64(&s.stTempFileNum, 1) - 1 + return storage.FileDesc{Type: storage.TypeTemp, Num: num} +} + +func (s *session) addFileRef(fd storage.FileDesc, ref int) int { + ref += s.fileRef[fd.Num] + if ref > 0 { + s.fileRef[fd.Num] = ref + } else if ref == 0 { + delete(s.fileRef, fd.Num) + } else { + panic(fmt.Sprintf("negative ref: %v", fd)) + } + return ref +} + +// Session state. + +// Get current version. This will incr version ref, must call +// version.release (exactly once) after use. 
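+// Holding a version reference also keeps the table files it points to from
+// being removed as obsolete.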
+func (s *session) version() *version { + s.vmu.Lock() + defer s.vmu.Unlock() + s.stVersion.incref() + return s.stVersion +} + +func (s *session) tLen(level int) int { + s.vmu.Lock() + defer s.vmu.Unlock() + return s.stVersion.tLen(level) +} + +// Set current version to v. +func (s *session) setVersion(v *version) { + s.vmu.Lock() + defer s.vmu.Unlock() + // Hold by session. It is important to call this first before releasing + // current version, otherwise the still used files might get released. + v.incref() + if s.stVersion != nil { + // Release current version. + s.stVersion.releaseNB() + } + s.stVersion = v +} + +// Get current unused file number. +func (s *session) nextFileNum() int64 { + return atomic.LoadInt64(&s.stNextFileNum) +} + +// Set current unused file number to num. +func (s *session) setNextFileNum(num int64) { + atomic.StoreInt64(&s.stNextFileNum, num) +} + +// Mark file number as used. +func (s *session) markFileNum(num int64) { + nextFileNum := num + 1 + for { + old, x := s.stNextFileNum, nextFileNum + if old > x { + x = old + } + if atomic.CompareAndSwapInt64(&s.stNextFileNum, old, x) { + break + } + } +} + +// Allocate a file number. +func (s *session) allocFileNum() int64 { + return atomic.AddInt64(&s.stNextFileNum, 1) - 1 +} + +// Reuse given file number. +func (s *session) reuseFileNum(num int64) { + for { + old, x := s.stNextFileNum, num + if old != x+1 { + x = old + } + if atomic.CompareAndSwapInt64(&s.stNextFileNum, old, x) { + break + } + } +} + +// Set compaction ptr at given level; need external synchronization. +func (s *session) setCompPtr(level int, ik internalKey) { + if level >= len(s.stCompPtrs) { + newCompPtrs := make([]internalKey, level+1) + copy(newCompPtrs, s.stCompPtrs) + s.stCompPtrs = newCompPtrs + } + s.stCompPtrs[level] = append(internalKey{}, ik...) +} + +// Get compaction ptr at given level; need external synchronization. +func (s *session) getCompPtr(level int) internalKey { + if level >= len(s.stCompPtrs) { + return nil + } + return s.stCompPtrs[level] +} + +// Manifest related utils. + +// Fill given session record obj with current states; need external +// synchronization. +func (s *session) fillRecord(r *sessionRecord, snapshot bool) { + r.setNextFileNum(s.nextFileNum()) + + if snapshot { + if !r.has(recJournalNum) { + r.setJournalNum(s.stJournalNum) + } + + if !r.has(recSeqNum) { + r.setSeqNum(s.stSeqNum) + } + + for level, ik := range s.stCompPtrs { + if ik != nil { + r.addCompPtr(level, ik) + } + } + + r.setComparer(s.icmp.uName()) + } +} + +// Mark if record has been committed, this will update session state; +// need external synchronization. +func (s *session) recordCommited(rec *sessionRecord) { + if rec.has(recJournalNum) { + s.stJournalNum = rec.journalNum + } + + if rec.has(recPrevJournalNum) { + s.stPrevJournalNum = rec.prevJournalNum + } + + if rec.has(recSeqNum) { + s.stSeqNum = rec.seqNum + } + + for _, r := range rec.compPtrs { + s.setCompPtr(r.level, internalKey(r.ikey)) + } +} + +// Create a new manifest file; need external synchronization. 
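+// On success the previous manifest is closed and removed; on error the new
+// file is removed and its file number reused.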
+func (s *session) newManifest(rec *sessionRecord, v *version) (err error) { + fd := storage.FileDesc{Type: storage.TypeManifest, Num: s.allocFileNum()} + writer, err := s.stor.Create(fd) + if err != nil { + return + } + jw := journal.NewWriter(writer) + + if v == nil { + v = s.version() + defer v.release() + } + if rec == nil { + rec = &sessionRecord{} + } + s.fillRecord(rec, true) + v.fillRecord(rec) + + defer func() { + if err == nil { + s.recordCommited(rec) + if s.manifest != nil { + s.manifest.Close() + } + if s.manifestWriter != nil { + s.manifestWriter.Close() + } + if !s.manifestFd.Zero() { + s.stor.Remove(s.manifestFd) + } + s.manifestFd = fd + s.manifestWriter = writer + s.manifest = jw + } else { + writer.Close() + s.stor.Remove(fd) + s.reuseFileNum(fd.Num) + } + }() + + w, err := jw.Next() + if err != nil { + return + } + err = rec.encode(w) + if err != nil { + return + } + err = jw.Flush() + if err != nil { + return + } + err = s.stor.SetMeta(fd) + return +} + +// Flush record to disk. +func (s *session) flushManifest(rec *sessionRecord) (err error) { + s.fillRecord(rec, false) + w, err := s.manifest.Next() + if err != nil { + return + } + err = rec.encode(w) + if err != nil { + return + } + err = s.manifest.Flush() + if err != nil { + return + } + if !s.o.GetNoSync() { + err = s.manifestWriter.Sync() + if err != nil { + return + } + } + s.recordCommited(rec) + return +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage.go new file mode 100644 index 000000000000..d45fb5dfeb14 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage.go @@ -0,0 +1,63 @@ +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/storage" + "sync/atomic" +) + +type iStorage struct { + storage.Storage + read uint64 + write uint64 +} + +func (c *iStorage) Open(fd storage.FileDesc) (storage.Reader, error) { + r, err := c.Storage.Open(fd) + return &iStorageReader{r, c}, err +} + +func (c *iStorage) Create(fd storage.FileDesc) (storage.Writer, error) { + w, err := c.Storage.Create(fd) + return &iStorageWriter{w, c}, err +} + +func (c *iStorage) reads() uint64 { + return atomic.LoadUint64(&c.read) +} + +func (c *iStorage) writes() uint64 { + return atomic.LoadUint64(&c.write) +} + +// newIStorage returns the given storage wrapped by iStorage. +func newIStorage(s storage.Storage) *iStorage { + return &iStorage{s, 0, 0} +} + +type iStorageReader struct { + storage.Reader + c *iStorage +} + +func (r *iStorageReader) Read(p []byte) (n int, err error) { + n, err = r.Reader.Read(p) + atomic.AddUint64(&r.c.read, uint64(n)) + return n, err +} + +func (r *iStorageReader) ReadAt(p []byte, off int64) (n int, err error) { + n, err = r.Reader.ReadAt(p, off) + atomic.AddUint64(&r.c.read, uint64(n)) + return n, err +} + +type iStorageWriter struct { + storage.Writer + c *iStorage +} + +func (w *iStorageWriter) Write(p []byte) (n int, err error) { + n, err = w.Writer.Write(p) + atomic.AddUint64(&w.c.write, uint64(n)) + return n, err +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go new file mode 100644 index 000000000000..9ba71fd6d107 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go @@ -0,0 +1,671 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reservefs. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package storage + +import ( + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "runtime" + "sort" + "strconv" + "strings" + "sync" + "time" +) + +var ( + errFileOpen = errors.New("leveldb/storage: file still open") + errReadOnly = errors.New("leveldb/storage: storage is read-only") +) + +type fileLock interface { + release() error +} + +type fileStorageLock struct { + fs *fileStorage +} + +func (lock *fileStorageLock) Unlock() { + if lock.fs != nil { + lock.fs.mu.Lock() + defer lock.fs.mu.Unlock() + if lock.fs.slock == lock { + lock.fs.slock = nil + } + } +} + +type int64Slice []int64 + +func (p int64Slice) Len() int { return len(p) } +func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] } +func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +func writeFileSynced(filename string, data []byte, perm os.FileMode) error { + f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm) + if err != nil { + return err + } + n, err := f.Write(data) + if err == nil && n < len(data) { + err = io.ErrShortWrite + } + if err1 := f.Sync(); err == nil { + err = err1 + } + if err1 := f.Close(); err == nil { + err = err1 + } + return err +} + +const logSizeThreshold = 1024 * 1024 // 1 MiB + +// fileStorage is a file-system backed storage. +type fileStorage struct { + path string + readOnly bool + + mu sync.Mutex + flock fileLock + slock *fileStorageLock + logw *os.File + logSize int64 + buf []byte + // Opened file counter; if open < 0 means closed. + open int + day int +} + +// OpenFile returns a new filesystem-backed storage implementation with the given +// path. This also acquire a file lock, so any subsequent attempt to open the +// same path will fail. +// +// The storage must be closed after use, by calling Close method. +func OpenFile(path string, readOnly bool) (Storage, error) { + if fi, err := os.Stat(path); err == nil { + if !fi.IsDir() { + return nil, fmt.Errorf("leveldb/storage: open %s: not a directory", path) + } + } else if os.IsNotExist(err) && !readOnly { + if err := os.MkdirAll(path, 0755); err != nil { + return nil, err + } + } else { + return nil, err + } + + flock, err := newFileLock(filepath.Join(path, "LOCK"), readOnly) + if err != nil { + return nil, err + } + + defer func() { + if err != nil { + flock.release() + } + }() + + var ( + logw *os.File + logSize int64 + ) + if !readOnly { + logw, err = os.OpenFile(filepath.Join(path, "LOG"), os.O_WRONLY|os.O_CREATE, 0644) + if err != nil { + return nil, err + } + logSize, err = logw.Seek(0, os.SEEK_END) + if err != nil { + logw.Close() + return nil, err + } + } + + fs := &fileStorage{ + path: path, + readOnly: readOnly, + flock: flock, + logw: logw, + logSize: logSize, + } + runtime.SetFinalizer(fs, (*fileStorage).Close) + return fs, nil +} + +func (fs *fileStorage) Lock() (Locker, error) { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return nil, ErrClosed + } + if fs.readOnly { + return &fileStorageLock{}, nil + } + if fs.slock != nil { + return nil, ErrLocked + } + fs.slock = &fileStorageLock{fs: fs} + return fs.slock, nil +} + +func itoa(buf []byte, i int, wid int) []byte { + u := uint(i) + if u == 0 && wid <= 1 { + return append(buf, '0') + } + + // Assemble decimal in reverse order. + var b [32]byte + bp := len(b) + for ; u > 0 || wid > 0; u /= 10 { + bp-- + wid-- + b[bp] = byte(u%10) + '0' + } + return append(buf, b[bp:]...) 
+} + +func (fs *fileStorage) printDay(t time.Time) { + if fs.day == t.Day() { + return + } + fs.day = t.Day() + fs.logw.Write([]byte("=============== " + t.Format("Jan 2, 2006 (MST)") + " ===============\n")) +} + +func (fs *fileStorage) doLog(t time.Time, str string) { + if fs.logSize > logSizeThreshold { + // Rotate log file. + fs.logw.Close() + fs.logw = nil + fs.logSize = 0 + rename(filepath.Join(fs.path, "LOG"), filepath.Join(fs.path, "LOG.old")) + } + if fs.logw == nil { + var err error + fs.logw, err = os.OpenFile(filepath.Join(fs.path, "LOG"), os.O_WRONLY|os.O_CREATE, 0644) + if err != nil { + return + } + // Force printDay on new log file. + fs.day = 0 + } + fs.printDay(t) + hour, min, sec := t.Clock() + msec := t.Nanosecond() / 1e3 + // time + fs.buf = itoa(fs.buf[:0], hour, 2) + fs.buf = append(fs.buf, ':') + fs.buf = itoa(fs.buf, min, 2) + fs.buf = append(fs.buf, ':') + fs.buf = itoa(fs.buf, sec, 2) + fs.buf = append(fs.buf, '.') + fs.buf = itoa(fs.buf, msec, 6) + fs.buf = append(fs.buf, ' ') + // write + fs.buf = append(fs.buf, []byte(str)...) + fs.buf = append(fs.buf, '\n') + n, _ := fs.logw.Write(fs.buf) + fs.logSize += int64(n) +} + +func (fs *fileStorage) Log(str string) { + if !fs.readOnly { + t := time.Now() + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return + } + fs.doLog(t, str) + } +} + +func (fs *fileStorage) log(str string) { + if !fs.readOnly { + fs.doLog(time.Now(), str) + } +} + +func (fs *fileStorage) setMeta(fd FileDesc) error { + content := fsGenName(fd) + "\n" + // Check and backup old CURRENT file. + currentPath := filepath.Join(fs.path, "CURRENT") + if _, err := os.Stat(currentPath); err == nil { + b, err := ioutil.ReadFile(currentPath) + if err != nil { + fs.log(fmt.Sprintf("backup CURRENT: %v", err)) + return err + } + if string(b) == content { + // Content not changed, do nothing. + return nil + } + if err := writeFileSynced(currentPath+".bak", b, 0644); err != nil { + fs.log(fmt.Sprintf("backup CURRENT: %v", err)) + return err + } + } else if !os.IsNotExist(err) { + return err + } + path := fmt.Sprintf("%s.%d", filepath.Join(fs.path, "CURRENT"), fd.Num) + if err := writeFileSynced(path, []byte(content), 0644); err != nil { + fs.log(fmt.Sprintf("create CURRENT.%d: %v", fd.Num, err)) + return err + } + // Replace CURRENT file. + if err := rename(path, currentPath); err != nil { + fs.log(fmt.Sprintf("rename CURRENT.%d: %v", fd.Num, err)) + return err + } + // Sync root directory. + if err := syncDir(fs.path); err != nil { + fs.log(fmt.Sprintf("syncDir: %v", err)) + return err + } + return nil +} + +func (fs *fileStorage) SetMeta(fd FileDesc) error { + if !FileDescOk(fd) { + return ErrInvalidFile + } + if fs.readOnly { + return errReadOnly + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return ErrClosed + } + return fs.setMeta(fd) +} + +func (fs *fileStorage) GetMeta() (FileDesc, error) { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return FileDesc{}, ErrClosed + } + dir, err := os.Open(fs.path) + if err != nil { + return FileDesc{}, err + } + names, err := dir.Readdirnames(0) + // Close the dir first before checking for Readdirnames error. + if ce := dir.Close(); ce != nil { + fs.log(fmt.Sprintf("close dir: %v", ce)) + } + if err != nil { + return FileDesc{}, err + } + // Try this in order: + // - CURRENT.[0-9]+ ('pending rename' file, descending order) + // - CURRENT + // - CURRENT.bak + // + // Skip corrupted file or file that point to a missing target file. 
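+ // A CURRENT.N file can be left behind if a crash happened after writing it
+ // but before renaming it over CURRENT; the newest usable descriptor wins.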
+ type currentFile struct { + name string + fd FileDesc + } + tryCurrent := func(name string) (*currentFile, error) { + b, err := ioutil.ReadFile(filepath.Join(fs.path, name)) + if err != nil { + if os.IsNotExist(err) { + err = os.ErrNotExist + } + return nil, err + } + var fd FileDesc + if len(b) < 1 || b[len(b)-1] != '\n' || !fsParseNamePtr(string(b[:len(b)-1]), &fd) { + fs.log(fmt.Sprintf("%s: corrupted content: %q", name, b)) + err := &ErrCorrupted{ + Err: errors.New("leveldb/storage: corrupted or incomplete CURRENT file"), + } + return nil, err + } + if _, err := os.Stat(filepath.Join(fs.path, fsGenName(fd))); err != nil { + if os.IsNotExist(err) { + fs.log(fmt.Sprintf("%s: missing target file: %s", name, fd)) + err = os.ErrNotExist + } + return nil, err + } + return ¤tFile{name: name, fd: fd}, nil + } + tryCurrents := func(names []string) (*currentFile, error) { + var ( + cur *currentFile + // Last corruption error. + lastCerr error + ) + for _, name := range names { + var err error + cur, err = tryCurrent(name) + if err == nil { + break + } else if err == os.ErrNotExist { + // Fallback to the next file. + } else if isCorrupted(err) { + lastCerr = err + // Fallback to the next file. + } else { + // In case the error is due to permission, etc. + return nil, err + } + } + if cur == nil { + err := os.ErrNotExist + if lastCerr != nil { + err = lastCerr + } + return nil, err + } + return cur, nil + } + + // Try 'pending rename' files. + var nums []int64 + for _, name := range names { + if strings.HasPrefix(name, "CURRENT.") && name != "CURRENT.bak" { + i, err := strconv.ParseInt(name[8:], 10, 64) + if err == nil { + nums = append(nums, i) + } + } + } + var ( + pendCur *currentFile + pendErr = os.ErrNotExist + pendNames []string + ) + if len(nums) > 0 { + sort.Sort(sort.Reverse(int64Slice(nums))) + pendNames = make([]string, len(nums)) + for i, num := range nums { + pendNames[i] = fmt.Sprintf("CURRENT.%d", num) + } + pendCur, pendErr = tryCurrents(pendNames) + if pendErr != nil && pendErr != os.ErrNotExist && !isCorrupted(pendErr) { + return FileDesc{}, pendErr + } + } + + // Try CURRENT and CURRENT.bak. + curCur, curErr := tryCurrents([]string{"CURRENT", "CURRENT.bak"}) + if curErr != nil && curErr != os.ErrNotExist && !isCorrupted(curErr) { + return FileDesc{}, curErr + } + + // pendCur takes precedence, but guards against obsolete pendCur. + if pendCur != nil && (curCur == nil || pendCur.fd.Num > curCur.fd.Num) { + curCur = pendCur + } + + if curCur != nil { + // Restore CURRENT file to proper state. + if !fs.readOnly && (curCur.name != "CURRENT" || len(pendNames) != 0) { + // Ignore setMeta errors, however don't delete obsolete files if we + // catch error. + if err := fs.setMeta(curCur.fd); err == nil { + // Remove 'pending rename' files. + for _, name := range pendNames { + if err := os.Remove(filepath.Join(fs.path, name)); err != nil { + fs.log(fmt.Sprintf("remove %s: %v", name, err)) + } + } + } + } + return curCur.fd, nil + } + + // Nothing found. + if isCorrupted(pendErr) { + return FileDesc{}, pendErr + } + return FileDesc{}, curErr +} + +func (fs *fileStorage) List(ft FileType) (fds []FileDesc, err error) { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return nil, ErrClosed + } + dir, err := os.Open(fs.path) + if err != nil { + return + } + names, err := dir.Readdirnames(0) + // Close the dir first before checking for Readdirnames error. 
+ if cerr := dir.Close(); cerr != nil { + fs.log(fmt.Sprintf("close dir: %v", cerr)) + } + if err == nil { + for _, name := range names { + if fd, ok := fsParseName(name); ok && fd.Type&ft != 0 { + fds = append(fds, fd) + } + } + } + return +} + +func (fs *fileStorage) Open(fd FileDesc) (Reader, error) { + if !FileDescOk(fd) { + return nil, ErrInvalidFile + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return nil, ErrClosed + } + of, err := os.OpenFile(filepath.Join(fs.path, fsGenName(fd)), os.O_RDONLY, 0) + if err != nil { + if fsHasOldName(fd) && os.IsNotExist(err) { + of, err = os.OpenFile(filepath.Join(fs.path, fsGenOldName(fd)), os.O_RDONLY, 0) + if err == nil { + goto ok + } + } + return nil, err + } +ok: + fs.open++ + return &fileWrap{File: of, fs: fs, fd: fd}, nil +} + +func (fs *fileStorage) Create(fd FileDesc) (Writer, error) { + if !FileDescOk(fd) { + return nil, ErrInvalidFile + } + if fs.readOnly { + return nil, errReadOnly + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return nil, ErrClosed + } + of, err := os.OpenFile(filepath.Join(fs.path, fsGenName(fd)), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + return nil, err + } + fs.open++ + return &fileWrap{File: of, fs: fs, fd: fd}, nil +} + +func (fs *fileStorage) Remove(fd FileDesc) error { + if !FileDescOk(fd) { + return ErrInvalidFile + } + if fs.readOnly { + return errReadOnly + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return ErrClosed + } + err := os.Remove(filepath.Join(fs.path, fsGenName(fd))) + if err != nil { + if fsHasOldName(fd) && os.IsNotExist(err) { + if e1 := os.Remove(filepath.Join(fs.path, fsGenOldName(fd))); !os.IsNotExist(e1) { + fs.log(fmt.Sprintf("remove %s: %v (old name)", fd, err)) + err = e1 + } + } else { + fs.log(fmt.Sprintf("remove %s: %v", fd, err)) + } + } + return err +} + +func (fs *fileStorage) Rename(oldfd, newfd FileDesc) error { + if !FileDescOk(oldfd) || !FileDescOk(newfd) { + return ErrInvalidFile + } + if oldfd == newfd { + return nil + } + if fs.readOnly { + return errReadOnly + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return ErrClosed + } + return rename(filepath.Join(fs.path, fsGenName(oldfd)), filepath.Join(fs.path, fsGenName(newfd))) +} + +func (fs *fileStorage) Close() error { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return ErrClosed + } + // Clear the finalizer. + runtime.SetFinalizer(fs, nil) + + if fs.open > 0 { + fs.log(fmt.Sprintf("close: warning, %d files still open", fs.open)) + } + fs.open = -1 + if fs.logw != nil { + fs.logw.Close() + } + return fs.flock.release() +} + +type fileWrap struct { + *os.File + fs *fileStorage + fd FileDesc + closed bool +} + +func (fw *fileWrap) Sync() error { + if err := fw.File.Sync(); err != nil { + return err + } + if fw.fd.Type == TypeManifest { + // Also sync parent directory if file type is manifest. + // See: https://code.google.com/p/leveldb/issues/detail?id=190. 
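+ // (Creating a file does not durably record its directory entry until the
+ // directory itself has been fsynced.)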
+ if err := syncDir(fw.fs.path); err != nil { + fw.fs.log(fmt.Sprintf("syncDir: %v", err)) + return err + } + } + return nil +} + +func (fw *fileWrap) Close() error { + fw.fs.mu.Lock() + defer fw.fs.mu.Unlock() + if fw.closed { + return ErrClosed + } + fw.closed = true + fw.fs.open-- + err := fw.File.Close() + if err != nil { + fw.fs.log(fmt.Sprintf("close %s: %v", fw.fd, err)) + } + return err +} + +func fsGenName(fd FileDesc) string { + switch fd.Type { + case TypeManifest: + return fmt.Sprintf("MANIFEST-%06d", fd.Num) + case TypeJournal: + return fmt.Sprintf("%06d.log", fd.Num) + case TypeTable: + return fmt.Sprintf("%06d.ldb", fd.Num) + case TypeTemp: + return fmt.Sprintf("%06d.tmp", fd.Num) + default: + panic("invalid file type") + } +} + +func fsHasOldName(fd FileDesc) bool { + return fd.Type == TypeTable +} + +func fsGenOldName(fd FileDesc) string { + switch fd.Type { + case TypeTable: + return fmt.Sprintf("%06d.sst", fd.Num) + } + return fsGenName(fd) +} + +func fsParseName(name string) (fd FileDesc, ok bool) { + var tail string + _, err := fmt.Sscanf(name, "%d.%s", &fd.Num, &tail) + if err == nil { + switch tail { + case "log": + fd.Type = TypeJournal + case "ldb", "sst": + fd.Type = TypeTable + case "tmp": + fd.Type = TypeTemp + default: + return + } + return fd, true + } + n, _ := fmt.Sscanf(name, "MANIFEST-%d%s", &fd.Num, &tail) + if n == 1 { + fd.Type = TypeManifest + return fd, true + } + return +} + +func fsParseNamePtr(name string, fd *FileDesc) bool { + _fd, ok := fsParseName(name) + if fd != nil { + *fd = _fd + } + return ok +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_nacl.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_nacl.go new file mode 100644 index 000000000000..5545aeef2a86 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_nacl.go @@ -0,0 +1,34 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build nacl + +package storage + +import ( + "os" + "syscall" +) + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + return nil, syscall.ENOTSUP +} + +func setFileLock(f *os.File, readOnly, lock bool) error { + return syscall.ENOTSUP +} + +func rename(oldpath, newpath string) error { + return syscall.ENOTSUP +} + +func isErrInvalid(err error) bool { + return false +} + +func syncDir(name string) error { + return syscall.ENOTSUP +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_plan9.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_plan9.go new file mode 100644 index 000000000000..b8297980126d --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_plan9.go @@ -0,0 +1,63 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package storage + +import ( + "os" +) + +type plan9FileLock struct { + f *os.File +} + +func (fl *plan9FileLock) release() error { + return fl.f.Close() +} + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + var ( + flag int + perm os.FileMode + ) + if readOnly { + flag = os.O_RDONLY + } else { + flag = os.O_RDWR + perm = os.ModeExclusive + } + f, err := os.OpenFile(path, flag, perm) + if os.IsNotExist(err) { + f, err = os.OpenFile(path, flag|os.O_CREATE, perm|0644) + } + if err != nil { + return + } + fl = &plan9FileLock{f: f} + return +} + +func rename(oldpath, newpath string) error { + if _, err := os.Stat(newpath); err == nil { + if err := os.Remove(newpath); err != nil { + return err + } + } + + return os.Rename(oldpath, newpath) +} + +func syncDir(name string) error { + f, err := os.Open(name) + if err != nil { + return err + } + defer f.Close() + if err := f.Sync(); err != nil { + return err + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go new file mode 100644 index 000000000000..79901ee4a7a3 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go @@ -0,0 +1,81 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build solaris + +package storage + +import ( + "os" + "syscall" +) + +type unixFileLock struct { + f *os.File +} + +func (fl *unixFileLock) release() error { + if err := setFileLock(fl.f, false, false); err != nil { + return err + } + return fl.f.Close() +} + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + var flag int + if readOnly { + flag = os.O_RDONLY + } else { + flag = os.O_RDWR + } + f, err := os.OpenFile(path, flag, 0) + if os.IsNotExist(err) { + f, err = os.OpenFile(path, flag|os.O_CREATE, 0644) + } + if err != nil { + return + } + err = setFileLock(f, readOnly, true) + if err != nil { + f.Close() + return + } + fl = &unixFileLock{f: f} + return +} + +func setFileLock(f *os.File, readOnly, lock bool) error { + flock := syscall.Flock_t{ + Type: syscall.F_UNLCK, + Start: 0, + Len: 0, + Whence: 1, + } + if lock { + if readOnly { + flock.Type = syscall.F_RDLCK + } else { + flock.Type = syscall.F_WRLCK + } + } + return syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &flock) +} + +func rename(oldpath, newpath string) error { + return os.Rename(oldpath, newpath) +} + +func syncDir(name string) error { + f, err := os.Open(name) + if err != nil { + return err + } + defer f.Close() + if err := f.Sync(); err != nil { + return err + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go new file mode 100644 index 000000000000..d75f66a9efc6 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go @@ -0,0 +1,98 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +// +build darwin dragonfly freebsd linux netbsd openbsd + +package storage + +import ( + "os" + "syscall" +) + +type unixFileLock struct { + f *os.File +} + +func (fl *unixFileLock) release() error { + if err := setFileLock(fl.f, false, false); err != nil { + return err + } + return fl.f.Close() +} + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + var flag int + if readOnly { + flag = os.O_RDONLY + } else { + flag = os.O_RDWR + } + f, err := os.OpenFile(path, flag, 0) + if os.IsNotExist(err) { + f, err = os.OpenFile(path, flag|os.O_CREATE, 0644) + } + if err != nil { + return + } + err = setFileLock(f, readOnly, true) + if err != nil { + f.Close() + return + } + fl = &unixFileLock{f: f} + return +} + +func setFileLock(f *os.File, readOnly, lock bool) error { + how := syscall.LOCK_UN + if lock { + if readOnly { + how = syscall.LOCK_SH + } else { + how = syscall.LOCK_EX + } + } + return syscall.Flock(int(f.Fd()), how|syscall.LOCK_NB) +} + +func rename(oldpath, newpath string) error { + return os.Rename(oldpath, newpath) +} + +func isErrInvalid(err error) bool { + if err == os.ErrInvalid { + return true + } + // Go < 1.8 + if syserr, ok := err.(*os.SyscallError); ok && syserr.Err == syscall.EINVAL { + return true + } + // Go >= 1.8 returns *os.PathError instead + if patherr, ok := err.(*os.PathError); ok && patherr.Err == syscall.EINVAL { + return true + } + return false +} + +func syncDir(name string) error { + // As per fsync manpage, Linux seems to expect fsync on directory, however + // some system don't support this, so we will ignore syscall.EINVAL. + // + // From fsync(2): + // Calling fsync() does not necessarily ensure that the entry in the + // directory containing the file has also reached disk. For that an + // explicit fsync() on a file descriptor for the directory is also needed. + f, err := os.Open(name) + if err != nil { + return err + } + defer f.Close() + if err := f.Sync(); err != nil && !isErrInvalid(err) { + return err + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_windows.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_windows.go new file mode 100644 index 000000000000..899335fd7e4e --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_windows.go @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package storage + +import ( + "syscall" + "unsafe" +) + +var ( + modkernel32 = syscall.NewLazyDLL("kernel32.dll") + + procMoveFileExW = modkernel32.NewProc("MoveFileExW") +) + +const ( + _MOVEFILE_REPLACE_EXISTING = 1 +) + +type windowsFileLock struct { + fd syscall.Handle +} + +func (fl *windowsFileLock) release() error { + return syscall.Close(fl.fd) +} + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + pathp, err := syscall.UTF16PtrFromString(path) + if err != nil { + return + } + var access, shareMode uint32 + if readOnly { + access = syscall.GENERIC_READ + shareMode = syscall.FILE_SHARE_READ + } else { + access = syscall.GENERIC_READ | syscall.GENERIC_WRITE + } + fd, err := syscall.CreateFile(pathp, access, shareMode, nil, syscall.OPEN_EXISTING, syscall.FILE_ATTRIBUTE_NORMAL, 0) + if err == syscall.ERROR_FILE_NOT_FOUND { + fd, err = syscall.CreateFile(pathp, access, shareMode, nil, syscall.OPEN_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0) + } + if err != nil { + return + } + fl = &windowsFileLock{fd: fd} + return +} + +func moveFileEx(from *uint16, to *uint16, flags uint32) error { + r1, _, e1 := syscall.Syscall(procMoveFileExW.Addr(), 3, uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(to)), uintptr(flags)) + if r1 == 0 { + if e1 != 0 { + return error(e1) + } + return syscall.EINVAL + } + return nil +} + +func rename(oldpath, newpath string) error { + from, err := syscall.UTF16PtrFromString(oldpath) + if err != nil { + return err + } + to, err := syscall.UTF16PtrFromString(newpath) + if err != nil { + return err + } + return moveFileEx(from, to, _MOVEFILE_REPLACE_EXISTING) +} + +func syncDir(name string) error { return nil } diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/mem_storage.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/mem_storage.go new file mode 100644 index 000000000000..838f1bee1ba7 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/mem_storage.go @@ -0,0 +1,222 @@ +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package storage + +import ( + "bytes" + "os" + "sync" +) + +const typeShift = 4 + +// Verify at compile-time that typeShift is large enough to cover all FileType +// values by confirming that 0 == 0. +var _ [0]struct{} = [TypeAll >> typeShift]struct{}{} + +type memStorageLock struct { + ms *memStorage +} + +func (lock *memStorageLock) Unlock() { + ms := lock.ms + ms.mu.Lock() + defer ms.mu.Unlock() + if ms.slock == lock { + ms.slock = nil + } + return +} + +// memStorage is a memory-backed storage. +type memStorage struct { + mu sync.Mutex + slock *memStorageLock + files map[uint64]*memFile + meta FileDesc +} + +// NewMemStorage returns a new memory-backed storage implementation. 
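+// All data is kept in process memory and is lost when the storage goes away,
+// which makes it mainly useful for tests and throwaway databases.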
+func NewMemStorage() Storage { + return &memStorage{ + files: make(map[uint64]*memFile), + } +} + +func (ms *memStorage) Lock() (Locker, error) { + ms.mu.Lock() + defer ms.mu.Unlock() + if ms.slock != nil { + return nil, ErrLocked + } + ms.slock = &memStorageLock{ms: ms} + return ms.slock, nil +} + +func (*memStorage) Log(str string) {} + +func (ms *memStorage) SetMeta(fd FileDesc) error { + if !FileDescOk(fd) { + return ErrInvalidFile + } + + ms.mu.Lock() + ms.meta = fd + ms.mu.Unlock() + return nil +} + +func (ms *memStorage) GetMeta() (FileDesc, error) { + ms.mu.Lock() + defer ms.mu.Unlock() + if ms.meta.Zero() { + return FileDesc{}, os.ErrNotExist + } + return ms.meta, nil +} + +func (ms *memStorage) List(ft FileType) ([]FileDesc, error) { + ms.mu.Lock() + var fds []FileDesc + for x := range ms.files { + fd := unpackFile(x) + if fd.Type&ft != 0 { + fds = append(fds, fd) + } + } + ms.mu.Unlock() + return fds, nil +} + +func (ms *memStorage) Open(fd FileDesc) (Reader, error) { + if !FileDescOk(fd) { + return nil, ErrInvalidFile + } + + ms.mu.Lock() + defer ms.mu.Unlock() + if m, exist := ms.files[packFile(fd)]; exist { + if m.open { + return nil, errFileOpen + } + m.open = true + return &memReader{Reader: bytes.NewReader(m.Bytes()), ms: ms, m: m}, nil + } + return nil, os.ErrNotExist +} + +func (ms *memStorage) Create(fd FileDesc) (Writer, error) { + if !FileDescOk(fd) { + return nil, ErrInvalidFile + } + + x := packFile(fd) + ms.mu.Lock() + defer ms.mu.Unlock() + m, exist := ms.files[x] + if exist { + if m.open { + return nil, errFileOpen + } + m.Reset() + } else { + m = &memFile{} + ms.files[x] = m + } + m.open = true + return &memWriter{memFile: m, ms: ms}, nil +} + +func (ms *memStorage) Remove(fd FileDesc) error { + if !FileDescOk(fd) { + return ErrInvalidFile + } + + x := packFile(fd) + ms.mu.Lock() + defer ms.mu.Unlock() + if _, exist := ms.files[x]; exist { + delete(ms.files, x) + return nil + } + return os.ErrNotExist +} + +func (ms *memStorage) Rename(oldfd, newfd FileDesc) error { + if !FileDescOk(oldfd) || !FileDescOk(newfd) { + return ErrInvalidFile + } + if oldfd == newfd { + return nil + } + + oldx := packFile(oldfd) + newx := packFile(newfd) + ms.mu.Lock() + defer ms.mu.Unlock() + oldm, exist := ms.files[oldx] + if !exist { + return os.ErrNotExist + } + newm, exist := ms.files[newx] + if (exist && newm.open) || oldm.open { + return errFileOpen + } + delete(ms.files, oldx) + ms.files[newx] = oldm + return nil +} + +func (*memStorage) Close() error { return nil } + +type memFile struct { + bytes.Buffer + open bool +} + +type memReader struct { + *bytes.Reader + ms *memStorage + m *memFile + closed bool +} + +func (mr *memReader) Close() error { + mr.ms.mu.Lock() + defer mr.ms.mu.Unlock() + if mr.closed { + return ErrClosed + } + mr.m.open = false + return nil +} + +type memWriter struct { + *memFile + ms *memStorage + closed bool +} + +func (*memWriter) Sync() error { return nil } + +func (mw *memWriter) Close() error { + mw.ms.mu.Lock() + defer mw.ms.mu.Unlock() + if mw.closed { + return ErrClosed + } + mw.memFile.open = false + return nil +} + +func packFile(fd FileDesc) uint64 { + return uint64(fd.Num)<> typeShift)} +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/storage.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/storage.go new file mode 100644 index 000000000000..4e4a724258d6 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/storage.go @@ -0,0 +1,187 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights 
reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package storage provides storage abstraction for LevelDB. +package storage + +import ( + "errors" + "fmt" + "io" +) + +// FileType represent a file type. +type FileType int + +// File types. +const ( + TypeManifest FileType = 1 << iota + TypeJournal + TypeTable + TypeTemp + + TypeAll = TypeManifest | TypeJournal | TypeTable | TypeTemp +) + +func (t FileType) String() string { + switch t { + case TypeManifest: + return "manifest" + case TypeJournal: + return "journal" + case TypeTable: + return "table" + case TypeTemp: + return "temp" + } + return fmt.Sprintf("", t) +} + +// Common error. +var ( + ErrInvalidFile = errors.New("leveldb/storage: invalid file for argument") + ErrLocked = errors.New("leveldb/storage: already locked") + ErrClosed = errors.New("leveldb/storage: closed") +) + +// ErrCorrupted is the type that wraps errors that indicate corruption of +// a file. Package storage has its own type instead of using +// errors.ErrCorrupted to prevent circular import. +type ErrCorrupted struct { + Fd FileDesc + Err error +} + +func isCorrupted(err error) bool { + switch err.(type) { + case *ErrCorrupted: + return true + } + return false +} + +func (e *ErrCorrupted) Error() string { + if !e.Fd.Zero() { + return fmt.Sprintf("%v [file=%v]", e.Err, e.Fd) + } + return e.Err.Error() +} + +// Syncer is the interface that wraps basic Sync method. +type Syncer interface { + // Sync commits the current contents of the file to stable storage. + Sync() error +} + +// Reader is the interface that groups the basic Read, Seek, ReadAt and Close +// methods. +type Reader interface { + io.ReadSeeker + io.ReaderAt + io.Closer +} + +// Writer is the interface that groups the basic Write, Sync and Close +// methods. +type Writer interface { + io.WriteCloser + Syncer +} + +// Locker is the interface that wraps Unlock method. +type Locker interface { + Unlock() +} + +// FileDesc is a 'file descriptor'. +type FileDesc struct { + Type FileType + Num int64 +} + +func (fd FileDesc) String() string { + switch fd.Type { + case TypeManifest: + return fmt.Sprintf("MANIFEST-%06d", fd.Num) + case TypeJournal: + return fmt.Sprintf("%06d.log", fd.Num) + case TypeTable: + return fmt.Sprintf("%06d.ldb", fd.Num) + case TypeTemp: + return fmt.Sprintf("%06d.tmp", fd.Num) + default: + return fmt.Sprintf("%#x-%d", fd.Type, fd.Num) + } +} + +// Zero returns true if fd == (FileDesc{}). +func (fd FileDesc) Zero() bool { + return fd == (FileDesc{}) +} + +// FileDescOk returns true if fd is a valid 'file descriptor'. +func FileDescOk(fd FileDesc) bool { + switch fd.Type { + case TypeManifest: + case TypeJournal: + case TypeTable: + case TypeTemp: + default: + return false + } + return fd.Num >= 0 +} + +// Storage is the storage. A storage instance must be safe for concurrent use. +type Storage interface { + // Lock locks the storage. Any subsequent attempt to call Lock will fail + // until the last lock released. + // Caller should call Unlock method after use. + Lock() (Locker, error) + + // Log logs a string. This is used for logging. + // An implementation may write to a file, stdout or simply do nothing. + Log(str string) + + // SetMeta store 'file descriptor' that can later be acquired using GetMeta + // method. The 'file descriptor' should point to a valid file. + // SetMeta should be implemented in such way that changes should happen + // atomically. 
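+ // That is, a failed or interrupted SetMeta must leave the previously
+ // stored 'file descriptor' readable via GetMeta.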
+ SetMeta(fd FileDesc) error + + // GetMeta returns 'file descriptor' stored in meta. The 'file descriptor' + // can be updated using SetMeta method. + // Returns os.ErrNotExist if meta doesn't store any 'file descriptor', or + // 'file descriptor' point to nonexistent file. + GetMeta() (FileDesc, error) + + // List returns file descriptors that match the given file types. + // The file types may be OR'ed together. + List(ft FileType) ([]FileDesc, error) + + // Open opens file with the given 'file descriptor' read-only. + // Returns os.ErrNotExist error if the file does not exist. + // Returns ErrClosed if the underlying storage is closed. + Open(fd FileDesc) (Reader, error) + + // Create creates file with the given 'file descriptor', truncate if already + // exist and opens write-only. + // Returns ErrClosed if the underlying storage is closed. + Create(fd FileDesc) (Writer, error) + + // Remove removes file with the given 'file descriptor'. + // Returns ErrClosed if the underlying storage is closed. + Remove(fd FileDesc) error + + // Rename renames file from oldfd to newfd. + // Returns ErrClosed if the underlying storage is closed. + Rename(oldfd, newfd FileDesc) error + + // Close closes the storage. + // It is valid to call Close multiple times. Other methods should not be + // called after the storage has been closed. + Close() error +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/table.go b/vendor/github.com/syndtr/goleveldb/leveldb/table.go new file mode 100644 index 000000000000..1fac60d050b3 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/table.go @@ -0,0 +1,531 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "sort" + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/cache" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/table" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// tFile holds basic information about a table. +type tFile struct { + fd storage.FileDesc + seekLeft int32 + size int64 + imin, imax internalKey +} + +// Returns true if given key is after largest key of this table. +func (t *tFile) after(icmp *iComparer, ukey []byte) bool { + return ukey != nil && icmp.uCompare(ukey, t.imax.ukey()) > 0 +} + +// Returns true if given key is before smallest key of this table. +func (t *tFile) before(icmp *iComparer, ukey []byte) bool { + return ukey != nil && icmp.uCompare(ukey, t.imin.ukey()) < 0 +} + +// Returns true if given key range overlaps with this table key range. +func (t *tFile) overlaps(icmp *iComparer, umin, umax []byte) bool { + return !t.after(icmp, umin) && !t.before(icmp, umax) +} + +// Cosumes one seek and return current seeks left. +func (t *tFile) consumeSeek() int32 { + return atomic.AddInt32(&t.seekLeft, -1) +} + +// Creates new tFile. +func newTableFile(fd storage.FileDesc, size int64, imin, imax internalKey) *tFile { + f := &tFile{ + fd: fd, + size: size, + imin: imin, + imax: imax, + } + + // We arrange to automatically compact this file after + // a certain number of seeks. 
Let's assume: + // (1) One seek costs 10ms + // (2) Writing or reading 1MB costs 10ms (100MB/s) + // (3) A compaction of 1MB does 25MB of IO: + // 1MB read from this level + // 10-12MB read from next level (boundaries may be misaligned) + // 10-12MB written to next level + // This implies that 25 seeks cost the same as the compaction + // of 1MB of data. I.e., one seek costs approximately the + // same as the compaction of 40KB of data. We are a little + // conservative and allow approximately one seek for every 16KB + // of data before triggering a compaction. + f.seekLeft = int32(size / 16384) + if f.seekLeft < 100 { + f.seekLeft = 100 + } + + return f +} + +func tableFileFromRecord(r atRecord) *tFile { + return newTableFile(storage.FileDesc{Type: storage.TypeTable, Num: r.num}, r.size, r.imin, r.imax) +} + +// tFiles hold multiple tFile. +type tFiles []*tFile + +func (tf tFiles) Len() int { return len(tf) } +func (tf tFiles) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] } + +func (tf tFiles) nums() string { + x := "[ " + for i, f := range tf { + if i != 0 { + x += ", " + } + x += fmt.Sprint(f.fd.Num) + } + x += " ]" + return x +} + +// Returns true if i smallest key is less than j. +// This used for sort by key in ascending order. +func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool { + a, b := tf[i], tf[j] + n := icmp.Compare(a.imin, b.imin) + if n == 0 { + return a.fd.Num < b.fd.Num + } + return n < 0 +} + +// Returns true if i file number is greater than j. +// This used for sort by file number in descending order. +func (tf tFiles) lessByNum(i, j int) bool { + return tf[i].fd.Num > tf[j].fd.Num +} + +// Sorts tables by key in ascending order. +func (tf tFiles) sortByKey(icmp *iComparer) { + sort.Sort(&tFilesSortByKey{tFiles: tf, icmp: icmp}) +} + +// Sorts tables by file number in descending order. +func (tf tFiles) sortByNum() { + sort.Sort(&tFilesSortByNum{tFiles: tf}) +} + +// Returns sum of all tables size. +func (tf tFiles) size() (sum int64) { + for _, t := range tf { + sum += t.size + } + return sum +} + +// Searches smallest index of tables whose its smallest +// key is after or equal with given key. +func (tf tFiles) searchMin(icmp *iComparer, ikey internalKey) int { + return sort.Search(len(tf), func(i int) bool { + return icmp.Compare(tf[i].imin, ikey) >= 0 + }) +} + +// Searches smallest index of tables whose its largest +// key is after or equal with given key. +func (tf tFiles) searchMax(icmp *iComparer, ikey internalKey) int { + return sort.Search(len(tf), func(i int) bool { + return icmp.Compare(tf[i].imax, ikey) >= 0 + }) +} + +// Returns true if given key range overlaps with one or more +// tables key range. If unsorted is true then binary search will not be used. +func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) bool { + if unsorted { + // Check against all files. + for _, t := range tf { + if t.overlaps(icmp, umin, umax) { + return true + } + } + return false + } + + i := 0 + if len(umin) > 0 { + // Find the earliest possible internal key for min. + i = tf.searchMax(icmp, makeInternalKey(nil, umin, keyMaxSeq, keyTypeSeek)) + } + if i >= len(tf) { + // Beginning of range is after all files, so no overlap. + return false + } + return !tf[i].before(icmp, umax) +} + +// Returns tables whose its key range overlaps with given key range. +// Range will be expanded if ukey found hop across tables. +// If overlapped is true then the search will be restarted if umax +// expanded. +// The dst content will be overwritten. 
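+// For overlapped (level-0) tables the scan restarts from the beginning each
+// time the range grows, since level-0 tables may overlap one another.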
+func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles { + dst = dst[:0] + for i := 0; i < len(tf); { + t := tf[i] + if t.overlaps(icmp, umin, umax) { + if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 { + umin = t.imin.ukey() + dst = dst[:0] + i = 0 + continue + } else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 { + umax = t.imax.ukey() + // Restart search if it is overlapped. + if overlapped { + dst = dst[:0] + i = 0 + continue + } + } + + dst = append(dst, t) + } + i++ + } + + return dst +} + +// Returns tables key range. +func (tf tFiles) getRange(icmp *iComparer) (imin, imax internalKey) { + for i, t := range tf { + if i == 0 { + imin, imax = t.imin, t.imax + continue + } + if icmp.Compare(t.imin, imin) < 0 { + imin = t.imin + } + if icmp.Compare(t.imax, imax) > 0 { + imax = t.imax + } + } + + return +} + +// Creates iterator index from tables. +func (tf tFiles) newIndexIterator(tops *tOps, icmp *iComparer, slice *util.Range, ro *opt.ReadOptions) iterator.IteratorIndexer { + if slice != nil { + var start, limit int + if slice.Start != nil { + start = tf.searchMax(icmp, internalKey(slice.Start)) + } + if slice.Limit != nil { + limit = tf.searchMin(icmp, internalKey(slice.Limit)) + } else { + limit = tf.Len() + } + tf = tf[start:limit] + } + return iterator.NewArrayIndexer(&tFilesArrayIndexer{ + tFiles: tf, + tops: tops, + icmp: icmp, + slice: slice, + ro: ro, + }) +} + +// Tables iterator index. +type tFilesArrayIndexer struct { + tFiles + tops *tOps + icmp *iComparer + slice *util.Range + ro *opt.ReadOptions +} + +func (a *tFilesArrayIndexer) Search(key []byte) int { + return a.searchMax(a.icmp, internalKey(key)) +} + +func (a *tFilesArrayIndexer) Get(i int) iterator.Iterator { + if i == 0 || i == a.Len()-1 { + return a.tops.newIterator(a.tFiles[i], a.slice, a.ro) + } + return a.tops.newIterator(a.tFiles[i], nil, a.ro) +} + +// Helper type for sortByKey. +type tFilesSortByKey struct { + tFiles + icmp *iComparer +} + +func (x *tFilesSortByKey) Less(i, j int) bool { + return x.lessByKey(x.icmp, i, j) +} + +// Helper type for sortByNum. +type tFilesSortByNum struct { + tFiles +} + +func (x *tFilesSortByNum) Less(i, j int) bool { + return x.lessByNum(i, j) +} + +// Table operations. +type tOps struct { + s *session + noSync bool + evictRemoved bool + cache *cache.Cache + bcache *cache.Cache + bpool *util.BufferPool +} + +// Creates an empty table and returns table writer. +func (t *tOps) create() (*tWriter, error) { + fd := storage.FileDesc{Type: storage.TypeTable, Num: t.s.allocFileNum()} + fw, err := t.s.stor.Create(fd) + if err != nil { + return nil, err + } + return &tWriter{ + t: t, + fd: fd, + w: fw, + tw: table.NewWriter(fw, t.s.o.Options), + }, nil +} + +// Builds table from src iterator. +func (t *tOps) createFrom(src iterator.Iterator) (f *tFile, n int, err error) { + w, err := t.create() + if err != nil { + return + } + + defer func() { + if err != nil { + w.drop() + } + }() + + for src.Next() { + err = w.append(src.Key(), src.Value()) + if err != nil { + return + } + } + err = src.Error() + if err != nil { + return + } + + n = w.tw.EntriesLen() + f, err = w.finish() + return +} + +// Opens table. It returns a cache handle, which should +// be released after use. 
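+// The opened *table.Reader is cached by file number, so repeated opens of the
+// same table reuse a single reader.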
+func (t *tOps) open(f *tFile) (ch *cache.Handle, err error) { + ch = t.cache.Get(0, uint64(f.fd.Num), func() (size int, value cache.Value) { + var r storage.Reader + r, err = t.s.stor.Open(f.fd) + if err != nil { + return 0, nil + } + + var bcache *cache.NamespaceGetter + if t.bcache != nil { + bcache = &cache.NamespaceGetter{Cache: t.bcache, NS: uint64(f.fd.Num)} + } + + var tr *table.Reader + tr, err = table.NewReader(r, f.size, f.fd, bcache, t.bpool, t.s.o.Options) + if err != nil { + r.Close() + return 0, nil + } + return 1, tr + + }) + if ch == nil && err == nil { + err = ErrClosed + } + return +} + +// Finds key/value pair whose key is greater than or equal to the +// given key. +func (t *tOps) find(f *tFile, key []byte, ro *opt.ReadOptions) (rkey, rvalue []byte, err error) { + ch, err := t.open(f) + if err != nil { + return nil, nil, err + } + defer ch.Release() + return ch.Value().(*table.Reader).Find(key, true, ro) +} + +// Finds key that is greater than or equal to the given key. +func (t *tOps) findKey(f *tFile, key []byte, ro *opt.ReadOptions) (rkey []byte, err error) { + ch, err := t.open(f) + if err != nil { + return nil, err + } + defer ch.Release() + return ch.Value().(*table.Reader).FindKey(key, true, ro) +} + +// Returns approximate offset of the given key. +func (t *tOps) offsetOf(f *tFile, key []byte) (offset int64, err error) { + ch, err := t.open(f) + if err != nil { + return + } + defer ch.Release() + return ch.Value().(*table.Reader).OffsetOf(key) +} + +// Creates an iterator from the given table. +func (t *tOps) newIterator(f *tFile, slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + ch, err := t.open(f) + if err != nil { + return iterator.NewEmptyIterator(err) + } + iter := ch.Value().(*table.Reader).NewIterator(slice, ro) + iter.SetReleaser(ch) + return iter +} + +// Removes table from persistent storage. It waits until +// no one use the the table. +func (t *tOps) remove(f *tFile) { + t.cache.Delete(0, uint64(f.fd.Num), func() { + if err := t.s.stor.Remove(f.fd); err != nil { + t.s.logf("table@remove removing @%d %q", f.fd.Num, err) + } else { + t.s.logf("table@remove removed @%d", f.fd.Num) + } + if t.evictRemoved && t.bcache != nil { + t.bcache.EvictNS(uint64(f.fd.Num)) + } + }) +} + +// Closes the table ops instance. It will close all tables, +// regadless still used or not. +func (t *tOps) close() { + t.bpool.Close() + t.cache.Close() + if t.bcache != nil { + t.bcache.CloseWeak() + } +} + +// Creates new initialized table ops instance. +func newTableOps(s *session) *tOps { + var ( + cacher cache.Cacher + bcache *cache.Cache + bpool *util.BufferPool + ) + if s.o.GetOpenFilesCacheCapacity() > 0 { + cacher = cache.NewLRU(s.o.GetOpenFilesCacheCapacity()) + } + if !s.o.GetDisableBlockCache() { + var bcacher cache.Cacher + if s.o.GetBlockCacheCapacity() > 0 { + bcacher = s.o.GetBlockCacher().New(s.o.GetBlockCacheCapacity()) + } + bcache = cache.NewCache(bcacher) + } + if !s.o.GetDisableBufferPool() { + bpool = util.NewBufferPool(s.o.GetBlockSize() + 5) + } + return &tOps{ + s: s, + noSync: s.o.GetNoSync(), + evictRemoved: s.o.GetBlockCacheEvictRemoved(), + cache: cache.NewCache(cacher), + bcache: bcache, + bpool: bpool, + } +} + +// tWriter wraps the table writer. It keep track of file descriptor +// and added key range. +type tWriter struct { + t *tOps + + fd storage.FileDesc + w storage.Writer + tw *table.Writer + + first, last []byte +} + +// Append key/value pair to the table. 
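+// Keys are expected in ascending order; the first and last appended keys are
+// recorded so the finished table knows its key range.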
+func (w *tWriter) append(key, value []byte) error { + if w.first == nil { + w.first = append([]byte{}, key...) + } + w.last = append(w.last[:0], key...) + return w.tw.Append(key, value) +} + +// Returns true if the table is empty. +func (w *tWriter) empty() bool { + return w.first == nil +} + +// Closes the storage.Writer. +func (w *tWriter) close() { + if w.w != nil { + w.w.Close() + w.w = nil + } +} + +// Finalizes the table and returns table file. +func (w *tWriter) finish() (f *tFile, err error) { + defer w.close() + err = w.tw.Close() + if err != nil { + return + } + if !w.t.noSync { + err = w.w.Sync() + if err != nil { + return + } + } + f = newTableFile(w.fd, int64(w.tw.BytesLen()), internalKey(w.first), internalKey(w.last)) + return +} + +// Drops the table. +func (w *tWriter) drop() { + w.close() + w.t.s.stor.Remove(w.fd) + w.t.s.reuseFileNum(w.fd.Num) + w.tw = nil + w.first = nil + w.last = nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/table/reader.go b/vendor/github.com/syndtr/goleveldb/leveldb/table/reader.go new file mode 100644 index 000000000000..16cfbaa0068a --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/table/reader.go @@ -0,0 +1,1135 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package table + +import ( + "encoding/binary" + "fmt" + "io" + "sort" + "strings" + "sync" + + "github.com/golang/snappy" + + "github.com/syndtr/goleveldb/leveldb/cache" + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Reader errors. +var ( + ErrNotFound = errors.ErrNotFound + ErrReaderReleased = errors.New("leveldb/table: reader released") + ErrIterReleased = errors.New("leveldb/table: iterator released") +) + +// ErrCorrupted describes error due to corruption. This error will be wrapped +// with errors.ErrCorrupted. +type ErrCorrupted struct { + Pos int64 + Size int64 + Kind string + Reason string +} + +func (e *ErrCorrupted) Error() string { + return fmt.Sprintf("leveldb/table: corruption on %s (pos=%d): %s", e.Kind, e.Pos, e.Reason) +} + +func max(x, y int) int { + if x > y { + return x + } + return y +} + +type block struct { + bpool *util.BufferPool + bh blockHandle + data []byte + restartsLen int + restartsOffset int +} + +func (b *block) seek(cmp comparer.Comparer, rstart, rlimit int, key []byte) (index, offset int, err error) { + index = sort.Search(b.restartsLen-rstart-(b.restartsLen-rlimit), func(i int) bool { + offset := int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*(rstart+i):])) + offset++ // shared always zero, since this is a restart point + v1, n1 := binary.Uvarint(b.data[offset:]) // key length + _, n2 := binary.Uvarint(b.data[offset+n1:]) // value length + m := offset + n1 + n2 + return cmp.Compare(b.data[m:m+int(v1)], key) > 0 + }) + rstart - 1 + if index < rstart { + // The smallest key is greater-than key sought. 
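+ // Fall back to the first restart point in range.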
+ index = rstart + } + offset = int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*index:])) + return +} + +func (b *block) restartIndex(rstart, rlimit, offset int) int { + return sort.Search(b.restartsLen-rstart-(b.restartsLen-rlimit), func(i int) bool { + return int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*(rstart+i):])) > offset + }) + rstart - 1 +} + +func (b *block) restartOffset(index int) int { + return int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*index:])) +} + +func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) { + if offset >= b.restartsOffset { + if offset != b.restartsOffset { + err = &ErrCorrupted{Reason: "entries offset not aligned"} + } + return + } + v0, n0 := binary.Uvarint(b.data[offset:]) // Shared prefix length + v1, n1 := binary.Uvarint(b.data[offset+n0:]) // Key length + v2, n2 := binary.Uvarint(b.data[offset+n0+n1:]) // Value length + m := n0 + n1 + n2 + n = m + int(v1) + int(v2) + if n0 <= 0 || n1 <= 0 || n2 <= 0 || offset+n > b.restartsOffset { + err = &ErrCorrupted{Reason: "entries corrupted"} + return + } + key = b.data[offset+m : offset+m+int(v1)] + value = b.data[offset+m+int(v1) : offset+n] + nShared = int(v0) + return +} + +func (b *block) Release() { + b.bpool.Put(b.data) + b.bpool = nil + b.data = nil +} + +type dir int + +const ( + dirReleased dir = iota - 1 + dirSOI + dirEOI + dirBackward + dirForward +) + +type blockIter struct { + tr *Reader + block *block + blockReleaser util.Releaser + releaser util.Releaser + key, value []byte + offset int + // Previous offset, only filled by Next. + prevOffset int + prevNode []int + prevKeys []byte + restartIndex int + // Iterator direction. + dir dir + // Restart index slice range. + riStart int + riLimit int + // Offset slice range. + offsetStart int + offsetRealStart int + offsetLimit int + // Error. 
+ err error +} + +func (i *blockIter) sErr(err error) { + i.err = err + i.key = nil + i.value = nil + i.prevNode = nil + i.prevKeys = nil +} + +func (i *blockIter) reset() { + if i.dir == dirBackward { + i.prevNode = i.prevNode[:0] + i.prevKeys = i.prevKeys[:0] + } + i.restartIndex = i.riStart + i.offset = i.offsetStart + i.dir = dirSOI + i.key = i.key[:0] + i.value = nil +} + +func (i *blockIter) isFirst() bool { + switch i.dir { + case dirForward: + return i.prevOffset == i.offsetRealStart + case dirBackward: + return len(i.prevNode) == 1 && i.restartIndex == i.riStart + } + return false +} + +func (i *blockIter) isLast() bool { + switch i.dir { + case dirForward, dirBackward: + return i.offset == i.offsetLimit + } + return false +} + +func (i *blockIter) First() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.dir == dirBackward { + i.prevNode = i.prevNode[:0] + i.prevKeys = i.prevKeys[:0] + } + i.dir = dirSOI + return i.Next() +} + +func (i *blockIter) Last() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.dir == dirBackward { + i.prevNode = i.prevNode[:0] + i.prevKeys = i.prevKeys[:0] + } + i.dir = dirEOI + return i.Prev() +} + +func (i *blockIter) Seek(key []byte) bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + ri, offset, err := i.block.seek(i.tr.cmp, i.riStart, i.riLimit, key) + if err != nil { + i.sErr(err) + return false + } + i.restartIndex = ri + i.offset = max(i.offsetStart, offset) + if i.dir == dirSOI || i.dir == dirEOI { + i.dir = dirForward + } + for i.Next() { + if i.tr.cmp.Compare(i.key, key) >= 0 { + return true + } + } + return false +} + +func (i *blockIter) Next() bool { + if i.dir == dirEOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.dir == dirSOI { + i.restartIndex = i.riStart + i.offset = i.offsetStart + } else if i.dir == dirBackward { + i.prevNode = i.prevNode[:0] + i.prevKeys = i.prevKeys[:0] + } + for i.offset < i.offsetRealStart { + key, value, nShared, n, err := i.block.entry(i.offset) + if err != nil { + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) + return false + } + if n == 0 { + i.dir = dirEOI + return false + } + i.key = append(i.key[:nShared], key...) + i.value = value + i.offset += n + } + if i.offset >= i.offsetLimit { + i.dir = dirEOI + if i.offset != i.offsetLimit { + i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned")) + } + return false + } + key, value, nShared, n, err := i.block.entry(i.offset) + if err != nil { + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) + return false + } + if n == 0 { + i.dir = dirEOI + return false + } + i.key = append(i.key[:nShared], key...) + i.value = value + i.prevOffset = i.offset + i.offset += n + i.dir = dirForward + return true +} + +func (i *blockIter) Prev() bool { + if i.dir == dirSOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + var ri int + if i.dir == dirForward { + // Change direction. + i.offset = i.prevOffset + if i.offset == i.offsetRealStart { + i.dir = dirSOI + return false + } + ri = i.block.restartIndex(i.restartIndex, i.riLimit, i.offset) + i.dir = dirBackward + } else if i.dir == dirEOI { + // At the end of iterator. 
+ i.restartIndex = i.riLimit + i.offset = i.offsetLimit + if i.offset == i.offsetRealStart { + i.dir = dirSOI + return false + } + ri = i.riLimit - 1 + i.dir = dirBackward + } else if len(i.prevNode) == 1 { + // This is the end of a restart range. + i.offset = i.prevNode[0] + i.prevNode = i.prevNode[:0] + if i.restartIndex == i.riStart { + i.dir = dirSOI + return false + } + i.restartIndex-- + ri = i.restartIndex + } else { + // In the middle of restart range, get from cache. + n := len(i.prevNode) - 3 + node := i.prevNode[n:] + i.prevNode = i.prevNode[:n] + // Get the key. + ko := node[0] + i.key = append(i.key[:0], i.prevKeys[ko:]...) + i.prevKeys = i.prevKeys[:ko] + // Get the value. + vo := node[1] + vl := vo + node[2] + i.value = i.block.data[vo:vl] + i.offset = vl + return true + } + // Build entries cache. + i.key = i.key[:0] + i.value = nil + offset := i.block.restartOffset(ri) + if offset == i.offset { + ri-- + if ri < 0 { + i.dir = dirSOI + return false + } + offset = i.block.restartOffset(ri) + } + i.prevNode = append(i.prevNode, offset) + for { + key, value, nShared, n, err := i.block.entry(offset) + if err != nil { + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) + return false + } + if offset >= i.offsetRealStart { + if i.value != nil { + // Appends 3 variables: + // 1. Previous keys offset + // 2. Value offset in the data block + // 3. Value length + i.prevNode = append(i.prevNode, len(i.prevKeys), offset-len(i.value), len(i.value)) + i.prevKeys = append(i.prevKeys, i.key...) + } + i.value = value + } + i.key = append(i.key[:nShared], key...) + offset += n + // Stop if target offset reached. + if offset >= i.offset { + if offset != i.offset { + i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned")) + return false + } + + break + } + } + i.restartIndex = ri + i.offset = offset + return true +} + +func (i *blockIter) Key() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.key +} + +func (i *blockIter) Value() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.value +} + +func (i *blockIter) Release() { + if i.dir != dirReleased { + i.tr = nil + i.block = nil + i.prevNode = nil + i.prevKeys = nil + i.key = nil + i.value = nil + i.dir = dirReleased + if i.blockReleaser != nil { + i.blockReleaser.Release() + i.blockReleaser = nil + } + if i.releaser != nil { + i.releaser.Release() + i.releaser = nil + } + } +} + +func (i *blockIter) SetReleaser(releaser util.Releaser) { + if i.dir == dirReleased { + panic(util.ErrReleased) + } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) + } + i.releaser = releaser +} + +func (i *blockIter) Valid() bool { + return i.err == nil && (i.dir == dirBackward || i.dir == dirForward) +} + +func (i *blockIter) Error() error { + return i.err +} + +type filterBlock struct { + bpool *util.BufferPool + data []byte + oOffset int + baseLg uint + filtersNum int +} + +func (b *filterBlock) contains(filter filter.Filter, offset uint64, key []byte) bool { + i := int(offset >> b.baseLg) + if i < b.filtersNum { + o := b.data[b.oOffset+i*4:] + n := int(binary.LittleEndian.Uint32(o)) + m := int(binary.LittleEndian.Uint32(o[4:])) + if n < m && m <= b.oOffset { + return filter.Contains(b.data[n:m], key) + } else if n == m { + return false + } + } + return true +} + +func (b *filterBlock) Release() { + b.bpool.Put(b.data) + b.bpool = nil + b.data = nil +} + +type indexIter struct { + *blockIter + tr *Reader + slice *util.Range + // Options + fillCache bool +} + +func (i 
*indexIter) Get() iterator.Iterator { + value := i.Value() + if value == nil { + return nil + } + dataBH, n := decodeBlockHandle(value) + if n == 0 { + return iterator.NewEmptyIterator(i.tr.newErrCorruptedBH(i.tr.indexBH, "bad data block handle")) + } + + var slice *util.Range + if i.slice != nil && (i.blockIter.isFirst() || i.blockIter.isLast()) { + slice = i.slice + } + return i.tr.getDataIterErr(dataBH, slice, i.tr.verifyChecksum, i.fillCache) +} + +// Reader is a table reader. +type Reader struct { + mu sync.RWMutex + fd storage.FileDesc + reader io.ReaderAt + cache *cache.NamespaceGetter + err error + bpool *util.BufferPool + // Options + o *opt.Options + cmp comparer.Comparer + filter filter.Filter + verifyChecksum bool + + dataEnd int64 + metaBH, indexBH, filterBH blockHandle + indexBlock *block + filterBlock *filterBlock +} + +func (r *Reader) blockKind(bh blockHandle) string { + switch bh.offset { + case r.metaBH.offset: + return "meta-block" + case r.indexBH.offset: + return "index-block" + case r.filterBH.offset: + if r.filterBH.length > 0 { + return "filter-block" + } + } + return "data-block" +} + +func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error { + return &errors.ErrCorrupted{Fd: r.fd, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}} +} + +func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error { + return r.newErrCorrupted(int64(bh.offset), int64(bh.length), r.blockKind(bh), reason) +} + +func (r *Reader) fixErrCorruptedBH(bh blockHandle, err error) error { + if cerr, ok := err.(*ErrCorrupted); ok { + cerr.Pos = int64(bh.offset) + cerr.Size = int64(bh.length) + cerr.Kind = r.blockKind(bh) + return &errors.ErrCorrupted{Fd: r.fd, Err: cerr} + } + return err +} + +func (r *Reader) readRawBlock(bh blockHandle, verifyChecksum bool) ([]byte, error) { + data := r.bpool.Get(int(bh.length + blockTrailerLen)) + if _, err := r.reader.ReadAt(data, int64(bh.offset)); err != nil && err != io.EOF { + return nil, err + } + + if verifyChecksum { + n := bh.length + 1 + checksum0 := binary.LittleEndian.Uint32(data[n:]) + checksum1 := util.NewCRC(data[:n]).Value() + if checksum0 != checksum1 { + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("checksum mismatch, want=%#x got=%#x", checksum0, checksum1)) + } + } + + switch data[bh.length] { + case blockTypeNoCompression: + data = data[:bh.length] + case blockTypeSnappyCompression: + decLen, err := snappy.DecodedLen(data[:bh.length]) + if err != nil { + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, err.Error()) + } + decData := r.bpool.Get(decLen) + decData, err = snappy.Decode(decData, data[:bh.length]) + r.bpool.Put(data) + if err != nil { + r.bpool.Put(decData) + return nil, r.newErrCorruptedBH(bh, err.Error()) + } + data = decData + default: + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("unknown compression type %#x", data[bh.length])) + } + return data, nil +} + +func (r *Reader) readBlock(bh blockHandle, verifyChecksum bool) (*block, error) { + data, err := r.readRawBlock(bh, verifyChecksum) + if err != nil { + return nil, err + } + restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:])) + b := &block{ + bpool: r.bpool, + bh: bh, + data: data, + restartsLen: restartsLen, + restartsOffset: len(data) - (restartsLen+1)*4, + } + return b, nil +} + +func (r *Reader) readBlockCached(bh blockHandle, verifyChecksum, fillCache bool) (*block, util.Releaser, error) { + if r.cache != nil { + var ( + err error + ch 
*cache.Handle + ) + if fillCache { + ch = r.cache.Get(bh.offset, func() (size int, value cache.Value) { + var b *block + b, err = r.readBlock(bh, verifyChecksum) + if err != nil { + return 0, nil + } + return cap(b.data), b + }) + } else { + ch = r.cache.Get(bh.offset, nil) + } + if ch != nil { + b, ok := ch.Value().(*block) + if !ok { + ch.Release() + return nil, nil, errors.New("leveldb/table: inconsistent block type") + } + return b, ch, err + } else if err != nil { + return nil, nil, err + } + } + + b, err := r.readBlock(bh, verifyChecksum) + return b, b, err +} + +func (r *Reader) readFilterBlock(bh blockHandle) (*filterBlock, error) { + data, err := r.readRawBlock(bh, true) + if err != nil { + return nil, err + } + n := len(data) + if n < 5 { + return nil, r.newErrCorruptedBH(bh, "too short") + } + m := n - 5 + oOffset := int(binary.LittleEndian.Uint32(data[m:])) + if oOffset > m { + return nil, r.newErrCorruptedBH(bh, "invalid data-offsets offset") + } + b := &filterBlock{ + bpool: r.bpool, + data: data, + oOffset: oOffset, + baseLg: uint(data[n-1]), + filtersNum: (m - oOffset) / 4, + } + return b, nil +} + +func (r *Reader) readFilterBlockCached(bh blockHandle, fillCache bool) (*filterBlock, util.Releaser, error) { + if r.cache != nil { + var ( + err error + ch *cache.Handle + ) + if fillCache { + ch = r.cache.Get(bh.offset, func() (size int, value cache.Value) { + var b *filterBlock + b, err = r.readFilterBlock(bh) + if err != nil { + return 0, nil + } + return cap(b.data), b + }) + } else { + ch = r.cache.Get(bh.offset, nil) + } + if ch != nil { + b, ok := ch.Value().(*filterBlock) + if !ok { + ch.Release() + return nil, nil, errors.New("leveldb/table: inconsistent block type") + } + return b, ch, err + } else if err != nil { + return nil, nil, err + } + } + + b, err := r.readFilterBlock(bh) + return b, b, err +} + +func (r *Reader) getIndexBlock(fillCache bool) (b *block, rel util.Releaser, err error) { + if r.indexBlock == nil { + return r.readBlockCached(r.indexBH, true, fillCache) + } + return r.indexBlock, util.NoopReleaser{}, nil +} + +func (r *Reader) getFilterBlock(fillCache bool) (*filterBlock, util.Releaser, error) { + if r.filterBlock == nil { + return r.readFilterBlockCached(r.filterBH, fillCache) + } + return r.filterBlock, util.NoopReleaser{}, nil +} + +func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Range, inclLimit bool) *blockIter { + bi := &blockIter{ + tr: r, + block: b, + blockReleaser: bReleaser, + // Valid key should never be nil. 
+ key: make([]byte, 0), + dir: dirSOI, + riStart: 0, + riLimit: b.restartsLen, + offsetStart: 0, + offsetRealStart: 0, + offsetLimit: b.restartsOffset, + } + if slice != nil { + if slice.Start != nil { + if bi.Seek(slice.Start) { + bi.riStart = b.restartIndex(bi.restartIndex, b.restartsLen, bi.prevOffset) + bi.offsetStart = b.restartOffset(bi.riStart) + bi.offsetRealStart = bi.prevOffset + } else { + bi.riStart = b.restartsLen + bi.offsetStart = b.restartsOffset + bi.offsetRealStart = b.restartsOffset + } + } + if slice.Limit != nil { + if bi.Seek(slice.Limit) && (!inclLimit || bi.Next()) { + bi.offsetLimit = bi.prevOffset + bi.riLimit = bi.restartIndex + 1 + } + } + bi.reset() + if bi.offsetStart > bi.offsetLimit { + bi.sErr(errors.New("leveldb/table: invalid slice range")) + } + } + return bi +} + +func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator { + b, rel, err := r.readBlockCached(dataBH, verifyChecksum, fillCache) + if err != nil { + return iterator.NewEmptyIterator(err) + } + return r.newBlockIter(b, rel, slice, false) +} + +func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + return iterator.NewEmptyIterator(r.err) + } + + return r.getDataIter(dataBH, slice, verifyChecksum, fillCache) +} + +// NewIterator creates an iterator from the table. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// table. And a nil Range.Limit is treated as a key after all keys in +// the table. +// +// The returned iterator is not safe for concurrent use and should be released +// after use. +// +// Also read Iterator documentation of the leveldb/iterator package. +func (r *Reader) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + return iterator.NewEmptyIterator(r.err) + } + + fillCache := !ro.GetDontFillCache() + indexBlock, rel, err := r.getIndexBlock(fillCache) + if err != nil { + return iterator.NewEmptyIterator(err) + } + index := &indexIter{ + blockIter: r.newBlockIter(indexBlock, rel, slice, true), + tr: r, + slice: slice, + fillCache: !ro.GetDontFillCache(), + } + return iterator.NewIndexedIterator(index, opt.GetStrict(r.o, ro, opt.StrictReader)) +} + +func (r *Reader) find(key []byte, filtered bool, ro *opt.ReadOptions, noValue bool) (rkey, value []byte, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + err = r.err + return + } + + indexBlock, rel, err := r.getIndexBlock(true) + if err != nil { + return + } + defer rel.Release() + + index := r.newBlockIter(indexBlock, nil, nil, true) + defer index.Release() + + if !index.Seek(key) { + if err = index.Error(); err == nil { + err = ErrNotFound + } + return + } + + dataBH, n := decodeBlockHandle(index.Value()) + if n == 0 { + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") + return nil, nil, r.err + } + + // The filter should only used for exact match. 
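// The filter is looked up by the data block's file offset. A negative
// answer means the key is definitely absent from that block, so the lookup
// can stop with ErrNotFound without reading the block at all; a false
// positive simply falls through to the normal block read below. A corrupted
// filter block is ignored rather than surfaced, since skipping it only
// costs the shortcut, not correctness.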
+ if filtered && r.filter != nil { + filterBlock, frel, ferr := r.getFilterBlock(true) + if ferr == nil { + if !filterBlock.contains(r.filter, dataBH.offset, key) { + frel.Release() + return nil, nil, ErrNotFound + } + frel.Release() + } else if !errors.IsCorrupted(ferr) { + return nil, nil, ferr + } + } + + data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache()) + if !data.Seek(key) { + data.Release() + if err = data.Error(); err != nil { + return + } + + // The nearest greater-than key is the first key of the next block. + if !index.Next() { + if err = index.Error(); err == nil { + err = ErrNotFound + } + return + } + + dataBH, n = decodeBlockHandle(index.Value()) + if n == 0 { + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") + return nil, nil, r.err + } + + data = r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache()) + if !data.Next() { + data.Release() + if err = data.Error(); err == nil { + err = ErrNotFound + } + return + } + } + + // Key doesn't use block buffer, no need to copy the buffer. + rkey = data.Key() + if !noValue { + if r.bpool == nil { + value = data.Value() + } else { + // Value does use block buffer, and since the buffer will be + // recycled, it need to be copied. + value = append([]byte{}, data.Value()...) + } + } + data.Release() + return +} + +// Find finds key/value pair whose key is greater than or equal to the +// given key. It returns ErrNotFound if the table doesn't contain +// such pair. +// If filtered is true then the nearest 'block' will be checked against +// 'filter data' (if present) and will immediately return ErrNotFound if +// 'filter data' indicates that such pair doesn't exist. +// +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. +func (r *Reader) Find(key []byte, filtered bool, ro *opt.ReadOptions) (rkey, value []byte, err error) { + return r.find(key, filtered, ro, false) +} + +// FindKey finds key that is greater than or equal to the given key. +// It returns ErrNotFound if the table doesn't contain such key. +// If filtered is true then the nearest 'block' will be checked against +// 'filter data' (if present) and will immediately return ErrNotFound if +// 'filter data' indicates that such key doesn't exist. +// +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. +func (r *Reader) FindKey(key []byte, filtered bool, ro *opt.ReadOptions) (rkey []byte, err error) { + rkey, _, err = r.find(key, filtered, ro, true) + return +} + +// Get gets the value for the given key. It returns errors.ErrNotFound +// if the table does not contain the key. +// +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. +func (r *Reader) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + err = r.err + return + } + + rkey, value, err := r.find(key, false, ro, false) + if err == nil && r.cmp.Compare(rkey, key) != 0 { + value = nil + err = ErrNotFound + } + return +} + +// OffsetOf returns approximate offset for the given key. +// +// It is safe to modify the contents of the argument after Get returns. 
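One common use of the approximate offset is estimating how many file bytes a key range spans, which is how the database layer answers size queries. A minimal sketch, assuming an open *Reader; the helper name is hypothetical and not part of this file:

// approxRangeSize estimates the number of table-file bytes covered by the
// key range [start, limit). Offsets are block-granular, so the result is
// only an approximation. (Hypothetical helper, for illustration.)
func approxRangeSize(r *Reader, start, limit []byte) (int64, error) {
	startOff, err := r.OffsetOf(start)
	if err != nil {
		return 0, err
	}
	limitOff, err := r.OffsetOf(limit)
	if err != nil {
		return 0, err
	}
	return limitOff - startOff, nil
}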
+func (r *Reader) OffsetOf(key []byte) (offset int64, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + err = r.err + return + } + + indexBlock, rel, err := r.readBlockCached(r.indexBH, true, true) + if err != nil { + return + } + defer rel.Release() + + index := r.newBlockIter(indexBlock, nil, nil, true) + defer index.Release() + if index.Seek(key) { + dataBH, n := decodeBlockHandle(index.Value()) + if n == 0 { + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") + return + } + offset = int64(dataBH.offset) + return + } + err = index.Error() + if err == nil { + offset = r.dataEnd + } + return +} + +// Release implements util.Releaser. +// It also close the file if it is an io.Closer. +func (r *Reader) Release() { + r.mu.Lock() + defer r.mu.Unlock() + + if closer, ok := r.reader.(io.Closer); ok { + closer.Close() + } + if r.indexBlock != nil { + r.indexBlock.Release() + r.indexBlock = nil + } + if r.filterBlock != nil { + r.filterBlock.Release() + r.filterBlock = nil + } + r.reader = nil + r.cache = nil + r.bpool = nil + r.err = ErrReaderReleased +} + +// NewReader creates a new initialized table reader for the file. +// The fi, cache and bpool is optional and can be nil. +// +// The returned table reader instance is safe for concurrent use. +func NewReader(f io.ReaderAt, size int64, fd storage.FileDesc, cache *cache.NamespaceGetter, bpool *util.BufferPool, o *opt.Options) (*Reader, error) { + if f == nil { + return nil, errors.New("leveldb/table: nil file") + } + + r := &Reader{ + fd: fd, + reader: f, + cache: cache, + bpool: bpool, + o: o, + cmp: o.GetComparer(), + verifyChecksum: o.GetStrict(opt.StrictBlockChecksum), + } + + if size < footerLen { + r.err = r.newErrCorrupted(0, size, "table", "too small") + return r, nil + } + + footerPos := size - footerLen + var footer [footerLen]byte + if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF { + return nil, err + } + if string(footer[footerLen-len(magic):footerLen]) != magic { + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number") + return r, nil + } + + var n int + // Decode the metaindex block handle. + r.metaBH, n = decodeBlockHandle(footer[:]) + if n == 0 { + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle") + return r, nil + } + + // Decode the index block handle. + r.indexBH, n = decodeBlockHandle(footer[n:]) + if n == 0 { + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle") + return r, nil + } + + // Read metaindex block. + metaBlock, err := r.readBlock(r.metaBH, true) + if err != nil { + if errors.IsCorrupted(err) { + r.err = err + return r, nil + } + return nil, err + } + + // Set data end. + r.dataEnd = int64(r.metaBH.offset) + + // Read metaindex. + metaIter := r.newBlockIter(metaBlock, nil, nil, true) + for metaIter.Next() { + key := string(metaIter.Key()) + if !strings.HasPrefix(key, "filter.") { + continue + } + fn := key[7:] + if f0 := o.GetFilter(); f0 != nil && f0.Name() == fn { + r.filter = f0 + } else { + for _, f0 := range o.GetAltFilters() { + if f0.Name() == fn { + r.filter = f0 + break + } + } + } + if r.filter != nil { + filterBH, n := decodeBlockHandle(metaIter.Value()) + if n == 0 { + continue + } + r.filterBH = filterBH + // Update data end. + r.dataEnd = int64(filterBH.offset) + break + } + } + metaIter.Release() + metaBlock.Release() + + // Cache index and filter block locally, since we don't have global cache. 
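// Without an external cache.NamespaceGetter there is nowhere to keep
// decoded blocks between calls, so the index block (and the filter block,
// if one was selected above) is read once here and pinned on the Reader for
// its whole lifetime; getIndexBlock and getFilterBlock then hand out the
// pinned copy with a no-op releaser. A corrupted filter block merely
// disables filtering, while a corrupted index block poisons the reader.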
+ if cache == nil { + r.indexBlock, err = r.readBlock(r.indexBH, true) + if err != nil { + if errors.IsCorrupted(err) { + r.err = err + return r, nil + } + return nil, err + } + if r.filter != nil { + r.filterBlock, err = r.readFilterBlock(r.filterBH) + if err != nil { + if !errors.IsCorrupted(err) { + return nil, err + } + + // Don't use filter then. + r.filter = nil + } + } + } + + return r, nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/table/table.go b/vendor/github.com/syndtr/goleveldb/leveldb/table/table.go new file mode 100644 index 000000000000..beacdc1f024a --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/table/table.go @@ -0,0 +1,177 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package table allows read and write sorted key/value. +package table + +import ( + "encoding/binary" +) + +/* +Table: + +Table is consist of one or more data blocks, an optional filter block +a metaindex block, an index block and a table footer. Metaindex block +is a special block used to keep parameters of the table, such as filter +block name and its block handle. Index block is a special block used to +keep record of data blocks offset and length, index block use one as +restart interval. The key used by index block are the last key of preceding +block, shorter separator of adjacent blocks or shorter successor of the +last key of the last block. Filter block is an optional block contains +sequence of filter data generated by a filter generator. + +Table data structure: + + optional + / + +--------------+--------------+--------------+------+-------+-----------------+-------------+--------+ + | data block 1 | ... | data block n | filter block | metaindex block | index block | footer | + +--------------+--------------+--------------+--------------+-----------------+-------------+--------+ + + Each block followed by a 5-bytes trailer contains compression type and checksum. + +Table block trailer: + + +---------------------------+-------------------+ + | compression type (1-byte) | checksum (4-byte) | + +---------------------------+-------------------+ + + The checksum is a CRC-32 computed using Castagnoli's polynomial. Compression + type also included in the checksum. + +Table footer: + + +------------------- 40-bytes -------------------+ + / \ + +------------------------+--------------------+------+-----------------+ + | metaindex block handle / index block handle / ---- | magic (8-bytes) | + +------------------------+--------------------+------+-----------------+ + + The magic are first 64-bit of SHA-1 sum of "http://code.google.com/p/leveldb/". + +NOTE: All fixed-length integer are little-endian. +*/ + +/* +Block: + +Block is consist of one or more key/value entries and a block trailer. +Block entry shares key prefix with its preceding key until a restart +point reached. A block should contains at least one restart point. +First restart point are always zero. + +Block data structure: + + + restart point + restart point (depends on restart interval) + / / + +---------------+---------------+---------------+---------------+---------+ + | block entry 1 | block entry 2 | ... 
| block entry n | trailer | + +---------------+---------------+---------------+---------------+---------+ + +Key/value entry: + + +---- key len ----+ + / \ + +-------+---------+-----------+---------+--------------------+--------------+----------------+ + | shared (varint) | not shared (varint) | value len (varint) | key (varlen) | value (varlen) | + +-----------------+---------------------+--------------------+--------------+----------------+ + + Block entry shares key prefix with its preceding key: + Conditions: + restart_interval=2 + entry one : key=deck,value=v1 + entry two : key=dock,value=v2 + entry three: key=duck,value=v3 + The entries will be encoded as follow: + + + restart point (offset=0) + restart point (offset=16) + / / + +-----+-----+-----+----------+--------+-----+-----+-----+---------+--------+-----+-----+-----+----------+--------+ + | 0 | 4 | 2 | "deck" | "v1" | 1 | 3 | 2 | "ock" | "v2" | 0 | 4 | 2 | "duck" | "v3" | + +-----+-----+-----+----------+--------+-----+-----+-----+---------+--------+-----+-----+-----+----------+--------+ + \ / \ / \ / + +----------- entry one -----------+ +----------- entry two ----------+ +---------- entry three ----------+ + + The block trailer will contains two restart points: + + +------------+-----------+--------+ + | 0 | 16 | 2 | + +------------+-----------+---+----+ + \ / \ + +-- restart points --+ + restart points length + +Block trailer: + + +-- 4-bytes --+ + / \ + +-----------------+-----------------+-----------------+------------------------------+ + | restart point 1 | .... | restart point n | restart points len (4-bytes) | + +-----------------+-----------------+-----------------+------------------------------+ + + +NOTE: All fixed-length integer are little-endian. +*/ + +/* +Filter block: + +Filter block consist of one or more filter data and a filter block trailer. +The trailer contains filter data offsets, a trailer offset and a 1-byte base Lg. + +Filter block data structure: + + + offset 1 + offset 2 + offset n + trailer offset + / / / / + +---------------+---------------+---------------+---------+ + | filter data 1 | ... | filter data n | trailer | + +---------------+---------------+---------------+---------+ + +Filter block trailer: + + +- 4-bytes -+ + / \ + +---------------+---------------+---------------+-------------------------------+------------------+ + | data 1 offset | .... | data n offset | data-offsets offset (4-bytes) | base Lg (1-byte) | + +-------------- +---------------+---------------+-------------------------------+------------------+ + + +NOTE: All fixed-length integer are little-endian. +*/ + +const ( + blockTrailerLen = 5 + footerLen = 48 + + magic = "\x57\xfb\x80\x8b\x24\x75\x47\xdb" + + // The block type gives the per-block compression format. + // These constants are part of the file format and should not be changed. 
+ blockTypeNoCompression = 0 + blockTypeSnappyCompression = 1 + + // Generate new filter every 2KB of data + filterBaseLg = 11 + filterBase = 1 << filterBaseLg +) + +type blockHandle struct { + offset, length uint64 +} + +func decodeBlockHandle(src []byte) (blockHandle, int) { + offset, n := binary.Uvarint(src) + length, m := binary.Uvarint(src[n:]) + if n == 0 || m == 0 { + return blockHandle{}, 0 + } + return blockHandle{offset, length}, n + m +} + +func encodeBlockHandle(dst []byte, b blockHandle) int { + n := binary.PutUvarint(dst, b.offset) + m := binary.PutUvarint(dst[n:], b.length) + return n + m +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/table/writer.go b/vendor/github.com/syndtr/goleveldb/leveldb/table/writer.go new file mode 100644 index 000000000000..b96b271d8ddb --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/table/writer.go @@ -0,0 +1,375 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package table + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + + "github.com/golang/snappy" + + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +func sharedPrefixLen(a, b []byte) int { + i, n := 0, len(a) + if n > len(b) { + n = len(b) + } + for i < n && a[i] == b[i] { + i++ + } + return i +} + +type blockWriter struct { + restartInterval int + buf util.Buffer + nEntries int + prevKey []byte + restarts []uint32 + scratch []byte +} + +func (w *blockWriter) append(key, value []byte) { + nShared := 0 + if w.nEntries%w.restartInterval == 0 { + w.restarts = append(w.restarts, uint32(w.buf.Len())) + } else { + nShared = sharedPrefixLen(w.prevKey, key) + } + n := binary.PutUvarint(w.scratch[0:], uint64(nShared)) + n += binary.PutUvarint(w.scratch[n:], uint64(len(key)-nShared)) + n += binary.PutUvarint(w.scratch[n:], uint64(len(value))) + w.buf.Write(w.scratch[:n]) + w.buf.Write(key[nShared:]) + w.buf.Write(value) + w.prevKey = append(w.prevKey[:0], key...) + w.nEntries++ +} + +func (w *blockWriter) finish() { + // Write restarts entry. + if w.nEntries == 0 { + // Must have at least one restart entry. + w.restarts = append(w.restarts, 0) + } + w.restarts = append(w.restarts, uint32(len(w.restarts))) + for _, x := range w.restarts { + buf4 := w.buf.Alloc(4) + binary.LittleEndian.PutUint32(buf4, x) + } +} + +func (w *blockWriter) reset() { + w.buf.Reset() + w.nEntries = 0 + w.restarts = w.restarts[:0] +} + +func (w *blockWriter) bytesLen() int { + restartsLen := len(w.restarts) + if restartsLen == 0 { + restartsLen = 1 + } + return w.buf.Len() + 4*restartsLen + 4 +} + +type filterWriter struct { + generator filter.FilterGenerator + buf util.Buffer + nKeys int + offsets []uint32 +} + +func (w *filterWriter) add(key []byte) { + if w.generator == nil { + return + } + w.generator.Add(key) + w.nKeys++ +} + +func (w *filterWriter) flush(offset uint64) { + if w.generator == nil { + return + } + for x := int(offset / filterBase); x > len(w.offsets); { + w.generate() + } +} + +func (w *filterWriter) finish() { + if w.generator == nil { + return + } + // Generate last keys. 
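// Keys added since the last filterBase (2 KiB) boundary have not been
// turned into a filter yet, so emit one final filter for them before the
// offset table and the trailing base-Lg byte are appended.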
+ + if w.nKeys > 0 { + w.generate() + } + w.offsets = append(w.offsets, uint32(w.buf.Len())) + for _, x := range w.offsets { + buf4 := w.buf.Alloc(4) + binary.LittleEndian.PutUint32(buf4, x) + } + w.buf.WriteByte(filterBaseLg) +} + +func (w *filterWriter) generate() { + // Record offset. + w.offsets = append(w.offsets, uint32(w.buf.Len())) + // Generate filters. + if w.nKeys > 0 { + w.generator.Generate(&w.buf) + w.nKeys = 0 + } +} + +// Writer is a table writer. +type Writer struct { + writer io.Writer + err error + // Options + cmp comparer.Comparer + filter filter.Filter + compression opt.Compression + blockSize int + + dataBlock blockWriter + indexBlock blockWriter + filterBlock filterWriter + pendingBH blockHandle + offset uint64 + nEntries int + // Scratch allocated enough for 5 uvarint. Block writer should not use + // first 20-bytes since it will be used to encode block handle, which + // then passed to the block writer itself. + scratch [50]byte + comparerScratch []byte + compressionScratch []byte +} + +func (w *Writer) writeBlock(buf *util.Buffer, compression opt.Compression) (bh blockHandle, err error) { + // Compress the buffer if necessary. + var b []byte + if compression == opt.SnappyCompression { + // Allocate scratch enough for compression and block trailer. + if n := snappy.MaxEncodedLen(buf.Len()) + blockTrailerLen; len(w.compressionScratch) < n { + w.compressionScratch = make([]byte, n) + } + compressed := snappy.Encode(w.compressionScratch, buf.Bytes()) + n := len(compressed) + b = compressed[:n+blockTrailerLen] + b[n] = blockTypeSnappyCompression + } else { + tmp := buf.Alloc(blockTrailerLen) + tmp[0] = blockTypeNoCompression + b = buf.Bytes() + } + + // Calculate the checksum. + n := len(b) - 4 + checksum := util.NewCRC(b[:n]).Value() + binary.LittleEndian.PutUint32(b[n:], checksum) + + // Write the buffer to the file. + _, err = w.writer.Write(b) + if err != nil { + return + } + bh = blockHandle{w.offset, uint64(len(b) - blockTrailerLen)} + w.offset += uint64(len(b)) + return +} + +func (w *Writer) flushPendingBH(key []byte) { + if w.pendingBH.length == 0 { + return + } + var separator []byte + if len(key) == 0 { + separator = w.cmp.Successor(w.comparerScratch[:0], w.dataBlock.prevKey) + } else { + separator = w.cmp.Separator(w.comparerScratch[:0], w.dataBlock.prevKey, key) + } + if separator == nil { + separator = w.dataBlock.prevKey + } else { + w.comparerScratch = separator + } + n := encodeBlockHandle(w.scratch[:20], w.pendingBH) + // Append the block handle to the index block. + w.indexBlock.append(separator, w.scratch[:n]) + // Reset prev key of the data block. + w.dataBlock.prevKey = w.dataBlock.prevKey[:0] + // Clear pending block handle. + w.pendingBH = blockHandle{} +} + +func (w *Writer) finishBlock() error { + w.dataBlock.finish() + bh, err := w.writeBlock(&w.dataBlock.buf, w.compression) + if err != nil { + return err + } + w.pendingBH = bh + // Reset the data block. + w.dataBlock.reset() + // Flush the filter block. + w.filterBlock.flush(w.offset) + return nil +} + +// Append appends key/value pair to the table. The keys passed must +// be in increasing order. +// +// It is safe to modify the contents of the arguments after Append returns. 
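A minimal end-to-end sketch of this package's writer and reader working together, assuming only zero-value opt.Options (default comparer and block size, no filter) and using an in-memory buffer in place of a table file:

package main

import (
	"bytes"
	"fmt"

	"github.com/syndtr/goleveldb/leveldb/opt"
	"github.com/syndtr/goleveldb/leveldb/storage"
	"github.com/syndtr/goleveldb/leveldb/table"
)

func main() {
	o := &opt.Options{}
	var buf bytes.Buffer

	// Write a tiny table; keys must be appended in increasing order.
	tw := table.NewWriter(&buf, o)
	for _, kv := range [][2]string{{"apple", "1"}, {"banana", "2"}, {"cherry", "3"}} {
		if err := tw.Append([]byte(kv[0]), []byte(kv[1])); err != nil {
			panic(err)
		}
	}
	if err := tw.Close(); err != nil {
		panic(err)
	}

	// Read it back; the cache and buffer pool arguments are optional.
	tr, err := table.NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()),
		storage.FileDesc{}, nil, nil, o)
	if err != nil {
		panic(err)
	}
	defer tr.Release()

	value, err := tr.Get([]byte("banana"), nil)
	if err != nil {
		panic(err)
	}
	fmt.Printf("banana => %s\n", value) // prints: banana => 2
}

Inside the database the same two types are driven by tOps above, which supplies the file-handle cache, per-table block-cache namespace, and shared buffer pool instead of the nils used here.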
+func (w *Writer) Append(key, value []byte) error { + if w.err != nil { + return w.err + } + if w.nEntries > 0 && w.cmp.Compare(w.dataBlock.prevKey, key) >= 0 { + w.err = fmt.Errorf("leveldb/table: Writer: keys are not in increasing order: %q, %q", w.dataBlock.prevKey, key) + return w.err + } + + w.flushPendingBH(key) + // Append key/value pair to the data block. + w.dataBlock.append(key, value) + // Add key to the filter block. + w.filterBlock.add(key) + + // Finish the data block if block size target reached. + if w.dataBlock.bytesLen() >= w.blockSize { + if err := w.finishBlock(); err != nil { + w.err = err + return w.err + } + } + w.nEntries++ + return nil +} + +// BlocksLen returns number of blocks written so far. +func (w *Writer) BlocksLen() int { + n := w.indexBlock.nEntries + if w.pendingBH.length > 0 { + // Includes the pending block. + n++ + } + return n +} + +// EntriesLen returns number of entries added so far. +func (w *Writer) EntriesLen() int { + return w.nEntries +} + +// BytesLen returns number of bytes written so far. +func (w *Writer) BytesLen() int { + return int(w.offset) +} + +// Close will finalize the table. Calling Append is not possible +// after Close, but calling BlocksLen, EntriesLen and BytesLen +// is still possible. +func (w *Writer) Close() error { + if w.err != nil { + return w.err + } + + // Write the last data block. Or empty data block if there + // aren't any data blocks at all. + if w.dataBlock.nEntries > 0 || w.nEntries == 0 { + if err := w.finishBlock(); err != nil { + w.err = err + return w.err + } + } + w.flushPendingBH(nil) + + // Write the filter block. + var filterBH blockHandle + w.filterBlock.finish() + if buf := &w.filterBlock.buf; buf.Len() > 0 { + filterBH, w.err = w.writeBlock(buf, opt.NoCompression) + if w.err != nil { + return w.err + } + } + + // Write the metaindex block. + if filterBH.length > 0 { + key := []byte("filter." + w.filter.Name()) + n := encodeBlockHandle(w.scratch[:20], filterBH) + w.dataBlock.append(key, w.scratch[:n]) + } + w.dataBlock.finish() + metaindexBH, err := w.writeBlock(&w.dataBlock.buf, w.compression) + if err != nil { + w.err = err + return w.err + } + + // Write the index block. + w.indexBlock.finish() + indexBH, err := w.writeBlock(&w.indexBlock.buf, w.compression) + if err != nil { + w.err = err + return w.err + } + + // Write the table footer. + footer := w.scratch[:footerLen] + for i := range footer { + footer[i] = 0 + } + n := encodeBlockHandle(footer, metaindexBH) + encodeBlockHandle(footer[n:], indexBH) + copy(footer[footerLen-len(magic):], magic) + if _, err := w.writer.Write(footer); err != nil { + w.err = err + return w.err + } + w.offset += footerLen + + w.err = errors.New("leveldb/table: writer is closed") + return nil +} + +// NewWriter creates a new initialized table writer for the file. +// +// Table writer is not safe for concurrent use. +func NewWriter(f io.Writer, o *opt.Options) *Writer { + w := &Writer{ + writer: f, + cmp: o.GetComparer(), + filter: o.GetFilter(), + compression: o.GetCompression(), + blockSize: o.GetBlockSize(), + comparerScratch: make([]byte, 0), + } + // data block + w.dataBlock.restartInterval = o.GetBlockRestartInterval() + // The first 20-bytes are used for encoding block handle. 
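// A block handle is two uvarints (offset and length), each at most
// binary.MaxVarintLen64 = 10 bytes, so the reserved 20 bytes are exactly
// enough for encodeBlockHandle in flushPendingBH; handing the block writers
// w.scratch[20:] keeps their own varint headers from clobbering it.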
+ w.dataBlock.scratch = w.scratch[20:] + // index block + w.indexBlock.restartInterval = 1 + w.indexBlock.scratch = w.scratch[20:] + // filter block + if w.filter != nil { + w.filterBlock.generator = w.filter.NewGenerator() + w.filterBlock.flush(0) + } + return w +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util.go b/vendor/github.com/syndtr/goleveldb/leveldb/util.go new file mode 100644 index 000000000000..0e2b519e5c71 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util.go @@ -0,0 +1,98 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "sort" + + "github.com/syndtr/goleveldb/leveldb/storage" +) + +func shorten(str string) string { + if len(str) <= 8 { + return str + } + return str[:3] + ".." + str[len(str)-3:] +} + +var bunits = [...]string{"", "Ki", "Mi", "Gi", "Ti"} + +func shortenb(bytes int) string { + i := 0 + for ; bytes > 1024 && i < 4; i++ { + bytes /= 1024 + } + return fmt.Sprintf("%d%sB", bytes, bunits[i]) +} + +func sshortenb(bytes int) string { + if bytes == 0 { + return "~" + } + sign := "+" + if bytes < 0 { + sign = "-" + bytes *= -1 + } + i := 0 + for ; bytes > 1024 && i < 4; i++ { + bytes /= 1024 + } + return fmt.Sprintf("%s%d%sB", sign, bytes, bunits[i]) +} + +func sint(x int) string { + if x == 0 { + return "~" + } + sign := "+" + if x < 0 { + sign = "-" + x *= -1 + } + return fmt.Sprintf("%s%d", sign, x) +} + +func minInt(a, b int) int { + if a < b { + return a + } + return b +} + +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} + +type fdSorter []storage.FileDesc + +func (p fdSorter) Len() int { + return len(p) +} + +func (p fdSorter) Less(i, j int) bool { + return p[i].Num < p[j].Num +} + +func (p fdSorter) Swap(i, j int) { + p[i], p[j] = p[j], p[i] +} + +func sortFds(fds []storage.FileDesc) { + sort.Sort(fdSorter(fds)) +} + +func ensureBuffer(b []byte, n int) []byte { + if cap(b) < n { + return make([]byte, n) + } + return b[:n] +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer.go new file mode 100644 index 000000000000..21de242552d2 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer.go @@ -0,0 +1,293 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package util + +// This a copy of Go std bytes.Buffer with some modification +// and some features stripped. + +import ( + "bytes" + "io" +) + +// A Buffer is a variable-sized buffer of bytes with Read and Write methods. +// The zero value for Buffer is an empty buffer ready to use. +type Buffer struct { + buf []byte // contents are the bytes buf[off : len(buf)] + off int // read at &buf[off], write at &buf[len(buf)] + bootstrap [64]byte // memory to hold first slice; helps small buffers (Printf) avoid allocation. +} + +// Bytes returns a slice of the contents of the unread portion of the buffer; +// len(b.Bytes()) == b.Len(). If the caller changes the contents of the +// returned slice, the contents of the buffer will change provided there +// are no intervening method calls on the Buffer. +func (b *Buffer) Bytes() []byte { return b.buf[b.off:] } + +// String returns the contents of the unread portion of the buffer +// as a string. If the Buffer is a nil pointer, it returns "". 
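The main addition over this stripped-down copy of bytes.Buffer is Alloc, which grows the buffer and returns the newly reserved bytes in place; the block and filter writers above use it to append fixed-width offsets without an intermediate slice. A small self-contained sketch (the offset values are borrowed from the restart-point example in table.go):

package main

import (
	"encoding/binary"
	"fmt"

	"github.com/syndtr/goleveldb/leveldb/util"
)

func main() {
	// Encode three little-endian uint32 values directly into the buffer,
	// the same pattern blockWriter.finish uses for its restart points.
	var b util.Buffer
	for _, off := range []uint32{0, 16, 2} {
		binary.LittleEndian.PutUint32(b.Alloc(4), off)
	}
	fmt.Println(b.Len()) // 12
}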
+func (b *Buffer) String() string { + if b == nil { + // Special case, useful in debugging. + return "" + } + return string(b.buf[b.off:]) +} + +// Len returns the number of bytes of the unread portion of the buffer; +// b.Len() == len(b.Bytes()). +func (b *Buffer) Len() int { return len(b.buf) - b.off } + +// Truncate discards all but the first n unread bytes from the buffer. +// It panics if n is negative or greater than the length of the buffer. +func (b *Buffer) Truncate(n int) { + switch { + case n < 0 || n > b.Len(): + panic("leveldb/util.Buffer: truncation out of range") + case n == 0: + // Reuse buffer space. + b.off = 0 + } + b.buf = b.buf[0 : b.off+n] +} + +// Reset resets the buffer so it has no content. +// b.Reset() is the same as b.Truncate(0). +func (b *Buffer) Reset() { b.Truncate(0) } + +// grow grows the buffer to guarantee space for n more bytes. +// It returns the index where bytes should be written. +// If the buffer can't grow it will panic with bytes.ErrTooLarge. +func (b *Buffer) grow(n int) int { + m := b.Len() + // If buffer is empty, reset to recover space. + if m == 0 && b.off != 0 { + b.Truncate(0) + } + if len(b.buf)+n > cap(b.buf) { + var buf []byte + if b.buf == nil && n <= len(b.bootstrap) { + buf = b.bootstrap[0:] + } else if m+n <= cap(b.buf)/2 { + // We can slide things down instead of allocating a new + // slice. We only need m+n <= cap(b.buf) to slide, but + // we instead let capacity get twice as large so we + // don't spend all our time copying. + copy(b.buf[:], b.buf[b.off:]) + buf = b.buf[:m] + } else { + // not enough space anywhere + buf = makeSlice(2*cap(b.buf) + n) + copy(buf, b.buf[b.off:]) + } + b.buf = buf + b.off = 0 + } + b.buf = b.buf[0 : b.off+m+n] + return b.off + m +} + +// Alloc allocs n bytes of slice from the buffer, growing the buffer as +// needed. If n is negative, Alloc will panic. +// If the buffer can't grow it will panic with bytes.ErrTooLarge. +func (b *Buffer) Alloc(n int) []byte { + if n < 0 { + panic("leveldb/util.Buffer.Alloc: negative count") + } + m := b.grow(n) + return b.buf[m:] +} + +// Grow grows the buffer's capacity, if necessary, to guarantee space for +// another n bytes. After Grow(n), at least n bytes can be written to the +// buffer without another allocation. +// If n is negative, Grow will panic. +// If the buffer can't grow it will panic with bytes.ErrTooLarge. +func (b *Buffer) Grow(n int) { + if n < 0 { + panic("leveldb/util.Buffer.Grow: negative count") + } + m := b.grow(n) + b.buf = b.buf[0:m] +} + +// Write appends the contents of p to the buffer, growing the buffer as +// needed. The return value n is the length of p; err is always nil. If the +// buffer becomes too large, Write will panic with bytes.ErrTooLarge. +func (b *Buffer) Write(p []byte) (n int, err error) { + m := b.grow(len(p)) + return copy(b.buf[m:], p), nil +} + +// MinRead is the minimum slice size passed to a Read call by +// Buffer.ReadFrom. As long as the Buffer has at least MinRead bytes beyond +// what is required to hold the contents of r, ReadFrom will not grow the +// underlying buffer. +const MinRead = 512 + +// ReadFrom reads data from r until EOF and appends it to the buffer, growing +// the buffer as needed. The return value n is the number of bytes read. Any +// error except io.EOF encountered during the read is also returned. If the +// buffer becomes too large, ReadFrom will panic with bytes.ErrTooLarge. +func (b *Buffer) ReadFrom(r io.Reader) (n int64, err error) { + // If buffer is empty, reset to recover space. 
+ if b.off >= len(b.buf) { + b.Truncate(0) + } + for { + if free := cap(b.buf) - len(b.buf); free < MinRead { + // not enough space at end + newBuf := b.buf + if b.off+free < MinRead { + // not enough space using beginning of buffer; + // double buffer capacity + newBuf = makeSlice(2*cap(b.buf) + MinRead) + } + copy(newBuf, b.buf[b.off:]) + b.buf = newBuf[:len(b.buf)-b.off] + b.off = 0 + } + m, e := r.Read(b.buf[len(b.buf):cap(b.buf)]) + b.buf = b.buf[0 : len(b.buf)+m] + n += int64(m) + if e == io.EOF { + break + } + if e != nil { + return n, e + } + } + return n, nil // err is EOF, so return nil explicitly +} + +// makeSlice allocates a slice of size n. If the allocation fails, it panics +// with bytes.ErrTooLarge. +func makeSlice(n int) []byte { + // If the make fails, give a known error. + defer func() { + if recover() != nil { + panic(bytes.ErrTooLarge) + } + }() + return make([]byte, n) +} + +// WriteTo writes data to w until the buffer is drained or an error occurs. +// The return value n is the number of bytes written; it always fits into an +// int, but it is int64 to match the io.WriterTo interface. Any error +// encountered during the write is also returned. +func (b *Buffer) WriteTo(w io.Writer) (n int64, err error) { + if b.off < len(b.buf) { + nBytes := b.Len() + m, e := w.Write(b.buf[b.off:]) + if m > nBytes { + panic("leveldb/util.Buffer.WriteTo: invalid Write count") + } + b.off += m + n = int64(m) + if e != nil { + return n, e + } + // all bytes should have been written, by definition of + // Write method in io.Writer + if m != nBytes { + return n, io.ErrShortWrite + } + } + // Buffer is now empty; reset. + b.Truncate(0) + return +} + +// WriteByte appends the byte c to the buffer, growing the buffer as needed. +// The returned error is always nil, but is included to match bufio.Writer's +// WriteByte. If the buffer becomes too large, WriteByte will panic with +// bytes.ErrTooLarge. +func (b *Buffer) WriteByte(c byte) error { + m := b.grow(1) + b.buf[m] = c + return nil +} + +// Read reads the next len(p) bytes from the buffer or until the buffer +// is drained. The return value n is the number of bytes read. If the +// buffer has no data to return, err is io.EOF (unless len(p) is zero); +// otherwise it is nil. +func (b *Buffer) Read(p []byte) (n int, err error) { + if b.off >= len(b.buf) { + // Buffer is empty, reset to recover space. + b.Truncate(0) + if len(p) == 0 { + return + } + return 0, io.EOF + } + n = copy(p, b.buf[b.off:]) + b.off += n + return +} + +// Next returns a slice containing the next n bytes from the buffer, +// advancing the buffer as if the bytes had been returned by Read. +// If there are fewer than n bytes in the buffer, Next returns the entire buffer. +// The slice is only valid until the next call to a read or write method. +func (b *Buffer) Next(n int) []byte { + m := b.Len() + if n > m { + n = m + } + data := b.buf[b.off : b.off+n] + b.off += n + return data +} + +// ReadByte reads and returns the next byte from the buffer. +// If no byte is available, it returns error io.EOF. +func (b *Buffer) ReadByte() (c byte, err error) { + if b.off >= len(b.buf) { + // Buffer is empty, reset to recover space. + b.Truncate(0) + return 0, io.EOF + } + c = b.buf[b.off] + b.off++ + return c, nil +} + +// ReadBytes reads until the first occurrence of delim in the input, +// returning a slice containing the data up to and including the delimiter. 
+// If ReadBytes encounters an error before finding a delimiter, +// it returns the data read before the error and the error itself (often io.EOF). +// ReadBytes returns err != nil if and only if the returned data does not end in +// delim. +func (b *Buffer) ReadBytes(delim byte) (line []byte, err error) { + slice, err := b.readSlice(delim) + // return a copy of slice. The buffer's backing array may + // be overwritten by later calls. + line = append(line, slice...) + return +} + +// readSlice is like ReadBytes but returns a reference to internal buffer data. +func (b *Buffer) readSlice(delim byte) (line []byte, err error) { + i := bytes.IndexByte(b.buf[b.off:], delim) + end := b.off + i + 1 + if i < 0 { + end = len(b.buf) + err = io.EOF + } + line = b.buf[b.off:end] + b.off = end + return line, err +} + +// NewBuffer creates and initializes a new Buffer using buf as its initial +// contents. It is intended to prepare a Buffer to read existing data. It +// can also be used to size the internal buffer for writing. To do that, +// buf should have the desired capacity but a length of zero. +// +// In most cases, new(Buffer) (or just declaring a Buffer variable) is +// sufficient to initialize a Buffer. +func NewBuffer(buf []byte) *Buffer { return &Buffer{buf: buf} } diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go new file mode 100644 index 000000000000..2f3db974a796 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go @@ -0,0 +1,239 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package util + +import ( + "fmt" + "sync" + "sync/atomic" + "time" +) + +type buffer struct { + b []byte + miss int +} + +// BufferPool is a 'buffer pool'. +type BufferPool struct { + pool [6]chan []byte + size [5]uint32 + sizeMiss [5]uint32 + sizeHalf [5]uint32 + baseline [4]int + baseline0 int + + mu sync.RWMutex + closed bool + closeC chan struct{} + + get uint32 + put uint32 + half uint32 + less uint32 + equal uint32 + greater uint32 + miss uint32 +} + +func (p *BufferPool) poolNum(n int) int { + if n <= p.baseline0 && n > p.baseline0/2 { + return 0 + } + for i, x := range p.baseline { + if n <= x { + return i + 1 + } + } + return len(p.baseline) + 1 +} + +// Get returns buffer with length of n. +func (p *BufferPool) Get(n int) []byte { + if p == nil { + return make([]byte, n) + } + + p.mu.RLock() + defer p.mu.RUnlock() + + if p.closed { + return make([]byte, n) + } + + atomic.AddUint32(&p.get, 1) + + poolNum := p.poolNum(n) + pool := p.pool[poolNum] + if poolNum == 0 { + // Fast path. 
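// Pool 0 holds buffers around the baseline size. If a pooled buffer is at
// least twice as large as requested it is put back and a right-sized slice
// is allocated instead (counted as a "half" hit); if it is merely large
// enough it is re-sliced and returned; if it is too small it is dropped.
// On a miss a fresh slice with the baseline capacity is handed out so it
// can be pooled later.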
+ select { + case b := <-pool: + switch { + case cap(b) > n: + if cap(b)-n >= n { + atomic.AddUint32(&p.half, 1) + select { + case pool <- b: + default: + } + return make([]byte, n) + } else { + atomic.AddUint32(&p.less, 1) + return b[:n] + } + case cap(b) == n: + atomic.AddUint32(&p.equal, 1) + return b[:n] + default: + atomic.AddUint32(&p.greater, 1) + } + default: + atomic.AddUint32(&p.miss, 1) + } + + return make([]byte, n, p.baseline0) + } else { + sizePtr := &p.size[poolNum-1] + + select { + case b := <-pool: + switch { + case cap(b) > n: + if cap(b)-n >= n { + atomic.AddUint32(&p.half, 1) + sizeHalfPtr := &p.sizeHalf[poolNum-1] + if atomic.AddUint32(sizeHalfPtr, 1) == 20 { + atomic.StoreUint32(sizePtr, uint32(cap(b)/2)) + atomic.StoreUint32(sizeHalfPtr, 0) + } else { + select { + case pool <- b: + default: + } + } + return make([]byte, n) + } else { + atomic.AddUint32(&p.less, 1) + return b[:n] + } + case cap(b) == n: + atomic.AddUint32(&p.equal, 1) + return b[:n] + default: + atomic.AddUint32(&p.greater, 1) + if uint32(cap(b)) >= atomic.LoadUint32(sizePtr) { + select { + case pool <- b: + default: + } + } + } + default: + atomic.AddUint32(&p.miss, 1) + } + + if size := atomic.LoadUint32(sizePtr); uint32(n) > size { + if size == 0 { + atomic.CompareAndSwapUint32(sizePtr, 0, uint32(n)) + } else { + sizeMissPtr := &p.sizeMiss[poolNum-1] + if atomic.AddUint32(sizeMissPtr, 1) == 20 { + atomic.StoreUint32(sizePtr, uint32(n)) + atomic.StoreUint32(sizeMissPtr, 0) + } + } + return make([]byte, n) + } else { + return make([]byte, n, size) + } + } +} + +// Put adds given buffer to the pool. +func (p *BufferPool) Put(b []byte) { + if p == nil { + return + } + + p.mu.RLock() + defer p.mu.RUnlock() + + if p.closed { + return + } + + atomic.AddUint32(&p.put, 1) + + pool := p.pool[p.poolNum(cap(b))] + select { + case pool <- b: + default: + } + +} + +func (p *BufferPool) Close() { + if p == nil { + return + } + + p.mu.Lock() + if !p.closed { + p.closed = true + p.closeC <- struct{}{} + } + p.mu.Unlock() +} + +func (p *BufferPool) String() string { + if p == nil { + return "" + } + + return fmt.Sprintf("BufferPool{B·%d Z·%v Zm·%v Zh·%v G·%d P·%d H·%d <·%d =·%d >·%d M·%d}", + p.baseline0, p.size, p.sizeMiss, p.sizeHalf, p.get, p.put, p.half, p.less, p.equal, p.greater, p.miss) +} + +func (p *BufferPool) drain() { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + for { + select { + case <-ticker.C: + for _, ch := range p.pool { + select { + case <-ch: + default: + } + } + case <-p.closeC: + close(p.closeC) + for _, ch := range p.pool { + close(ch) + } + return + } + } +} + +// NewBufferPool creates a new initialized 'buffer pool'. +func NewBufferPool(baseline int) *BufferPool { + if baseline <= 0 { + panic("baseline can't be <= 0") + } + p := &BufferPool{ + baseline0: baseline, + baseline: [...]int{baseline / 4, baseline / 2, baseline * 2, baseline * 4}, + closeC: make(chan struct{}, 1), + } + for i, cap := range []int{2, 2, 4, 4, 2, 1} { + p.pool[i] = make(chan []byte, cap) + } + go p.drain() + return p +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/crc32.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/crc32.go new file mode 100644 index 000000000000..631c9d6109c9 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/crc32.go @@ -0,0 +1,30 @@ +// Copyright 2011 The LevelDB-Go Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package util + +import ( + "hash/crc32" +) + +var table = crc32.MakeTable(crc32.Castagnoli) + +// CRC is a CRC-32 checksum computed using Castagnoli's polynomial. +type CRC uint32 + +// NewCRC creates a new crc based on the given bytes. +func NewCRC(b []byte) CRC { + return CRC(0).Update(b) +} + +// Update updates the crc with the given bytes. +func (c CRC) Update(b []byte) CRC { + return CRC(crc32.Update(uint32(c), table, b)) +} + +// Value returns a masked crc. +func (c CRC) Value() uint32 { + return uint32(c>>15|c<<17) + 0xa282ead8 +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/hash.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/hash.go new file mode 100644 index 000000000000..7f3fa4e2c79e --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/hash.go @@ -0,0 +1,48 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package util + +import ( + "encoding/binary" +) + +// Hash return hash of the given data. +func Hash(data []byte, seed uint32) uint32 { + // Similar to murmur hash + const ( + m = uint32(0xc6a4a793) + r = uint32(24) + ) + var ( + h = seed ^ (uint32(len(data)) * m) + i int + ) + + for n := len(data) - len(data)%4; i < n; i += 4 { + h += binary.LittleEndian.Uint32(data[i:]) + h *= m + h ^= (h >> 16) + } + + switch len(data) - i { + default: + panic("not reached") + case 3: + h += uint32(data[i+2]) << 16 + fallthrough + case 2: + h += uint32(data[i+1]) << 8 + fallthrough + case 1: + h += uint32(data[i]) + h *= m + h ^= (h >> r) + case 0: + } + + return h +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/range.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/range.go new file mode 100644 index 000000000000..85159583d2c3 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/range.go @@ -0,0 +1,32 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package util + +// Range is a key range. +type Range struct { + // Start of the key range, include in the range. + Start []byte + + // Limit of the key range, not include in the range. + Limit []byte +} + +// BytesPrefix returns key range that satisfy the given prefix. +// This only applicable for the standard 'bytes comparer'. +func BytesPrefix(prefix []byte) *Range { + var limit []byte + for i := len(prefix) - 1; i >= 0; i-- { + c := prefix[i] + if c < 0xff { + limit = make([]byte, i+1) + copy(limit, prefix) + limit[i] = c + 1 + break + } + } + return &Range{prefix, limit} +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/util.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/util.go new file mode 100644 index 000000000000..80614afc58fc --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/util.go @@ -0,0 +1,73 @@ +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package util provides utilities used throughout leveldb. +package util + +import ( + "errors" +) + +var ( + ErrReleased = errors.New("leveldb: resource already relesed") + ErrHasReleaser = errors.New("leveldb: releaser already defined") +) + +// Releaser is the interface that wraps the basic Release method. +type Releaser interface { + // Release releases associated resources. 
Release should always success + // and can be called multiple times without causing error. + Release() +} + +// ReleaseSetter is the interface that wraps the basic SetReleaser method. +type ReleaseSetter interface { + // SetReleaser associates the given releaser to the resources. The + // releaser will be called once coresponding resources released. + // Calling SetReleaser with nil will clear the releaser. + // + // This will panic if a releaser already present or coresponding + // resource is already released. Releaser should be cleared first + // before assigned a new one. + SetReleaser(releaser Releaser) +} + +// BasicReleaser provides basic implementation of Releaser and ReleaseSetter. +type BasicReleaser struct { + releaser Releaser + released bool +} + +// Released returns whether Release method already called. +func (r *BasicReleaser) Released() bool { + return r.released +} + +// Release implements Releaser.Release. +func (r *BasicReleaser) Release() { + if !r.released { + if r.releaser != nil { + r.releaser.Release() + r.releaser = nil + } + r.released = true + } +} + +// SetReleaser implements ReleaseSetter.SetReleaser. +func (r *BasicReleaser) SetReleaser(releaser Releaser) { + if r.released { + panic(ErrReleased) + } + if r.releaser != nil && releaser != nil { + panic(ErrHasReleaser) + } + r.releaser = releaser +} + +type NoopReleaser struct{} + +func (NoopReleaser) Release() {} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/version.go b/vendor/github.com/syndtr/goleveldb/leveldb/version.go new file mode 100644 index 000000000000..73f272af5ff4 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/version.go @@ -0,0 +1,528 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "sync/atomic" + "unsafe" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +type tSet struct { + level int + table *tFile +} + +type version struct { + s *session + + levels []tFiles + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by computeCompaction() + cLevel int + cScore float64 + + cSeek unsafe.Pointer + + closing bool + ref int + released bool +} + +func newVersion(s *session) *version { + return &version{s: s} +} + +func (v *version) incref() { + if v.released { + panic("already released") + } + + v.ref++ + if v.ref == 1 { + // Incr file ref. + for _, tt := range v.levels { + for _, t := range tt { + v.s.addFileRef(t.fd, 1) + } + } + } +} + +func (v *version) releaseNB() { + v.ref-- + if v.ref > 0 { + return + } else if v.ref < 0 { + panic("negative version ref") + } + + for _, tt := range v.levels { + for _, t := range tt { + if v.s.addFileRef(t.fd, -1) == 0 { + v.s.tops.remove(t) + } + } + } + + v.released = true +} + +func (v *version) release() { + v.s.vmu.Lock() + v.releaseNB() + v.s.vmu.Unlock() +} + +func (v *version) walkOverlapping(aux tFiles, ikey internalKey, f func(level int, t *tFile) bool, lf func(level int) bool) { + ukey := ikey.ukey() + + // Aux level. + if aux != nil { + for _, t := range aux { + if t.overlaps(v.s.icmp, ukey, ukey) { + if !f(-1, t) { + return + } + } + } + + if lf != nil && !lf(-1) { + return + } + } + + // Walk tables level-by-level. 
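+	// Level-0 tables may overlap one another, so every overlapping table is
+	// visited; higher levels are sorted and non-overlapping, so a single
+	// searchMax lookup yields the only table that can contain ukey.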
+ for level, tables := range v.levels { + if len(tables) == 0 { + continue + } + + if level == 0 { + // Level-0 files may overlap each other. Find all files that + // overlap ukey. + for _, t := range tables { + if t.overlaps(v.s.icmp, ukey, ukey) { + if !f(level, t) { + return + } + } + } + } else { + if i := tables.searchMax(v.s.icmp, ikey); i < len(tables) { + t := tables[i] + if v.s.icmp.uCompare(ukey, t.imin.ukey()) >= 0 { + if !f(level, t) { + return + } + } + } + } + + if lf != nil && !lf(level) { + return + } + } +} + +func (v *version) get(aux tFiles, ikey internalKey, ro *opt.ReadOptions, noValue bool) (value []byte, tcomp bool, err error) { + if v.closing { + return nil, false, ErrClosed + } + + ukey := ikey.ukey() + + var ( + tset *tSet + tseek bool + + // Level-0. + zfound bool + zseq uint64 + zkt keyType + zval []byte + ) + + err = ErrNotFound + + // Since entries never hop across level, finding key/value + // in smaller level make later levels irrelevant. + v.walkOverlapping(aux, ikey, func(level int, t *tFile) bool { + if level >= 0 && !tseek { + if tset == nil { + tset = &tSet{level, t} + } else { + tseek = true + } + } + + var ( + fikey, fval []byte + ferr error + ) + if noValue { + fikey, ferr = v.s.tops.findKey(t, ikey, ro) + } else { + fikey, fval, ferr = v.s.tops.find(t, ikey, ro) + } + + switch ferr { + case nil: + case ErrNotFound: + return true + default: + err = ferr + return false + } + + if fukey, fseq, fkt, fkerr := parseInternalKey(fikey); fkerr == nil { + if v.s.icmp.uCompare(ukey, fukey) == 0 { + // Level <= 0 may overlaps each-other. + if level <= 0 { + if fseq >= zseq { + zfound = true + zseq = fseq + zkt = fkt + zval = fval + } + } else { + switch fkt { + case keyTypeVal: + value = fval + err = nil + case keyTypeDel: + default: + panic("leveldb: invalid internalKey type") + } + return false + } + } + } else { + err = fkerr + return false + } + + return true + }, func(level int) bool { + if zfound { + switch zkt { + case keyTypeVal: + value = zval + err = nil + case keyTypeDel: + default: + panic("leveldb: invalid internalKey type") + } + return false + } + + return true + }) + + if tseek && tset.table.consumeSeek() <= 0 { + tcomp = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset)) + } + + return +} + +func (v *version) sampleSeek(ikey internalKey) (tcomp bool) { + var tset *tSet + + v.walkOverlapping(nil, ikey, func(level int, t *tFile) bool { + if tset == nil { + tset = &tSet{level, t} + return true + } + if tset.table.consumeSeek() <= 0 { + tcomp = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset)) + } + return false + }, nil) + + return +} + +func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []iterator.Iterator) { + strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader) + for level, tables := range v.levels { + if level == 0 { + // Merge all level zero files together since they may overlap. + for _, t := range tables { + its = append(its, v.s.tops.newIterator(t, slice, ro)) + } + } else if len(tables) != 0 { + its = append(its, iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict)) + } + } + return +} + +func (v *version) newStaging() *versionStaging { + return &versionStaging{base: v} +} + +// Spawn a new version based on this version. 
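+// The record is applied through a versionStaging (commit, then finish), so the
+// spawned version reflects the record's added and deleted tables.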
+func (v *version) spawn(r *sessionRecord) *version { + staging := v.newStaging() + staging.commit(r) + return staging.finish() +} + +func (v *version) fillRecord(r *sessionRecord) { + for level, tables := range v.levels { + for _, t := range tables { + r.addTableFile(level, t) + } + } +} + +func (v *version) tLen(level int) int { + if level < len(v.levels) { + return len(v.levels[level]) + } + return 0 +} + +func (v *version) offsetOf(ikey internalKey) (n int64, err error) { + for level, tables := range v.levels { + for _, t := range tables { + if v.s.icmp.Compare(t.imax, ikey) <= 0 { + // Entire file is before "ikey", so just add the file size + n += t.size + } else if v.s.icmp.Compare(t.imin, ikey) > 0 { + // Entire file is after "ikey", so ignore + if level > 0 { + // Files other than level 0 are sorted by meta->min, so + // no further files in this level will contain data for + // "ikey". + break + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. + if m, err := v.s.tops.offsetOf(t, ikey); err == nil { + n += m + } else { + return 0, err + } + } + } + } + + return +} + +func (v *version) pickMemdbLevel(umin, umax []byte, maxLevel int) (level int) { + if maxLevel > 0 { + if len(v.levels) == 0 { + return maxLevel + } + if !v.levels[0].overlaps(v.s.icmp, umin, umax, true) { + var overlaps tFiles + for ; level < maxLevel; level++ { + if pLevel := level + 1; pLevel >= len(v.levels) { + return maxLevel + } else if v.levels[pLevel].overlaps(v.s.icmp, umin, umax, false) { + break + } + if gpLevel := level + 2; gpLevel < len(v.levels) { + overlaps = v.levels[gpLevel].getOverlaps(overlaps, v.s.icmp, umin, umax, false) + if overlaps.size() > int64(v.s.o.GetCompactionGPOverlaps(level)) { + break + } + } + } + } + } + return +} + +func (v *version) computeCompaction() { + // Precomputed best level for next compaction + bestLevel := int(-1) + bestScore := float64(-1) + + statFiles := make([]int, len(v.levels)) + statSizes := make([]string, len(v.levels)) + statScore := make([]string, len(v.levels)) + statTotSize := int64(0) + + for level, tables := range v.levels { + var score float64 + size := tables.size() + if level == 0 { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compaction. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). 
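+			// For example, with a level-0 trigger of 4 and 6 level-0 tables the
+			// score is 6/4 = 1.5, making level 0 a strong candidate for compaction.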
+ score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger()) + } else { + score = float64(size) / float64(v.s.o.GetCompactionTotalSize(level)) + } + + if score > bestScore { + bestLevel = level + bestScore = score + } + + statFiles[level] = len(tables) + statSizes[level] = shortenb(int(size)) + statScore[level] = fmt.Sprintf("%.2f", score) + statTotSize += size + } + + v.cLevel = bestLevel + v.cScore = bestScore + + v.s.logf("version@stat F·%v S·%s%v Sc·%v", statFiles, shortenb(int(statTotSize)), statSizes, statScore) +} + +func (v *version) needCompaction() bool { + return v.cScore >= 1 || atomic.LoadPointer(&v.cSeek) != nil +} + +type tablesScratch struct { + added map[int64]atRecord + deleted map[int64]struct{} +} + +type versionStaging struct { + base *version + levels []tablesScratch +} + +func (p *versionStaging) getScratch(level int) *tablesScratch { + if level >= len(p.levels) { + newLevels := make([]tablesScratch, level+1) + copy(newLevels, p.levels) + p.levels = newLevels + } + return &(p.levels[level]) +} + +func (p *versionStaging) commit(r *sessionRecord) { + // Deleted tables. + for _, r := range r.deletedTables { + scratch := p.getScratch(r.level) + if r.level < len(p.base.levels) && len(p.base.levels[r.level]) > 0 { + if scratch.deleted == nil { + scratch.deleted = make(map[int64]struct{}) + } + scratch.deleted[r.num] = struct{}{} + } + if scratch.added != nil { + delete(scratch.added, r.num) + } + } + + // New tables. + for _, r := range r.addedTables { + scratch := p.getScratch(r.level) + if scratch.added == nil { + scratch.added = make(map[int64]atRecord) + } + scratch.added[r.num] = r + if scratch.deleted != nil { + delete(scratch.deleted, r.num) + } + } +} + +func (p *versionStaging) finish() *version { + // Build new version. + nv := newVersion(p.base.s) + numLevel := len(p.levels) + if len(p.base.levels) > numLevel { + numLevel = len(p.base.levels) + } + nv.levels = make([]tFiles, numLevel) + for level := 0; level < numLevel; level++ { + var baseTabels tFiles + if level < len(p.base.levels) { + baseTabels = p.base.levels[level] + } + + if level < len(p.levels) { + scratch := p.levels[level] + + var nt tFiles + // Prealloc list if possible. + if n := len(baseTabels) + len(scratch.added) - len(scratch.deleted); n > 0 { + nt = make(tFiles, 0, n) + } + + // Base tables. + for _, t := range baseTabels { + if _, ok := scratch.deleted[t.fd.Num]; ok { + continue + } + if _, ok := scratch.added[t.fd.Num]; ok { + continue + } + nt = append(nt, t) + } + + // New tables. + for _, r := range scratch.added { + nt = append(nt, tableFileFromRecord(r)) + } + + if len(nt) != 0 { + // Sort tables. + if level == 0 { + nt.sortByNum() + } else { + nt.sortByKey(p.base.s.icmp) + } + + nv.levels[level] = nt + } + } else { + nv.levels[level] = baseTabels + } + } + + // Trim levels. + n := len(nv.levels) + for ; n > 0 && nv.levels[n-1] == nil; n-- { + } + nv.levels = nv.levels[:n] + + // Compute compaction score for new version. + nv.computeCompaction() + + return nv +} + +type versionReleaser struct { + v *version + once bool +} + +func (vr *versionReleaser) Release() { + v := vr.v + v.s.vmu.Lock() + if !vr.once { + v.releaseNB() + vr.once = true + } + v.s.vmu.Unlock() +} diff --git a/vendor/github.com/szferi/gomdb/LICENSE b/vendor/github.com/szferi/gomdb/LICENSE new file mode 100644 index 000000000000..96fd7a269744 --- /dev/null +++ b/vendor/github.com/szferi/gomdb/LICENSE @@ -0,0 +1,10 @@ +Copyright (c) 2013, Ferenc Szalai +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/szferi/gomdb/cursor.go b/vendor/github.com/szferi/gomdb/cursor.go new file mode 100644 index 000000000000..75e0d1487147 --- /dev/null +++ b/vendor/github.com/szferi/gomdb/cursor.go @@ -0,0 +1,103 @@ +package mdb + +/* +#cgo CFLAGS: -pthread -W -Wall -Wno-unused-parameter -Wbad-function-cast -O2 -g +#cgo freebsd CFLAGS: -DMDB_DSYNC=O_SYNC +#cgo openbsd CFLAGS: -DMDB_DSYNC=O_SYNC +#cgo netbsd CFLAGS: -DMDB_DSYNC=O_SYNC +#include +#include +#include "lmdb.h" +*/ +import "C" + +import ( + "errors" +) + +// MDB_cursor_op +const ( + FIRST = iota + FIRST_DUP + GET_BOTH + GET_RANGE + GET_CURRENT + GET_MULTIPLE + LAST + LAST_DUP + NEXT + NEXT_DUP + NEXT_MULTIPLE + NEXT_NODUP + PREV + PREV_DUP + PREV_NODUP + SET + SET_KEY + SET_RANGE +) + +func (cursor *Cursor) Close() error { + if cursor._cursor == nil { + return errors.New("Cursor already closed") + } + C.mdb_cursor_close(cursor._cursor) + cursor._cursor = nil + return nil +} + +func (cursor *Cursor) Txn() *Txn { + var _txn *C.MDB_txn + _txn = C.mdb_cursor_txn(cursor._cursor) + if _txn != nil { + return &Txn{_txn} + } + return nil +} + +func (cursor *Cursor) DBI() DBI { + var _dbi C.MDB_dbi + _dbi = C.mdb_cursor_dbi(cursor._cursor) + return DBI(_dbi) +} + +// Retrieves the low-level MDB cursor. 
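A minimal iteration sketch for the cursor API in this file, assuming a *mdb.Cursor already opened from a read transaction (cursor creation lives in txn.go, outside this hunk):

	defer cursor.Close()
	for {
		// Advance to the next key/value pair; mdb.NotFound marks the end.
		k, v, err := cursor.Get(nil, nil, mdb.NEXT)
		if err == mdb.NotFound {
			break
		}
		if err != nil {
			return err
		}
		fmt.Printf("%s = %s\n", k, v)
	}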
+func (cursor *Cursor) MdbCursor() *C.MDB_cursor { + return cursor._cursor +} + +func (cursor *Cursor) Get(set_key, sval []byte, op uint) (key, val []byte, err error) { + k, v, err := cursor.GetVal(set_key, sval, op) + if err != nil { + return nil, nil, err + } + return k.Bytes(), v.Bytes(), nil +} + +func (cursor *Cursor) GetVal(key, val []byte, op uint) (Val, Val, error) { + ckey := Wrap(key) + cval := Wrap(val) + ret := C.mdb_cursor_get(cursor._cursor, (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval), C.MDB_cursor_op(op)) + return ckey, cval, errno(ret) +} + +func (cursor *Cursor) Put(key, val []byte, flags uint) error { + ckey := Wrap(key) + cval := Wrap(val) + ret := C.mdb_cursor_put(cursor._cursor, (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval), C.uint(flags)) + return errno(ret) +} + +func (cursor *Cursor) Del(flags uint) error { + ret := C.mdb_cursor_del(cursor._cursor, C.uint(flags)) + return errno(ret) +} + +func (cursor *Cursor) Count() (uint64, error) { + var _size C.size_t + ret := C.mdb_cursor_count(cursor._cursor, &_size) + if ret != SUCCESS { + return 0, errno(ret) + } + return uint64(_size), nil +} diff --git a/vendor/github.com/szferi/gomdb/env.go b/vendor/github.com/szferi/gomdb/env.go new file mode 100644 index 000000000000..08b4d0a32900 --- /dev/null +++ b/vendor/github.com/szferi/gomdb/env.go @@ -0,0 +1,227 @@ +package mdb + +/* +#cgo CFLAGS: -pthread -W -Wall -Wno-unused-parameter -Wbad-function-cast -O2 -g +#cgo freebsd CFLAGS: -DMDB_DSYNC=O_SYNC +#cgo openbsd CFLAGS: -DMDB_DSYNC=O_SYNC +#cgo netbsd CFLAGS: -DMDB_DSYNC=O_SYNC +#include +#include +#include "lmdb.h" +*/ +import "C" + +import ( + "errors" + "fmt" + "syscall" + "unsafe" +) + +const SUCCESS = C.MDB_SUCCESS + +// mdb_env Environment Flags +const ( + FIXEDMAP = C.MDB_FIXEDMAP // mmap at a fixed address (experimental) + NOSUBDIR = C.MDB_NOSUBDIR // no environment directory + NOSYNC = C.MDB_NOSYNC // don't fsync after commit + RDONLY = C.MDB_RDONLY // read only + NOMETASYNC = C.MDB_NOMETASYNC // don't fsync metapage after commit + WRITEMAP = C.MDB_WRITEMAP // use writable mmap + MAPASYNC = C.MDB_MAPASYNC // use asynchronous msync when MDB_WRITEMAP is use + NOTLS = C.MDB_NOTLS // tie reader locktable slots to Txn objects instead of threads +) + +type DBI uint + +type Errno C.int + +// minimum and maximum values produced for the Errno type. syscall.Errnos of +// other values may still be produced. 
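A minimal sketch of standing up an environment with the API below (the path, map size and mode are illustrative; the target directory must already exist):

package main

import (
	"fmt"
	"log"

	mdb "github.com/szferi/gomdb"
)

func main() {
	env, err := mdb.NewEnv()
	if err != nil {
		log.Fatal(err)
	}
	defer env.Close()

	// Reserve a 1 GiB map and allow a couple of named databases before opening.
	env.SetMapSize(1 << 30)
	env.SetMaxDBs(2)
	if err := env.Open("/tmp/testdb", 0, 0664); err != nil {
		log.Fatal(err)
	}

	stat, err := env.Stat()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("page size %d, %d entries\n", stat.PSize, stat.Entries)
}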
+const minErrno, maxErrno C.int = C.MDB_KEYEXIST, C.MDB_LAST_ERRCODE + +func (e Errno) Error() string { + s := C.GoString(C.mdb_strerror(C.int(e))) + if s == "" { + return fmt.Sprint("mdb errno:", int(e)) + } + return s +} + +// for tests that can't import C +func _errno(ret int) error { + return errno(C.int(ret)) +} + +func errno(ret C.int) error { + if ret == C.MDB_SUCCESS { + return nil + } + if minErrno <= ret && ret <= maxErrno { + return Errno(ret) + } + return syscall.Errno(ret) +} + +// error codes +const ( + KeyExist = Errno(C.MDB_KEYEXIST) + NotFound = Errno(C.MDB_NOTFOUND) + PageNotFound = Errno(C.MDB_PAGE_NOTFOUND) + Corrupted = Errno(C.MDB_CORRUPTED) + Panic = Errno(C.MDB_PANIC) + VersionMismatch = Errno(C.MDB_VERSION_MISMATCH) + Invalid = Errno(C.MDB_INVALID) + MapFull = Errno(C.MDB_MAP_FULL) + DbsFull = Errno(C.MDB_DBS_FULL) + ReadersFull = Errno(C.MDB_READERS_FULL) + TlsFull = Errno(C.MDB_TLS_FULL) + TxnFull = Errno(C.MDB_TXN_FULL) + CursorFull = Errno(C.MDB_CURSOR_FULL) + PageFull = Errno(C.MDB_PAGE_FULL) + MapResized = Errno(C.MDB_MAP_RESIZED) + Incompatibile = Errno(C.MDB_INCOMPATIBLE) +) + +func Version() string { + var major, minor, patch *C.int + ver_str := C.mdb_version(major, minor, patch) + return C.GoString(ver_str) +} + +// Env is opaque structure for a database environment. +// A DB environment supports multiple databases, all residing in the +// same shared-memory map. +type Env struct { + _env *C.MDB_env +} + +// Create an MDB environment handle. +func NewEnv() (*Env, error) { + var _env *C.MDB_env + ret := C.mdb_env_create(&_env) + if ret != SUCCESS { + return nil, errno(ret) + } + return &Env{_env}, nil +} + +// Open an environment handle. If this function fails Close() must be called to discard the Env handle. +func (env *Env) Open(path string, flags uint, mode uint) error { + cpath := C.CString(path) + defer C.free(unsafe.Pointer(cpath)) + ret := C.mdb_env_open(env._env, cpath, C.uint(NOTLS|flags), C.mdb_mode_t(mode)) + return errno(ret) +} + +func (env *Env) Close() error { + if env._env == nil { + return errors.New("Environment already closed") + } + C.mdb_env_close(env._env) + env._env = nil + return nil +} + +func (env *Env) Copy(path string) error { + cpath := C.CString(path) + defer C.free(unsafe.Pointer(cpath)) + ret := C.mdb_env_copy(env._env, cpath) + return errno(ret) +} + +// Statistics for a database in the environment +type Stat struct { + PSize uint // Size of a database page. This is currently the same for all databases. 
+ Depth uint // Depth (height) of the B-tree + BranchPages uint64 // Number of internal (non-leaf) pages + LeafPages uint64 // Number of leaf pages + OverflowPages uint64 // Number of overflow pages + Entries uint64 // Number of data items +} + +func (env *Env) Stat() (*Stat, error) { + var _stat C.MDB_stat + ret := C.mdb_env_stat(env._env, &_stat) + if ret != SUCCESS { + return nil, errno(ret) + } + stat := Stat{PSize: uint(_stat.ms_psize), + Depth: uint(_stat.ms_depth), + BranchPages: uint64(_stat.ms_branch_pages), + LeafPages: uint64(_stat.ms_leaf_pages), + OverflowPages: uint64(_stat.ms_overflow_pages), + Entries: uint64(_stat.ms_entries)} + return &stat, nil +} + +type Info struct { + MapSize uint64 // Size of the data memory map + LastPNO uint64 // ID of the last used page + LastTxnID uint64 // ID of the last committed transaction + MaxReaders uint // maximum number of threads for the environment + NumReaders uint // maximum number of threads used in the environment +} + +func (env *Env) Info() (*Info, error) { + var _info C.MDB_envinfo + ret := C.mdb_env_info(env._env, &_info) + if ret != SUCCESS { + return nil, errno(ret) + } + info := Info{MapSize: uint64(_info.me_mapsize), + LastPNO: uint64(_info.me_last_pgno), + LastTxnID: uint64(_info.me_last_txnid), + MaxReaders: uint(_info.me_maxreaders), + NumReaders: uint(_info.me_numreaders)} + return &info, nil +} + +func (env *Env) Sync(force int) error { + ret := C.mdb_env_sync(env._env, C.int(force)) + return errno(ret) +} + +func (env *Env) SetFlags(flags uint, onoff int) error { + ret := C.mdb_env_set_flags(env._env, C.uint(flags), C.int(onoff)) + return errno(ret) +} + +func (env *Env) Flags() (uint, error) { + var _flags C.uint + ret := C.mdb_env_get_flags(env._env, &_flags) + if ret != SUCCESS { + return 0, errno(ret) + } + return uint(_flags), nil +} + +func (env *Env) Path() (string, error) { + var path string + cpath := C.CString(path) + defer C.free(unsafe.Pointer(cpath)) + ret := C.mdb_env_get_path(env._env, &cpath) + if ret != SUCCESS { + return "", errno(ret) + } + return C.GoString(cpath), nil +} + +func (env *Env) SetMapSize(size uint64) error { + ret := C.mdb_env_set_mapsize(env._env, C.size_t(size)) + return errno(ret) +} + +func (env *Env) SetMaxReaders(size uint) error { + ret := C.mdb_env_set_maxreaders(env._env, C.uint(size)) + return errno(ret) +} + +func (env *Env) SetMaxDBs(size DBI) error { + ret := C.mdb_env_set_maxdbs(env._env, C.MDB_dbi(size)) + return errno(ret) +} + +func (env *Env) DBIClose(dbi DBI) { + C.mdb_dbi_close(env._env, C.MDB_dbi(dbi)) +} diff --git a/vendor/github.com/szferi/gomdb/lmdb.h b/vendor/github.com/szferi/gomdb/lmdb.h new file mode 100644 index 000000000000..11b46bfa4866 --- /dev/null +++ b/vendor/github.com/szferi/gomdb/lmdb.h @@ -0,0 +1,1553 @@ +/** @file lmdb.h + * @brief Lightning memory-mapped database library + * + * @mainpage Lightning Memory-Mapped Database Manager (LMDB) + * + * @section intro_sec Introduction + * LMDB is a Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. The entire database is exposed + * in a memory map, and all data fetches return data directly + * from the mapped memory, so no malloc's or memcpy's occur during + * data fetches. As such, the library is extremely simple because it + * requires no page caching layer of its own, and it is extremely high + * performance and memory-efficient. 
It is also fully transactional with + * full ACID semantics, and when the memory map is read-only, the + * database integrity cannot be corrupted by stray pointer writes from + * application code. + * + * The library is fully thread-aware and supports concurrent read/write + * access from multiple processes and threads. Data pages use a copy-on- + * write strategy so no active data pages are ever overwritten, which + * also provides resistance to corruption and eliminates the need of any + * special recovery procedures after a system crash. Writes are fully + * serialized; only one write transaction may be active at a time, which + * guarantees that writers can never deadlock. The database structure is + * multi-versioned so readers run with no locks; writers cannot block + * readers, and readers don't block writers. + * + * Unlike other well-known database mechanisms which use either write-ahead + * transaction logs or append-only data writes, LMDB requires no maintenance + * during operation. Both write-ahead loggers and append-only databases + * require periodic checkpointing and/or compaction of their log or database + * files otherwise they grow without bound. LMDB tracks free pages within + * the database and re-uses them for new write operations, so the database + * size does not grow without bound in normal use. + * + * The memory map can be used as a read-only or read-write map. It is + * read-only by default as this provides total immunity to corruption. + * Using read-write mode offers much higher write performance, but adds + * the possibility for stray application writes thru pointers to silently + * corrupt the database. Of course if your application code is known to + * be bug-free (...) then this is not an issue. + * + * @section caveats_sec Caveats + * Troubleshooting the lock file, plus semaphores on BSD systems: + * + * - A broken lockfile can cause sync issues. + * Stale reader transactions left behind by an aborted program + * cause further writes to grow the database quickly, and + * stale locks can block further operation. + * + * Fix: Check for stale readers periodically, using the + * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. Or just + * make all programs using the database close it; the lockfile + * is always reset on first open of the environment. + * + * - On BSD systems or others configured with MDB_USE_POSIX_SEM, + * startup can fail due to semaphores owned by another userid. + * + * Fix: Open and close the database as the user which owns the + * semaphores (likely last user) or as root, while no other + * process is using the database. + * + * Restrictions/caveats (in addition to those listed for some functions): + * + * - Only the database owner should normally use the database on + * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM. + * Multiple users can cause startup to fail later, as noted above. + * + * - There is normally no pure read-only mode, since readers need write + * access to locks and lock file. Exceptions: On read-only filesystems + * or with the #MDB_NOLOCK flag described under #mdb_env_open(). + * + * - By default, in versions before 0.9.10, unused portions of the data + * file might receive garbage data from memory freed by other code. + * (This does not happen when using the #MDB_WRITEMAP flag.) As of + * 0.9.10 the default behavior is to initialize such memory before + * writing to the data file. 
Since there may be a slight performance + * cost due to this initialization, applications may disable it using + * the #MDB_NOMEMINIT flag. Applications handling sensitive data + * which must not be written should not use this flag. This flag is + * irrelevant when using #MDB_WRITEMAP. + * + * - A thread can only use one transaction at a time, plus any child + * transactions. Each transaction belongs to one thread. See below. + * The #MDB_NOTLS flag changes this for read-only transactions. + * + * - Use an MDB_env* in the process which opened it, without fork()ing. + * + * - Do not have open an LMDB database twice in the same process at + * the same time. Not even from a plain open() call - close()ing it + * breaks flock() advisory locking. + * + * - Avoid long-lived transactions. Read transactions prevent + * reuse of pages freed by newer write transactions, thus the + * database can grow quickly. Write transactions prevent + * other write transactions, since writes are serialized. + * + * - Avoid suspending a process with active transactions. These + * would then be "long-lived" as above. Also read transactions + * suspended when writers commit could sometimes see wrong data. + * + * ...when several processes can use a database concurrently: + * + * - Avoid aborting a process with an active transaction. + * The transaction becomes "long-lived" as above until a check + * for stale readers is performed or the lockfile is reset, + * since the process may not remove it from the lockfile. + * + * - If you do that anyway, do a periodic check for stale readers. Or + * close the environment once in a while, so the lockfile can get reset. + * + * - Do not use LMDB databases on remote filesystems, even between + * processes on the same host. This breaks flock() on some OSes, + * possibly memory map sync, and certainly sync between programs + * on different hosts. + * + * - Opening a database can fail if another process is opening or + * closing it at exactly the same time. + * + * @author Howard Chu, Symas Corporation. + * + * @copyright Copyright 2011-2014 Howard Chu, Symas Corp. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + * + * @par Derived From: + * This code is derived from btree.c written by Martin Hedenfalk. + * + * Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +#ifndef _LMDB_H_ +#define _LMDB_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** Unix permissions for creating files, or dummy definition for Windows */ +#ifdef _MSC_VER +typedef int mdb_mode_t; +#else +typedef mode_t mdb_mode_t; +#endif + +/** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#ifdef _WIN32 +typedef void *mdb_filehandle_t; +#else +typedef int mdb_filehandle_t; +#endif + +/** @defgroup mdb LMDB API + * @{ + * @brief OpenLDAP Lightning Memory-Mapped Database Manager + */ +/** @defgroup Version Version Macros + * @{ + */ +/** Library major version */ +#define MDB_VERSION_MAJOR 0 +/** Library minor version */ +#define MDB_VERSION_MINOR 9 +/** Library patch version */ +#define MDB_VERSION_PATCH 14 + +/** Combine args a,b,c into a single integer for easy version comparisons */ +#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) + +/** The full library version as a single integer */ +#define MDB_VERSION_FULL \ + MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) + +/** The release date of this library version */ +#define MDB_VERSION_DATE "July 24, 2014" + +/** A stringifier for the version info */ +#define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" + +/** A helper for the stringifier macro */ +#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) + +/** The full library version as a C string */ +#define MDB_VERSION_STRING \ + MDB_VERFOO(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH,MDB_VERSION_DATE) +/** @} */ + +/** @brief Opaque structure for a database environment. + * + * A DB environment supports multiple databases, all residing in the same + * shared-memory map. + */ +typedef struct MDB_env MDB_env; + +/** @brief Opaque structure for a transaction handle. + * + * All database operations require a transaction handle. Transactions may be + * read-only or read-write. + */ +typedef struct MDB_txn MDB_txn; + +/** @brief A handle for an individual database in the DB environment. */ +typedef unsigned int MDB_dbi; + +/** @brief Opaque structure for navigating through a database */ +typedef struct MDB_cursor MDB_cursor; + +/** @brief Generic structure used for passing keys and data in and out + * of the database. + * + * Values returned from the database are valid only until a subsequent + * update operation, or the end of the transaction. Do not modify or + * free them, they commonly point into the database itself. + * + * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive. + * The same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. + */ +typedef struct MDB_val { + size_t mv_size; /**< size of the data item */ + void *mv_data; /**< address of the data item */ +} MDB_val; + +/** @brief A callback function used to compare two keys in a database */ +typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); + +/** @brief A callback function used to relocate a position-dependent data item + * in a fixed-address database. + * + * The \b newptr gives the item's desired address in + * the memory map, and \b oldptr gives its previous address. The item's actual + * data resides at the address in \b item. This callback is expected to walk + * through the fields of the record in \b item and modify any + * values based at the \b oldptr address to be relative to the \b newptr address. + * @param[in,out] item The item that is to be relocated. 
+ * @param[in] oldptr The previous address. + * @param[in] newptr The new address to relocate to. + * @param[in] relctx An application-provided context, set by #mdb_set_relctx(). + * @todo This feature is currently unimplemented. + */ +typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); + +/** @defgroup mdb_env Environment Flags + * @{ + */ + /** mmap at a fixed address (experimental) */ +#define MDB_FIXEDMAP 0x01 + /** no environment directory */ +#define MDB_NOSUBDIR 0x4000 + /** don't fsync after commit */ +#define MDB_NOSYNC 0x10000 + /** read only */ +#define MDB_RDONLY 0x20000 + /** don't fsync metapage after commit */ +#define MDB_NOMETASYNC 0x40000 + /** use writable mmap */ +#define MDB_WRITEMAP 0x80000 + /** use asynchronous msync when #MDB_WRITEMAP is used */ +#define MDB_MAPASYNC 0x100000 + /** tie reader locktable slots to #MDB_txn objects instead of to threads */ +#define MDB_NOTLS 0x200000 + /** don't do any locking, caller must manage their own locks */ +#define MDB_NOLOCK 0x400000 + /** don't do readahead (no effect on Windows) */ +#define MDB_NORDAHEAD 0x800000 + /** don't initialize malloc'd memory before writing to datafile */ +#define MDB_NOMEMINIT 0x1000000 +/** @} */ + +/** @defgroup mdb_dbi_open Database Flags + * @{ + */ + /** use reverse string keys */ +#define MDB_REVERSEKEY 0x02 + /** use sorted duplicates */ +#define MDB_DUPSORT 0x04 + /** numeric keys in native byte order. + * The keys must all be of the same size. */ +#define MDB_INTEGERKEY 0x08 + /** with #MDB_DUPSORT, sorted dup items have fixed size */ +#define MDB_DUPFIXED 0x10 + /** with #MDB_DUPSORT, dups are numeric in native byte order */ +#define MDB_INTEGERDUP 0x20 + /** with #MDB_DUPSORT, use reverse string dups */ +#define MDB_REVERSEDUP 0x40 + /** create DB if not already existing */ +#define MDB_CREATE 0x40000 +/** @} */ + +/** @defgroup mdb_put Write Flags + * @{ + */ +/** For put: Don't write if the key already exists. */ +#define MDB_NOOVERWRITE 0x10 +/** Only for #MDB_DUPSORT
+ * For put: don't write if the key and data pair already exist.
+ * For mdb_cursor_del: remove all duplicate data items. + */ +#define MDB_NODUPDATA 0x20 +/** For mdb_cursor_put: overwrite the current key/data pair */ +#define MDB_CURRENT 0x40 +/** For put: Just reserve space for data, don't copy it. Return a + * pointer to the reserved space. + */ +#define MDB_RESERVE 0x10000 +/** Data is being appended, don't split full pages. */ +#define MDB_APPEND 0x20000 +/** Duplicate data is being appended, don't split full pages. */ +#define MDB_APPENDDUP 0x40000 +/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ +#define MDB_MULTIPLE 0x80000 +/* @} */ + +/** @defgroup mdb_copy Copy Flags + * @{ + */ +/** Compacting copy: Omit free space from copy, and renumber all + * pages sequentially. + */ +#define MDB_CP_COMPACT 0x01 +/* @} */ + +/** @brief Cursor Get operations. + * + * This is the set of all operations for retrieving data + * using a cursor. + */ +typedef enum MDB_cursor_op { + MDB_FIRST, /**< Position at first key/data item */ + MDB_FIRST_DUP, /**< Position at first data item of current key. + Only for #MDB_DUPSORT */ + MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ + MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ + MDB_GET_CURRENT, /**< Return key/data at current cursor position */ + MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items + from current cursor position. Move cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ + MDB_LAST, /**< Position at last key/data item */ + MDB_LAST_DUP, /**< Position at last data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT, /**< Position at next data item */ + MDB_NEXT_DUP, /**< Position at next data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items + from next cursor position. Move cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ + MDB_NEXT_NODUP, /**< Position at first data item of next key */ + MDB_PREV, /**< Position at previous data item */ + MDB_PREV_DUP, /**< Position at previous data item of current key. + Only for #MDB_DUPSORT */ + MDB_PREV_NODUP, /**< Position at last data item of previous key */ + MDB_SET, /**< Position at specified key */ + MDB_SET_KEY, /**< Position at specified key, return key + data */ + MDB_SET_RANGE /**< Position at first key greater than or equal to specified key. 
*/ +} MDB_cursor_op; + +/** @defgroup errors Return Codes + * + * BerkeleyDB uses -30800 to -30999, we'll go under them + * @{ + */ + /** Successful result */ +#define MDB_SUCCESS 0 + /** key/data pair already exists */ +#define MDB_KEYEXIST (-30799) + /** key/data pair not found (EOF) */ +#define MDB_NOTFOUND (-30798) + /** Requested page not found - this usually indicates corruption */ +#define MDB_PAGE_NOTFOUND (-30797) + /** Located page was wrong type */ +#define MDB_CORRUPTED (-30796) + /** Update of meta page failed, probably I/O error */ +#define MDB_PANIC (-30795) + /** Environment version mismatch */ +#define MDB_VERSION_MISMATCH (-30794) + /** File is not a valid LMDB file */ +#define MDB_INVALID (-30793) + /** Environment mapsize reached */ +#define MDB_MAP_FULL (-30792) + /** Environment maxdbs reached */ +#define MDB_DBS_FULL (-30791) + /** Environment maxreaders reached */ +#define MDB_READERS_FULL (-30790) + /** Too many TLS keys in use - Windows only */ +#define MDB_TLS_FULL (-30789) + /** Txn has too many dirty pages */ +#define MDB_TXN_FULL (-30788) + /** Cursor stack too deep - internal error */ +#define MDB_CURSOR_FULL (-30787) + /** Page has not enough space - internal error */ +#define MDB_PAGE_FULL (-30786) + /** Database contents grew beyond environment mapsize */ +#define MDB_MAP_RESIZED (-30785) + /** MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed */ +#define MDB_INCOMPATIBLE (-30784) + /** Invalid reuse of reader locktable slot */ +#define MDB_BAD_RSLOT (-30783) + /** Transaction cannot recover - it must be aborted */ +#define MDB_BAD_TXN (-30782) + /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ +#define MDB_BAD_VALSIZE (-30781) + /** The specified DBI was changed unexpectedly */ +#define MDB_BAD_DBI (-30780) + /** The last defined error code */ +#define MDB_LAST_ERRCODE MDB_BAD_DBI +/** @} */ + +/** @brief Statistics for a database in the environment */ +typedef struct MDB_stat { + unsigned int ms_psize; /**< Size of a database page. + This is currently the same for all databases. */ + unsigned int ms_depth; /**< Depth (height) of the B-tree */ + size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ + size_t ms_leaf_pages; /**< Number of leaf pages */ + size_t ms_overflow_pages; /**< Number of overflow pages */ + size_t ms_entries; /**< Number of data items */ +} MDB_stat; + +/** @brief Information about the environment */ +typedef struct MDB_envinfo { + void *me_mapaddr; /**< Address of map, if fixed */ + size_t me_mapsize; /**< Size of the data memory map */ + size_t me_last_pgno; /**< ID of the last used page */ + size_t me_last_txnid; /**< ID of the last committed transaction */ + unsigned int me_maxreaders; /**< max reader slots in the environment */ + unsigned int me_numreaders; /**< max reader slots used in the environment */ +} MDB_envinfo; + + /** @brief Return the LMDB library version information. + * + * @param[out] major if non-NULL, the library major version number is copied here + * @param[out] minor if non-NULL, the library minor version number is copied here + * @param[out] patch if non-NULL, the library patch version number is copied here + * @retval "version string" The library version as a string + */ +char *mdb_version(int *major, int *minor, int *patch); + + /** @brief Return a string describing a given error code. + * + * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) + * function. 
If the error code is greater than or equal to 0, then the string + * returned by the system function strerror(3) is returned. If the error code + * is less than 0, an error string corresponding to the LMDB library error is + * returned. See @ref errors for a list of LMDB-specific error codes. + * @param[in] err The error code + * @retval "error message" The description of the error + */ +char *mdb_strerror(int err); + + /** @brief Create an LMDB environment handle. + * + * This function allocates memory for a #MDB_env structure. To release + * the allocated memory and discard the handle, call #mdb_env_close(). + * Before the handle may be used, it must be opened using #mdb_env_open(). + * Various other options may also need to be set before opening the handle, + * e.g. #mdb_env_set_mapsize(), #mdb_env_set_maxreaders(), #mdb_env_set_maxdbs(), + * depending on usage requirements. + * @param[out] env The address where the new handle will be stored + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_create(MDB_env **env); + + /** @brief Open an environment handle. + * + * If this function fails, #mdb_env_close() must be called to discard the #MDB_env handle. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] path The directory in which the database files reside. This + * directory must already exist and be writable. + * @param[in] flags Special options for this environment. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * Flags set by mdb_env_set_flags() are also used. + *
    + *
  • #MDB_FIXEDMAP + * use a fixed address for the mmap region. This flag must be specified + * when creating the environment, and is stored persistently in the environment. + * If successful, the memory map will always reside at the same virtual address + * and pointers used to reference data items in the database will be constant + * across multiple invocations. This option may not always work, depending on + * how the operating system has allocated memory to shared libraries and other uses. + * The feature is highly experimental. + *
  • #MDB_NOSUBDIR + * By default, LMDB creates its environment in a directory whose + * pathname is given in \b path, and creates its data and lock files + * under that directory. With this option, \b path is used as-is for + * the database main data file. The database lock file is the \b path + * with "-lock" appended. + *
  • #MDB_RDONLY + * Open the environment in read-only mode. No write operations will be + * allowed. LMDB will still modify the lock file - except on read-only + * filesystems, where LMDB does not use locks. + *
  • #MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This is faster + * and uses fewer mallocs, but loses protection from application bugs + * like wild pointer writes and other bad updates into the database. + * Incompatible with nested transactions. + * Processes with and without MDB_WRITEMAP on the same environment do + * not cooperate well. + *
  • #MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit the + * metadata flush. Defer that until the system flushes files to disk, + * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization + * maintains database integrity, but a system crash may undo the last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database property. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOSYNC + * Don't flush system buffers to disk when committing a transaction. + * This optimization means a system crash can corrupt the database or + * lose the last transactions if buffers are not yet flushed to disk. + * The risk is governed by how often the system flushes dirty buffers + * to disk and how often #mdb_env_sync() is called. However, if the + * filesystem preserves write order and the #MDB_WRITEMAP flag is not + * used, transactions exhibit ACI (atomicity, consistency, isolation) + * properties and only lose D (durability). I.e. database integrity + * is maintained, but a system crash may undo the final transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk, unless #mdb_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. + * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling #mdb_env_sync() + * ensures on-disk database integrity until next commit. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps + * the slot reserved for the #MDB_txn object. A thread may use parallel + * read-only transactions. A read-only transaction may span threads if + * the user synchronizes its use. Applications that multiplex many + * user threads over individual OS threads need this option. Such an + * application must also serialize the write transactions in an OS + * thread, since LMDB's write locking is unaware of the user threads. + *
  • #MDB_NOLOCK + * Don't do any locking. If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper operation + * the caller must enforce single-writer semantics, and must ensure + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so that + * no readers may be active at all when a writer begins. + *
  • #MDB_NORDAHEAD + * Turn off readahead. Most operating systems perform readahead on + * read requests by default. This option turns it off if the OS + * supports it. Turning it off may help random read performance + * when the DB is larger than RAM and system RAM is full. + * The option is not implemented on Windows. + *
  • #MDB_NOMEMINIT + * Don't initialize malloc'd memory before writing to unused spaces + * in the data file. By default, memory for pages written to the data + * file is obtained using malloc. While these pages may be reused in + * subsequent transactions, freshly malloc'd pages will be initialized + * to zeroes before use. This avoids persisting leftover data from other + * code (that used the heap and subsequently freed the memory) into the + * data file. Note that many other system libraries may allocate + * and free memory from the heap for arbitrary uses. E.g., stdio may + * use the heap for file I/O buffers. This initialization step has a + * modest performance cost so some applications may want to disable + * it using this flag. This option can be a problem for applications + * which handle sensitive data like passwords, and it makes memory + * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP, + * which writes directly to the mmap instead of using malloc for pages. The + * initialization is also skipped if #MDB_RESERVE is used; the + * caller is expected to overwrite all of the memory that was + * reserved in that case. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
+ * @param[in] mode The UNIX permissions to set on created files. This parameter + * is ignored on Windows. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the + * version that created the database environment. + *
  • #MDB_INVALID - the environment file headers are corrupted. + *
  • ENOENT - the directory specified by the path parameter doesn't exist. + *
  • EACCES - the user didn't have permission to access the environment files. + *
  • EAGAIN - the environment was locked by another process. + *
+ */ +int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode); + + /** @brief Copy an LMDB environment to the specified path. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy(MDB_env *env, const char *path); + + /** @brief Copy an LMDB environment to the specified file descriptor. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); + + /** @brief Copy an LMDB environment to the specified path, with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free + * pages and sequentially renumber all pages in output. This option + * consumes more CPU and runs more slowly than the default. + *
+ * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags); + + /** @brief Copy an LMDB environment to the specified file descriptor, + * with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. See + * #mdb_env_copy2() for further details. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @param[in] flags Special options for this operation. + * See #mdb_env_copy2() for options. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags); + + /** @brief Return statistics about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + */ +int mdb_env_stat(MDB_env *env, MDB_stat *stat); + + /** @brief Return information about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_envinfo structure + * where the information will be copied + */ +int mdb_env_info(MDB_env *env, MDB_envinfo *stat); + + /** @brief Flush the data buffers to disk. + * + * Data is always written to disk when #mdb_txn_commit() is called, + * but the operating system may keep it buffered. LMDB always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] force If non-zero, force a synchronous flush. Otherwise + * if the environment has the #MDB_NOSYNC flag set the flushes + * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
  • EIO - an error occurred during synchronization. + *
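+ * An illustrative sketch (assuming an environment that was opened with
+ * #MDB_NOSYNC, so commits do not flush automatically):
+ *
+ *     mdb_env_sync(env, 1);  // force a synchronous flush now
+ *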
+ */ +int mdb_env_sync(MDB_env *env, int force); + + /** @brief Close the environment and release the memory map. + * + * Only a single thread may call this function. All transactions, databases, + * and cursors must already be closed before calling this function. Attempts to + * use any such handles after calling this function will cause a SIGSEGV. + * The environment handle will be freed and must not be used again after this call. + * @param[in] env An environment handle returned by #mdb_env_create() + */ +void mdb_env_close(MDB_env *env); + + /** @brief Set environment flags. + * + * This may be used to set some flags in addition to those from + * #mdb_env_open(), or to unset these flags. If several threads + * change the flags at the same time, the result is undefined. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] flags The flags to change, bitwise OR'ed together + * @param[in] onoff A non-zero value sets the flags, zero clears them. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
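+ * An illustrative sketch (env is an already-open environment handle):
+ *
+ *     mdb_env_set_flags(env, MDB_NOSYNC, 1);  // set the flag
+ *     mdb_env_set_flags(env, MDB_NOSYNC, 0);  // clear it again
+ *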
+ */ +int mdb_env_set_flags(MDB_env *env, unsigned int flags, int onoff); + + /** @brief Get environment flags. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] flags The address of an integer to store the flags + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_flags(MDB_env *env, unsigned int *flags); + + /** @brief Return the path that was used in #mdb_env_open(). + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] path Address of a string pointer to contain the path. This + * is the actual string in the environment, not a copy. It should not be + * altered in any way. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_path(MDB_env *env, const char **path); + + /** @brief Return the filedescriptor for the given environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); + + /** @brief Set the size of the memory map to use for this environment. + * + * The size should be a multiple of the OS page size. The default is + * 10485760 bytes. The size of the memory map is also the maximum size + * of the database. The value should be chosen as large as possible, + * to accommodate future growth of the database. + * This function should be called after #mdb_env_create() and before #mdb_env_open(). + * It may be called at later times if no transactions are active in + * this process. Note that the library does not check for this condition, + * the caller must ensure it explicitly. + * + * The new size takes effect immediately for the current process but + * will not be persisted to any others until a write transaction has been + * committed by the current process. Also, only mapsize increases are + * persisted into the environment. + * + * If the mapsize is increased by another process, and data has grown + * beyond the range of the current mapsize, #mdb_txn_begin() will + * return #MDB_MAP_RESIZED. This function may be called with a size + * of zero to adopt the new size. + * + * Any attempt to set a size smaller than the space already consumed + * by the environment will be silently changed to the current size of the used space. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] size The size in bytes + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment has + * an active write transaction. + *
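+ * An illustrative sketch (the 1 GiB figure and the path are arbitrary
+ * example values):
+ *
+ *     mdb_env_create(&env);
+ *     mdb_env_set_mapsize(env, (size_t)1 << 30);  // 1 GiB map = max DB size
+ *     mdb_env_open(env, "./testdb", 0, 0664);
+ *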
+ */ +int mdb_env_set_mapsize(MDB_env *env, size_t size); + + /** @brief Set the maximum number of threads/reader slots for the environment. + * + * This defines the number of slots in the lock table that is used to track readers in the + * environment. The default is 126. + * Starting a read-only transaction normally ties a lock table slot to the + * current thread until the environment closes or the thread exits. If + * MDB_NOTLS is in use, #mdb_txn_begin() instead ties the slot to the + * MDB_txn object until it or the #MDB_env object is destroyed. + * This function may only be called after #mdb_env_create() and before #mdb_env_open(). + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] readers The maximum number of reader lock table slots + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is already open. + *
+ */ +int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers); + + /** @brief Get the maximum number of threads/reader slots for the environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] readers Address of an integer to store the number of readers + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers); + + /** @brief Set the maximum number of named databases for the environment. + * + * This function is only needed if multiple databases will be used in the + * environment. Simpler applications that use the environment as a single + * unnamed database can ignore this option. + * This function may only be called after #mdb_env_create() and before #mdb_env_open(). + * + * Currently a moderate number of slots are cheap but a huge number gets + * expensive: 7-120 words per transaction, and every #mdb_dbi_open() + * does a linear search of the opened slots. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] dbs The maximum number of databases + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is already open. + *
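+ * An illustrative sketch combining this call with #mdb_env_set_maxreaders();
+ * the counts are arbitrary example values and both calls must precede
+ * #mdb_env_open():
+ *
+ *     mdb_env_create(&env);
+ *     mdb_env_set_maxreaders(env, 256);  // reader slots in the lock table
+ *     mdb_env_set_maxdbs(env, 4);        // up to four named databases
+ *     mdb_env_open(env, "./testdb", 0, 0664);
+ *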
+ */ +int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); + + /** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. + * + * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. + * See @ref MDB_val. + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The maximum size of a key we can write + */ +int mdb_env_get_maxkeysize(MDB_env *env); + + /** @brief Set application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_userctx(MDB_env *env, void *ctx); + + /** @brief Get the application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The pointer set by #mdb_env_set_userctx(). + */ +void *mdb_env_get_userctx(MDB_env *env); + + /** @brief A callback function for most LMDB assert() failures, + * called before printing the message and aborting. + * + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] msg The assertion message, not including newline. + */ +typedef void MDB_assert_func(MDB_env *env, const char *msg); + + /** Set or reset the assert() callback of the environment. + * Disabled if liblmdb is built with NDEBUG. + * @note This hack should become obsolete as lmdb's error handling matures. + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] func An #MDB_assert_func function, or 0. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); + + /** @brief Create a transaction for use with the environment. + * + * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). + * @note A transaction and its cursors must only be used by a single + * thread, and a thread may only have a single transaction at a time. + * If #MDB_NOTLS is in use, this does not apply to read-only transactions. + * @note Cursors may not span transactions. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] parent If this parameter is non-NULL, the new transaction + * will be a nested transaction, with the transaction indicated by \b parent + * as its parent. Transactions may be nested to any level. A parent + * transaction and its cursors may not issue any other operations than + * mdb_txn_commit and mdb_txn_abort while it has active child transactions. + * @param[in] flags Special options for this transaction. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_RDONLY + * This transaction will not perform any write operations. + *
+ * @param[out] txn Address where the new #MDB_txn handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • #MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's + * mapsize and this environment's map must be resized as well. + * See #mdb_env_set_mapsize(). + *
  • #MDB_READERS_FULL - a read-only transaction was requested and + * the reader lock table is full. See #mdb_env_set_maxreaders(). + *
  • ENOMEM - out of memory. + *
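+ * A minimal write-transaction sketch (error checking omitted; env is an
+ * open environment handle):
+ *
+ *     MDB_txn *txn;
+ *     mdb_txn_begin(env, NULL, 0, &txn);  // top-level write transaction
+ *     // ... mdb_put()/mdb_del() calls go here ...
+ *     mdb_txn_commit(txn);                // or mdb_txn_abort(txn)
+ *
+ * A read-only transaction is begun the same way with #MDB_RDONLY in flags.
+ *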
+ */ +int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn); + + /** @brief Returns the transaction's #MDB_env + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +MDB_env *mdb_txn_env(MDB_txn *txn); + + /** @brief Commit all the operations of a transaction into the database. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdb_cursor_renew(). + * @note Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
  • ENOSPC - no more disk space. + *
  • EIO - a low-level I/O error occurred while writing. + *
  • ENOMEM - out of memory. + *
+ */ +int mdb_txn_commit(MDB_txn *txn); + + /** @brief Abandon all the operations of the transaction instead of saving them. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdb_cursor_renew(). + * @note Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +void mdb_txn_abort(MDB_txn *txn); + + /** @brief Reset a read-only transaction. + * + * Abort the transaction like #mdb_txn_abort(), but keep the transaction + * handle. #mdb_txn_renew() may reuse the handle. This saves allocation + * overhead if the process will start a new read-only transaction soon, + * and also locking overhead if #MDB_NOTLS is in use. The reader table + * lock is released, but the table slot stays tied to its thread or + * #MDB_txn. Use mdb_txn_abort() to discard a reset handle, and to free + * its lock table slot if MDB_NOTLS is in use. + * Cursors opened within the transaction must not be used + * again after this call, except with #mdb_cursor_renew(). + * Reader locks generally don't interfere with writers, but they keep old + * versions of database pages allocated. Thus they prevent the old pages + * from being reused when writers commit new data, and so under heavy load + * the database size may grow much more rapidly than otherwise. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +void mdb_txn_reset(MDB_txn *txn); + + /** @brief Renew a read-only transaction. + * + * This acquires a new reader lock for a transaction handle that had been + * released by #mdb_txn_reset(). It must be called before a reset transaction + * may be used again. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • EINVAL - an invalid parameter was specified. + *
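+ * An illustrative reset/renew cycle for a long-lived read-only handle:
+ *
+ *     MDB_txn *rtxn;
+ *     mdb_txn_begin(env, NULL, MDB_RDONLY, &rtxn);
+ *     // ... read ...
+ *     mdb_txn_reset(rtxn);  // keep the handle, release the snapshot
+ *     // ... later ...
+ *     mdb_txn_renew(rtxn);  // reuse the handle with a fresh snapshot
+ *     // ... read again ...
+ *     mdb_txn_abort(rtxn);  // finally free the handle
+ *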
+ */ +int mdb_txn_renew(MDB_txn *txn); + +/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ +#define mdb_open(txn,name,flags,dbi) mdb_dbi_open(txn,name,flags,dbi) +/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ +#define mdb_close(env,dbi) mdb_dbi_close(env,dbi) + + /** @brief Open a database in the environment. + * + * A database handle denotes the name and parameters of a database, + * independently of whether such a database exists. + * The database handle may be discarded by calling #mdb_dbi_close(). + * The old database handle is returned if the database was already open. + * The handle may only be closed once. + * The database handle will be private to the current transaction until + * the transaction is successfully committed. If the transaction is + * aborted the handle will be closed automatically. + * After a successful commit the + * handle will reside in the shared environment, and may be used + * by other transactions. This function must not be called from + * multiple concurrent transactions. A transaction that uses this function + * must finish (either commit or abort) before any other transaction may + * use this function. + * + * To use named databases (with name != NULL), #mdb_env_set_maxdbs() + * must be called before opening the environment. Database names + * are kept as keys in the unnamed database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] name The name of the database to open. If only a single + * database is needed in the environment, this value may be NULL. + * @param[in] flags Special options for this database. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_REVERSEKEY + * Keys are strings to be compared in reverse order, from the end + * of the strings to the beginning. By default, Keys are treated as strings and + * compared from beginning to end. + *
  • #MDB_DUPSORT + * Duplicate keys may be used in the database. (Or, from another perspective, + * keys may have multiple data items, stored in sorted order.) By default + * keys must be unique and may have only a single data item. + *
  • #MDB_INTEGERKEY + * Keys are binary integers in native byte order. Setting this option + * requires all keys to be the same size, typically sizeof(int) + * or sizeof(size_t). + *
  • #MDB_DUPFIXED + * This flag may only be used in combination with #MDB_DUPSORT. This option + * tells the library that the data items for this database are all the same + * size, which allows further optimizations in storage and retrieval. When + * all data items are the same size, the #MDB_GET_MULTIPLE and #MDB_NEXT_MULTIPLE + * cursor operations may be used to retrieve multiple items at once. + *
  • #MDB_INTEGERDUP + * This option specifies that duplicate data items are also integers, and + * should be sorted as such. + *
  • #MDB_REVERSEDUP + * This option specifies that duplicate data items should be compared as + * strings in reverse order. + *
  • #MDB_CREATE + * Create the named database if it doesn't exist. This option is not + * allowed in a read-only transaction or a read-only environment. + *
+ * @param[out] dbi Address where the new #MDB_dbi handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the specified database doesn't exist in the environment + * and #MDB_CREATE was not specified. + *
  • #MDB_DBS_FULL - too many databases have been opened. See #mdb_env_set_maxdbs(). + *
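+ * An illustrative sketch (the database name "mydb" is a placeholder and
+ * requires a prior #mdb_env_set_maxdbs() call):
+ *
+ *     MDB_dbi dbi;
+ *     mdb_dbi_open(txn, "mydb", MDB_CREATE, &dbi);
+ *
+ * Passing NULL instead of a name opens the single unnamed database.
+ *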
+ */ +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi); + + /** @brief Retrieve statistics for a database. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); + + /** @brief Retrieve the DB flags for a database handle. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] flags Address where the flags will be returned. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags); + + /** @brief Close a database handle. Normally unnecessary. Use with care: + * + * This call is not mutex protected. Handles should only be closed by + * a single thread, and only if no other threads are going to reference + * the database handle or one of its cursors any further. Do not close + * a handle if an existing transaction has modified its database. + * Doing so can cause misbehavior from database corruption to errors + * like MDB_BAD_VALSIZE (since the DB name is gone). + * + * Closing a database handle is not necessary, but lets #mdb_dbi_open() + * reuse the handle value. Usually it's better to set a bigger + * #mdb_env_set_maxdbs(), unless that value would be large. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + */ +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); + + /** @brief Empty or delete+close a database. + * + * See #mdb_dbi_close() for restrictions about closing the DB handle. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] del 0 to empty the DB, 1 to delete it from the + * environment and close the DB handle. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); + + /** @brief Set a custom key comparison function for a database. + * + * The comparison function is called whenever it is necessary to compare a + * key specified by the application with a key currently stored in the database. + * If no comparison function is specified, and no special key flags were specified + * with #mdb_dbi_open(), the keys are compared lexically, with shorter keys collating + * before longer keys. + * @warning This function must be called before any data access functions are used, + * otherwise data corruption may occur. The same comparison function must be used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + + /** @brief Set a custom data comparison function for a #MDB_DUPSORT database. + * + * This comparison function is called whenever it is necessary to compare a data + * item specified by the application with a data item currently stored in the database. + * This function only takes effect if the database was opened with the #MDB_DUPSORT + * flag. + * If no comparison function is specified, and no special key flags were specified + * with #mdb_dbi_open(), the data items are compared lexically, with shorter items collating + * before longer items. + * @warning This function must be called before any data access functions are used, + * otherwise data corruption may occur. The same comparison function must be used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + + /** @brief Set a relocation function for a #MDB_FIXEDMAP database. + * + * @todo The relocation function is called whenever it is necessary to move the data + * of an item to a different position in the database (e.g. through tree + * balancing operations, shifts as a result of adds or deletes, etc.). It is + * intended to allow address/position-dependent data items to be stored in + * a database in an environment opened with the #MDB_FIXEDMAP option. + * Currently the relocation feature is unimplemented and setting + * this function has no effect. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] rel A #MDB_rel_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); + + /** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation function. + * + * See #mdb_set_relfunc and #MDB_rel_func for more details. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * It will be passed to the callback function set by #mdb_set_relfunc + * as its \b relctx parameter whenever the callback is invoked. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); + + /** @brief Get items from a database. + * + * This function retrieves key/data pairs from the database. The address + * and length of the data associated with the specified \b key are returned + * in the structure to which \b data refers. + * If the database supports duplicate keys (#MDB_DUPSORT) then the + * first data item for the key will be returned. Retrieval of other + * items requires the use of #mdb_cursor_get(). + * + * @note The memory pointed to by the returned values is owned by the + * database. The caller need not dispose of the memory, and may not + * modify it in any way. For values returned in a read-only transaction + * any modification attempts will cause a SIGSEGV. + * @note Values returned from the database are valid only until a + * subsequent update operation, or the end of the transaction. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to search for in the database + * @param[out] data The data corresponding to the key + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the key was not in the database. + *
  • EINVAL - an invalid parameter was specified. + *
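+ * An illustrative sketch (the key bytes are placeholders; txn and dbi come
+ * from #mdb_txn_begin() and #mdb_dbi_open()):
+ *
+ *     MDB_val key, data;
+ *     key.mv_size = 5;
+ *     key.mv_data = "hello";
+ *     if (mdb_get(txn, dbi, &key, &data) == MDB_SUCCESS) {
+ *         // data.mv_data/data.mv_size point into the map: treat as read-only
+ *     }
+ *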
+ */ +int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + + /** @brief Store items into a database. + * + * This function stores key/data pairs in the database. The default behavior + * is to enter the new key/data pair, replacing any previously existing key + * if duplicates are disallowed, or adding a duplicate data item if + * duplicates are allowed (#MDB_DUPSORT). + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to store in the database + * @param[in,out] data The data to store + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with #MDB_DUPSORT. The function will + * return #MDB_KEYEXIST if the key/data pair already appears in the + * database. + *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will return + * #MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (#MDB_DUPSORT). The \b data + * parameter will be set to point to the existing item. + *
  • #MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. + * LMDB does nothing else with this memory, the caller is expected + * to modify all of the space requested. + *
  • #MDB_APPEND - append the given key/data pair to the end of the + * database. No key comparisons are performed. This option allows + * fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * data corruption. + *
  • #MDB_APPENDDUP - as above, but for sorted dup data. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
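+ * An illustrative sketch (key/value bytes are placeholders; #MDB_val is
+ * laid out as { mv_size, mv_data }):
+ *
+ *     MDB_val key  = { 5, "hello" };
+ *     MDB_val data = { 5, "world" };
+ *     int rc = mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE);
+ *     // rc is MDB_KEYEXIST if the key was already present
+ *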
+ */ +int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned int flags); + + /** @brief Delete items from a database. + * + * This function removes key/data pairs from the database. + * If the database does not support sorted duplicate data items + * (#MDB_DUPSORT) the data parameter is ignored. + * If the database supports sorted duplicates and the data parameter + * is NULL, all of the duplicate data items for the key will be + * deleted. Otherwise, if the data parameter is non-NULL + * only the matching data item will be deleted. + * This function will return #MDB_NOTFOUND if the specified key/data + * pair is not in the database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to delete from the database + * @param[in] data The data to delete + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
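+ * An illustrative sketch (key as in the #mdb_put() sketch above):
+ *
+ *     mdb_del(txn, dbi, &key, NULL);  // NULL data: drop all duplicates too
+ *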
+ */ +int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + + /** @brief Create a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * A cursor cannot be used when its database handle is closed. Nor + * when its transaction has ended, except with #mdb_cursor_renew(). + * It can be discarded with #mdb_cursor_close(). + * A cursor in a write-transaction can be closed before its transaction + * ends, and will otherwise be closed when its transaction ends. + * A cursor in a read-only transaction must be closed explicitly, before + * or after its transaction ends. It can be reused with + * #mdb_cursor_renew() before finally closing it. + * @note Earlier documentation said that cursors in every transaction + * were closed when the transaction committed or aborted. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] cursor Address where the new #MDB_cursor handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); + + /** @brief Close a cursor handle. + * + * The cursor handle will be freed and must not be used again after this call. + * Its transaction must still be live if it is a write-transaction. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +void mdb_cursor_close(MDB_cursor *cursor); + + /** @brief Renew a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * Cursors that are only used in read-only + * transactions may be re-used, to avoid unnecessary malloc/free overhead. + * The cursor may be associated with a new read-only transaction, and + * referencing the same database handle as it was created with. + * This may be done whether the previous transaction is live or dead. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); + + /** @brief Return the cursor's transaction handle. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +MDB_txn *mdb_cursor_txn(MDB_cursor *cursor); + + /** @brief Return the cursor's database handle. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +MDB_dbi mdb_cursor_dbi(MDB_cursor *cursor); + + /** @brief Retrieve by cursor. + * + * This function retrieves key/data pairs from the database. The address and length + * of the key are returned in the object to which \b key refers (except for the + * case of the #MDB_SET option, in which the \b key object is unchanged), and + * the address and length of the data are returned in the object to which \b data + * refers. + * See #mdb_get() for restrictions on using the output values. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in,out] key The key for a retrieved item + * @param[in,out] data The data of a retrieved item + * @param[in] op A cursor operation #MDB_cursor_op + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - no matching key found. + *
  • EINVAL - an invalid parameter was specified. + *
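+ * An illustrative full-scan sketch (error checking omitted; a freshly
+ * opened cursor positioned with #MDB_NEXT starts at the first item):
+ *
+ *     MDB_cursor *cur;
+ *     MDB_val k, v;
+ *     mdb_cursor_open(txn, dbi, &cur);
+ *     while (mdb_cursor_get(cur, &k, &v, MDB_NEXT) == MDB_SUCCESS) {
+ *         // visit k and v
+ *     }
+ *     mdb_cursor_close(cur);
+ *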
+ */ +int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op); + + /** @brief Store by cursor. + * + * This function stores key/data pairs into the database. + * The cursor is positioned at the new item, or on failure usually near it. + * @note Earlier documentation incorrectly said errors would leave the + * state of the cursor unchanged. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in] key The key operated on. + * @param[in] data The data operated on. + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
    + *
  • #MDB_CURRENT - replace the item at the current cursor position. + * The \b key parameter must still be provided, and must match it. + * If using sorted duplicates (#MDB_DUPSORT) the data item must still + * sort into the same place. This is intended to be used when the + * new data is the same size as the old. Otherwise it will simply + * perform a delete of the old record followed by an insert. + *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with #MDB_DUPSORT. The function will + * return #MDB_KEYEXIST if the key/data pair already appears in the + * database. + *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will return + * #MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (#MDB_DUPSORT). + *
  • #MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later. This saves + * an extra memcpy if the data is being generated later. + *
  • #MDB_APPEND - append the given key/data pair to the end of the + * database. No key comparisons are performed. This option allows + * fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * data corruption. + *
  • #MDB_APPENDDUP - as above, but for sorted dup data. + *
  • #MDB_MULTIPLE - store multiple contiguous data elements in a + * single request. This flag may only be specified if the database + * was opened with #MDB_DUPFIXED. The \b data argument must be an + * array of two MDB_vals. The mv_size of the first MDB_val must be + * the size of a single data element. The mv_data of the first MDB_val + * must point to the beginning of the array of contiguous data elements. + * The mv_size of the second MDB_val must be the count of the number + * of data elements to store. On return this field will be set to + * the count of the number of elements actually written. The mv_data + * of the second MDB_val is unused. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. + *
  • EACCES - an attempt was made to modify a read-only database. + *
  • EINVAL - an invalid parameter was specified. + *
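+ * An illustrative bulk-load sketch (keys[], vals[] and n are hypothetical;
+ * the keys are assumed to be already sorted, as #MDB_APPEND requires):
+ *
+ *     mdb_cursor_open(txn, dbi, &cur);
+ *     for (size_t i = 0; i < n; i++)
+ *         mdb_cursor_put(cur, &keys[i], &vals[i], MDB_APPEND);
+ *     mdb_cursor_close(cur);
+ *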
+ */ +int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + unsigned int flags); + + /** @brief Delete current key/data pair + * + * This function deletes the key/data pair to which the cursor refers. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
    + *
  • #MDB_NODUPDATA - delete all of the data items for the current key. + * This flag may only be specified if the database was opened with #MDB_DUPSORT. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - an attempt was made to modify a read-only database. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags); + + /** @brief Return count of duplicates for current key. + * + * This call is only valid on databases that support sorted duplicate + * data items #MDB_DUPSORT. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[out] countp Address where the count will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - cursor is not initialized, or an invalid parameter was specified. + *
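+ * An illustrative sketch (cur is a cursor positioned on some key of an
+ * #MDB_DUPSORT database):
+ *
+ *     size_t ndup;
+ *     mdb_cursor_count(cur, &ndup);  // number of data items for this key
+ *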
+ */ +int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); + + /** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two data items were keys in the + * specified database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + + /** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two items were data items of + * the specified database. The database must have the #MDB_DUPSORT flag. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + + /** @brief A callback function used to print a message from the library. + * + * @param[in] msg The string to be printed. + * @param[in] ctx An arbitrary context pointer for the callback. + * @return < 0 on failure, >= 0 on success. + */ +typedef int (MDB_msg_func)(const char *msg, void *ctx); + + /** @brief Dump the entries in the reader lock table. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] func A #MDB_msg_func function + * @param[in] ctx Anything the message function needs + * @return < 0 on failure, >= 0 on success. + */ +int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); + + /** @brief Check for stale entries in the reader lock table. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] dead Number of stale slots that were cleared + * @return 0 on success, non-zero on failure. + */ +int mdb_reader_check(MDB_env *env, int *dead); +/** @} */ + +#ifdef __cplusplus +} +#endif +/** @page tools LMDB Command Line Tools + The following describes the command line tools that are available for LMDB. + \li \ref mdb_copy_1 + \li \ref mdb_dump_1 + \li \ref mdb_load_1 + \li \ref mdb_stat_1 +*/ + +#endif /* _LMDB_H_ */ diff --git a/vendor/github.com/szferi/gomdb/mdb.c b/vendor/github.com/szferi/gomdb/mdb.c new file mode 100644 index 000000000000..c87886da074d --- /dev/null +++ b/vendor/github.com/szferi/gomdb/mdb.c @@ -0,0 +1,9364 @@ +/** @file mdb.c + * @brief Lightning memory-mapped database library + * + * A Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. + */ +/* + * Copyright 2011-2014 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + * + * This code is derived from btree.c written by Martin Hedenfalk. + * + * Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#ifdef _WIN32 +#include +#include +/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it + * as int64 which is wrong. MSVC doesn't define it at all, so just + * don't use it. + */ +#define MDB_PID_T int +#define MDB_THR_T DWORD +#include +#include +#ifdef __GNUC__ +# include +#else +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# define BYTE_ORDER LITTLE_ENDIAN +# ifndef SSIZE_MAX +# define SSIZE_MAX INT_MAX +# endif +#endif +#else +#include +#include +#define MDB_PID_T pid_t +#define MDB_THR_T pthread_t +#include +#include +#include +#ifdef HAVE_SYS_FILE_H +#include +#endif +#include +#endif + +#if defined(__mips) && defined(__linux) +/* MIPS has cache coherency issues, requires explicit cache control */ +#include +extern int cacheflush(char *addr, int nbytes, int cache); +#define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) +#else +#define CACHEFLUSH(addr, bytes, cache) +#endif + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__sun) +/* Most platforms have posix_memalign, older may only have memalign */ +#define HAVE_MEMALIGN 1 +#include +#endif + +#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) +#include +#include /* defines BYTE_ORDER on HPUX and Solaris */ +#endif + +#if defined(__APPLE__) || defined (BSD) +# define MDB_USE_POSIX_SEM 1 +# define MDB_FDATASYNC fsync +#elif defined(ANDROID) +# define MDB_FDATASYNC fsync +#endif + +#ifndef _WIN32 +#include +#ifdef MDB_USE_POSIX_SEM +# define MDB_USE_HASH 1 +#include +#endif +#endif + +#ifdef USE_VALGRIND +#include +#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) +#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) +#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) +#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) +#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) +#else +#define VGMEMP_CREATE(h,r,z) +#define VGMEMP_ALLOC(h,a,s) +#define VGMEMP_FREE(h,a) +#define VGMEMP_DESTROY(h) +#define VGMEMP_DEFINED(a,s) +#endif + +#ifndef BYTE_ORDER +# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) +/* Solaris just defines one or the other */ +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# ifdef _LITTLE_ENDIAN +# define BYTE_ORDER LITTLE_ENDIAN +# else +# define BYTE_ORDER BIG_ENDIAN +# endif +# else +# define BYTE_ORDER __BYTE_ORDER +# endif +#endif + +#ifndef LITTLE_ENDIAN +#define LITTLE_ENDIAN __LITTLE_ENDIAN +#endif +#ifndef BIG_ENDIAN +#define BIG_ENDIAN __BIG_ENDIAN +#endif + +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +#define MISALIGNED_OK 1 +#endif + +#include "lmdb.h" +#include "midl.h" + +#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) +# error "Unknown or unsupported endianness (BYTE_ORDER)" +#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +# error "Two's complement, reasonably sized integer types, 
please" +#endif + +#ifdef __GNUC__ +/** Put infrequently used env functions in separate section */ +# ifdef __APPLE__ +# define ESECT __attribute__ ((section("__TEXT,text_env"))) +# else +# define ESECT __attribute__ ((section("text_env"))) +# endif +#else +#define ESECT +#endif + +/** @defgroup internal LMDB Internals + * @{ + */ +/** @defgroup compat Compatibility Macros + * A bunch of macros to minimize the amount of platform-specific ifdefs + * needed throughout the rest of the code. When the features this library + * needs are similar enough to POSIX to be hidden in a one-or-two line + * replacement, this macro approach is used. + * @{ + */ + + /** Features under development */ +#ifndef MDB_DEVEL +#define MDB_DEVEL 0 +#endif + + /** Wrapper around __func__, which is a C99 feature */ +#if __STDC_VERSION__ >= 199901L +# define mdb_func_ __func__ +#elif __GNUC__ >= 2 || _MSC_VER >= 1300 +# define mdb_func_ __FUNCTION__ +#else +/* If a debug message says (), update the #if statements above */ +# define mdb_func_ "" +#endif + +#ifdef _WIN32 +#define MDB_USE_HASH 1 +#define MDB_PIDLOCK 0 +#define THREAD_RET DWORD +#define pthread_t HANDLE +#define pthread_mutex_t HANDLE +#define pthread_cond_t HANDLE +#define pthread_key_t DWORD +#define pthread_self() GetCurrentThreadId() +#define pthread_key_create(x,y) \ + ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) +#define pthread_key_delete(x) TlsFree(x) +#define pthread_getspecific(x) TlsGetValue(x) +#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) +#define pthread_mutex_unlock(x) ReleaseMutex(*x) +#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) +#define pthread_cond_signal(x) SetEvent(*x) +#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) +#define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL) +#define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE) +#define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_rmutex) +#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_rmutex) +#define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_wmutex) +#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_wmutex) +#define getpid() GetCurrentProcessId() +#define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) +#define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) +#define ErrCode() GetLastError() +#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} +#define close(fd) (CloseHandle(fd) ? 
0 : -1) +#define munmap(ptr,len) UnmapViewOfFile(ptr) +#ifdef PROCESS_QUERY_LIMITED_INFORMATION +#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION +#else +#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 +#endif +#define Z "I" +#else +#define THREAD_RET void * +#define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) +#define THREAD_FINISH(thr) pthread_join(thr,NULL) +#define Z "z" /**< printf format modifier for size_t */ + + /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ +#define MDB_PIDLOCK 1 + +#ifdef MDB_USE_POSIX_SEM + +#define LOCK_MUTEX_R(env) mdb_sem_wait((env)->me_rmutex) +#define UNLOCK_MUTEX_R(env) sem_post((env)->me_rmutex) +#define LOCK_MUTEX_W(env) mdb_sem_wait((env)->me_wmutex) +#define UNLOCK_MUTEX_W(env) sem_post((env)->me_wmutex) + +static int +mdb_sem_wait(sem_t *sem) +{ + int rc; + while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; + return rc; +} + +#else + /** Lock the reader mutex. + */ +#define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_txns->mti_mutex) + /** Unlock the reader mutex. + */ +#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_txns->mti_mutex) + + /** Lock the writer mutex. + * Only a single write transaction is allowed at a time. Other writers + * will block waiting for this mutex. + */ +#define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_txns->mti_wmutex) + /** Unlock the writer mutex. + */ +#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_txns->mti_wmutex) +#endif /* MDB_USE_POSIX_SEM */ + + /** Get the error code for the last failed system function. + */ +#define ErrCode() errno + + /** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#define HANDLE int + + /** A value for an invalid file handle. + * Mainly used to initialize file variables and signify that they are + * unused. + */ +#define INVALID_HANDLE_VALUE (-1) + + /** Get the size of a memory page for the system. + * This is the basic size that the platform's memory manager uses, and is + * fundamental to the use of memory-mapped files. + */ +#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) +#endif + +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) +#define MNAME_LEN 32 +#else +#define MNAME_LEN (sizeof(pthread_mutex_t)) +#endif + +/** @} */ + +#ifndef _WIN32 +/** A flag for opening a file and requesting synchronous data writes. + * This is only used when writing a meta page. It's not strictly needed; + * we could just do a normal write and then immediately perform a flush. + * But if this flag is available it saves us an extra system call. + * + * @note If O_DSYNC is undefined but exists in /usr/include, + * preferably set some compiler flag to get the definition. + * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC. + */ +#ifndef MDB_DSYNC +# define MDB_DSYNC O_DSYNC +#endif +#endif + +/** Function for flushing the data of a file. Define this to fsync + * if fdatasync() is not supported. + */ +#ifndef MDB_FDATASYNC +# define MDB_FDATASYNC fdatasync +#endif + +#ifndef MDB_MSYNC +# define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) +#endif + +#ifndef MS_SYNC +#define MS_SYNC 1 +#endif + +#ifndef MS_ASYNC +#define MS_ASYNC 0 +#endif + + /** A page number in the database. 
+ * Note that 64 bit page numbers are overkill, since pages themselves + * already represent 12-13 bits of addressable memory, and the OS will + * always limit applications to a maximum of 63 bits of address space. + * + * @note In the #MDB_node structure, we only store 48 bits of this value, + * which thus limits us to only 60 bits of addressable data. + */ +typedef MDB_ID pgno_t; + + /** A transaction ID. + * See struct MDB_txn.mt_txnid for details. + */ +typedef MDB_ID txnid_t; + +/** @defgroup debug Debug Macros + * @{ + */ +#ifndef MDB_DEBUG + /** Enable debug output. Needs variable argument macros (a C99 feature). + * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs + * read from and written to the database (used for free space management). + */ +#define MDB_DEBUG 0 +#endif + +#if MDB_DEBUG +static int mdb_debug; +static txnid_t mdb_debug_start; + + /** Print a debug message with printf formatting. + * Requires double parenthesis around 2 or more args. + */ +# define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args)) +# define DPRINTF0(fmt, ...) \ + fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) +#else +# define DPRINTF(args) ((void) 0) +#endif + /** Print a debug string. + * The string is printed literally, with no format processing. + */ +#define DPUTS(arg) DPRINTF(("%s", arg)) + /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) +/** @} */ + + /** @brief The maximum size of a database page. + * + * It is 32k or 64k, since value-PAGEBASE must fit in + * #MDB_page.%mp_upper. + * + * LMDB will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. + * + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. So until OSs have + * actual paging support for Huge pages, they're not viable. + */ +#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) + + /** The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. + */ +#define MDB_MINKEYS 2 + + /** A stamp that identifies a file as an LMDB file. + * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches. + */ +#define MDB_MAGIC 0xBEEFC0DE + + /** The version number for a database's datafile format. */ +#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) + /** The version number for a database's lockfile format. */ +#define MDB_LOCK_VERSION 1 + + /** @brief The max size of a key we can write, or 0 for dynamic max. 
+ * + * Define this as 0 to compute the max from the page size. 511 + * is default for backwards compat: liblmdb <= 0.9.10 can break + * when modifying a DB with keys/dupsort data bigger than its max. + * #MDB_DEVEL sets the default to 0. + * + * Data items in an #MDB_DUPSORT database are also limited to + * this size, since they're actually keys of a sub-DB. Keys and + * #MDB_DUPSORT data items must fit on a node in a regular page. + */ +#ifndef MDB_MAXKEYSIZE +#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) +#endif + + /** The maximum size of a key we can write to the environment. */ +#if MDB_MAXKEYSIZE +#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) +#else +#define ENV_MAXKEY(env) ((env)->me_maxkey) +#endif + + /** @brief The maximum size of a data item. + * + * We only store a 32 bit value for node sizes. + */ +#define MAXDATASIZE 0xffffffffUL + +#if MDB_DEBUG + /** Key size which fits in a #DKBUF. + * @ingroup debug + */ +#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) + /** A key buffer. + * @ingroup debug + * This is used for printing a hex dump of a key's contents. + */ +#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] + /** Display a key in hex. + * @ingroup debug + * Invoke a function to display a key in hex. + */ +#define DKEY(x) mdb_dkey(x, kbuf) +#else +#define DKBUF +#define DKEY(x) 0 +#endif + + /** An invalid page number. + * Mainly used to denote an empty tree. + */ +#define P_INVALID (~(pgno_t)0) + + /** Test if the flags \b f are set in a flag word \b w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + + /** Round \b n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + + /** Used for offsets within a single page. + * Since memory pages are typically 4 or 8KB in size, 12-13 bits, + * this is plenty. + */ +typedef uint16_t indx_t; + + /** Default size of memory map. + * This is certainly too small for any actual applications. Apps should always set + * the size explicitly using #mdb_env_set_mapsize(). + */ +#define DEFAULT_MAPSIZE 1048576 + +/** @defgroup readers Reader Lock Table + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent read + * transactions started by the same thread need no further locking to proceed. + * + * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. + * + * No reader table is used if the database is on a read-only filesystem, or + * if #MDB_NOLOCK is set. + * + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. + * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. 
Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. + * @{ + */ + /** Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. 126 readers plus a + * couple mutexes fit exactly into 8KB on my development machine. + * Applications should set the table size using #mdb_env_set_maxreaders(). + */ +#define DEFAULT_READERS 126 + + /** The size of a CPU cache line in bytes. We want our lock structures + * aligned to this size to avoid false cache line sharing in the + * lock table. + * This value works for most CPUs. For Itanium this should be 128. + */ +#ifndef CACHELINE +#define CACHELINE 64 +#endif + + /** The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * @note We currently don't check for stale records. We simply re-init + * the table when we know that we're the only process opening the + * lock file. + */ +typedef struct MDB_rxbody { + /** Current Transaction ID when this transaction began, or (txnid_t)-1. + * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. + */ + txnid_t mrb_txnid; + /** The process ID of the process owning this reader txn. */ + MDB_PID_T mrb_pid; + /** The thread ID of the thread owning this txn. */ + MDB_THR_T mrb_tid; +} MDB_rxbody; + + /** The actual reader record, with cacheline padding. */ +typedef struct MDB_reader { + union { + MDB_rxbody mrx; + /** shorthand for mrb_txnid */ +#define mr_txnid mru.mrx.mrb_txnid +#define mr_pid mru.mrx.mrb_pid +#define mr_tid mru.mrx.mrb_tid + /** cache line alignment */ + char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; + } mru; +} MDB_reader; + + /** The header for the reader table. + * The table resides in a memory-mapped file. (This is a different file + * than is used for the main database.) + * + * For POSIX the actual mutexes reside in the shared memory of this + * mapped file. On Windows, mutexes are named objects allocated by the + * kernel; we store the mutex names in this mapped file so that other + * processes can grab them. This same approach is also used on + * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support + * process-shared POSIX mutexes. For these cases where a named object + * is used, the object name is derived from a 64 bit FNV hash of the + * environment pathname. As such, naming collisions are extremely + * unlikely. If a collision occurs, the results are unpredictable. 
+ */ +typedef struct MDB_txbody { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mtb_magic; + /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ + uint32_t mtb_format; +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + char mtb_rmname[MNAME_LEN]; +#else + /** Mutex protecting access to this table. + * This is the reader lock that #LOCK_MUTEX_R acquires. + */ + pthread_mutex_t mtb_mutex; +#endif + /** The ID of the last transaction committed to the database. + * This is recorded here only for convenience; the value can always + * be determined by reading the main database meta pages. + */ + txnid_t mtb_txnid; + /** The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. + */ + unsigned mtb_numreaders; +} MDB_txbody; + + /** The actual reader table definition. */ +typedef struct MDB_txninfo { + union { + MDB_txbody mtb; +#define mti_magic mt1.mtb.mtb_magic +#define mti_format mt1.mtb.mtb_format +#define mti_mutex mt1.mtb.mtb_mutex +#define mti_rmname mt1.mtb.mtb_rmname +#define mti_txnid mt1.mtb.mtb_txnid +#define mti_numreaders mt1.mtb.mtb_numreaders + char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; + } mt1; + union { +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + char mt2_wmname[MNAME_LEN]; +#define mti_wmname mt2.mt2_wmname +#else + pthread_mutex_t mt2_wmutex; +#define mti_wmutex mt2.mt2_wmutex +#endif + char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; + } mt2; + MDB_reader mti_readers[1]; +} MDB_txninfo; + + /** Lockfile format signature: version, features and field layout */ +#define MDB_LOCK_FORMAT \ + ((uint32_t) \ + ((MDB_LOCK_VERSION) \ + /* Flags which describe functionality */ \ + + (((MDB_PIDLOCK) != 0) << 16))) +/** @} */ + +/** Common header for all page types. + * Overflow records occupy a number of contiguous pages with no + * headers on any page after the first. + */ +typedef struct MDB_page { +#define mp_pgno mp_p.p_pgno +#define mp_next mp_p.p_next + union { + pgno_t p_pgno; /**< page number */ + struct MDB_page *p_next; /**< for in-memory list of freed pages */ + } mp_p; + uint16_t mp_pad; +/** @defgroup mdb_page Page Flags + * @ingroup internal + * Flags for the page headers. 
+ * @{ + */ +#define P_BRANCH 0x01 /**< branch page */ +#define P_LEAF 0x02 /**< leaf page */ +#define P_OVERFLOW 0x04 /**< overflow page */ +#define P_META 0x08 /**< meta page */ +#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ +#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ +#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ +#define P_KEEP 0x8000 /**< leave this page alone during spill */ +/** @} */ + uint16_t mp_flags; /**< @ref mdb_page */ +#define mp_lower mp_pb.pb.pb_lower +#define mp_upper mp_pb.pb.pb_upper +#define mp_pages mp_pb.pb_pages + union { + struct { + indx_t pb_lower; /**< lower bound of free space */ + indx_t pb_upper; /**< upper bound of free space */ + } pb; + uint32_t pb_pages; /**< number of overflow pages */ + } mp_pb; + indx_t mp_ptrs[1]; /**< dynamic size */ +} MDB_page; + + /** Size of the page header, excluding dynamic data at the end */ +#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) + + /** Address of first usable data byte in a page, after the header */ +#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) + + /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ +#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) + + /** Number of nodes on a page */ +#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1) + + /** The amount of space remaining in the page */ +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) + + /** The percentage of space used in the page, in tenths of a percent. */ +#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_psize - PAGEHDRSZ)) + /** The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. + */ +#define FILL_THRESHOLD 250 + + /** Test if a page is a leaf page */ +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) + /** Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) + /** Test if a page is a branch page */ +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) + /** Test if a page is an overflow page */ +#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) + /** Test if a page is a sub page */ +#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + + /** The number of overflow pages needed to store the given size. */ +#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) + + /** Link in #MDB_txn.%mt_loose_pgs list */ +#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) + + /** Header for a single key/data pair within a page. + * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. + * We guarantee 2-byte alignment for 'MDB_node's. + */ +typedef struct MDB_node { + /** lo and hi are used for data size on leaf nodes and for + * child pgno on branch nodes. On 64 bit platforms, flags + * is also used for pgno. (Branch nodes have no flags). + * They are in host byte order in case that lets some + * accesses be optimized into a 32-bit word access. + */ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short mn_lo, mn_hi; /**< part of data size or pgno */ +#else + unsigned short mn_hi, mn_lo; +#endif +/** @defgroup mdb_node Node Flags + * @ingroup internal + * Flags for node headers. 
+ * @{ + */ +#define F_BIGDATA 0x01 /**< data put on overflow page */ +#define F_SUBDATA 0x02 /**< data is a sub-database */ +#define F_DUPDATA 0x04 /**< data has duplicates */ + +/** valid flags for #mdb_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) + +/** @} */ + unsigned short mn_flags; /**< @ref mdb_node */ + unsigned short mn_ksize; /**< key size */ + char mn_data[1]; /**< key and data are appended here */ +} MDB_node; + + /** Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDB_node, mn_data) + + /** Bit position of top word in page number, for shifting mn_flags */ +#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) + + /** Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. + */ +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) + + /** Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. + */ +#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) + + /** Address of node \b i in page \b p */ +#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) + + /** Address of the key for the node */ +#define NODEKEY(node) (void *)((node)->mn_data) + + /** Address of the data for a node */ +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) + + /** Get the page number pointed to by a branch node */ +#define NODEPGNO(node) \ + ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ + (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) + /** Set the page number in a branch node */ +#define SETPGNO(node,pgno) do { \ + (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ + if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) + + /** Get the size of the data in a leaf node */ +#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) + /** Set the size of the data for a leaf node */ +#define SETDSZ(node,size) do { \ + (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) + /** The size of a key in a node */ +#define NODEKSZ(node) ((node)->mn_ksize) + + /** Copy a page number from src to dst */ +#ifdef MISALIGNED_OK +#define COPY_PGNO(dst,src) dst = src +#else +#if SIZE_MAX > 4294967295UL +#define COPY_PGNO(dst,src) do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d++ = *s++; \ + *d++ = *s++; \ + *d = *s; \ +} while (0) +#else +#define COPY_PGNO(dst,src) do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d = *s; \ +} while (0) +#endif +#endif + /** The address of a key in a LEAF2 page. + * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. + * There are no node headers, keys are stored contiguously. + */ +#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) + + /** Set the \b node's key into \b keyptr, if requested. */ +#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ + (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } + + /** Set the \b node's key into \b key. */ +#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } + + /** Information about a single database in the environment. 
*/ +typedef struct MDB_db { + uint32_t md_pad; /**< also ksize for LEAF2 pages */ + uint16_t md_flags; /**< @ref mdb_dbi_open */ + uint16_t md_depth; /**< depth of this tree */ + pgno_t md_branch_pages; /**< number of internal pages */ + pgno_t md_leaf_pages; /**< number of leaf pages */ + pgno_t md_overflow_pages; /**< number of overflow pages */ + size_t md_entries; /**< number of data items */ + pgno_t md_root; /**< the root page of this tree */ +} MDB_db; + + /** mdb_dbi_open flags */ +#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) +#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ + MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) + + /** Handle for the DB used to track free pages. */ +#define FREE_DBI 0 + /** Handle for the default DB. */ +#define MAIN_DBI 1 + + /** Meta page content. + * A meta page is the start point for accessing a database snapshot. + * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). + */ +typedef struct MDB_meta { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mm_magic; + /** Version number of this lock file. Must be set to #MDB_DATA_VERSION. */ + uint32_t mm_version; + void *mm_address; /**< address for fixed mapping */ + size_t mm_mapsize; /**< size of mmap region */ + MDB_db mm_dbs[2]; /**< first is free space, 2nd is main db */ + /** The size of pages used in this DB */ +#define mm_psize mm_dbs[0].md_pad + /** Any persistent environment flags. @ref mdb_env */ +#define mm_flags mm_dbs[0].md_flags + pgno_t mm_last_pg; /**< last used page in file */ + txnid_t mm_txnid; /**< txnid that committed this page */ +} MDB_meta; + + /** Buffer for a stack-allocated meta page. + * The members define size and alignment, and silence type + * aliasing warnings. They are not used directly; that could + * mean incorrectly using several union members in parallel. + */ +typedef union MDB_metabuf { + MDB_page mb_page; + struct { + char mm_pad[PAGEHDRSZ]; + MDB_meta mm_meta; + } mb_metabuf; +} MDB_metabuf; + + /** Auxiliary DB info. + * The information here is mostly static/read-only. There is + * only a single copy of this record in the environment. + */ +typedef struct MDB_dbx { + MDB_val md_name; /**< name of the database */ + MDB_cmp_func *md_cmp; /**< function for comparing keys */ + MDB_cmp_func *md_dcmp; /**< function for comparing data items */ + MDB_rel_func *md_rel; /**< user relocate function */ + void *md_relctx; /**< user-provided context for md_rel */ +} MDB_dbx; + + /** A database transaction. + * Every operation requires a transaction handle. + */ +struct MDB_txn { + MDB_txn *mt_parent; /**< parent of a nested txn */ + MDB_txn *mt_child; /**< nested txn under this txn */ + pgno_t mt_next_pgno; /**< next unallocated page */ + /** The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. + */ + txnid_t mt_txnid; + MDB_env *mt_env; /**< the DB environment */ + /** The list of pages that became unused during this transaction. + */ + MDB_IDL mt_free_pgs; + /** The list of loose pages that became unused and may be reused + * in this transaction, linked through #NEXT_LOOSE_PAGE(page). + */ + MDB_page *mt_loose_pgs; + /* #Number of loose pages (#mt_loose_pgs) */ + int mt_loose_count; + /** The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. 
page numbers in here are + * shifted left by 1, deleted slots have the LSB set. + */ + MDB_IDL mt_spill_pgs; + union { + /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + MDB_ID2L dirty_list; + /** For read txns: This thread/txn's reader table slot, or NULL. */ + MDB_reader *reader; + } mt_u; + /** Array of records for each DB known in the environment. */ + MDB_dbx *mt_dbxs; + /** Array of MDB_db records for each known DB */ + MDB_db *mt_dbs; + /** Array of sequence numbers for each DB handle */ + unsigned int *mt_dbiseqs; +/** @defgroup mt_dbflag Transaction DB Flags + * @ingroup internal + * @{ + */ +#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ +#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ +#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ +/** @} */ + /** In write txns, array of cursors for each DB */ + MDB_cursor **mt_cursors; + /** Array of flags for each DB */ + unsigned char *mt_dbflags; + /** Number of DB records in use. This number only ever increments; + * we don't decrement it when individual DB handles are closed. + */ + MDB_dbi mt_numdbs; + +/** @defgroup mdb_txn Transaction Flags + * @ingroup internal + * @{ + */ +#define MDB_TXN_RDONLY 0x01 /**< read-only transaction */ +#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ +#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ +#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ +/** @} */ + unsigned int mt_flags; /**< @ref mdb_txn */ + /** #dirty_list room: Array size - \#dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirty_list into mt_parent after freeing hidden mt_parent pages. + */ + unsigned int mt_dirty_room; +}; + +/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. + * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to + * raise this on a 64 bit machine. + */ +#define CURSOR_STACK 32 + +struct MDB_xcursor; + + /** Cursors are used for all DB operations. + * A cursor holds a path of (page pointer, key index) from the DB + * root to a position in the DB, plus other state. #MDB_DUPSORT + * cursors include an xcursor to the current data item. Write txns + * track their cursors and keep them up to date when data moves. + * Exception: An xcursor's pointer to a #P_SUBP page can be stale. + * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). + */ +struct MDB_cursor { + /** Next cursor on this DB in this txn */ + MDB_cursor *mc_next; + /** Backup of the original cursor if this cursor is a shadow */ + MDB_cursor *mc_backup; + /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ + struct MDB_xcursor *mc_xcursor; + /** The transaction that owns this cursor */ + MDB_txn *mc_txn; + /** The database handle this cursor operates on */ + MDB_dbi mc_dbi; + /** The database record for this cursor */ + MDB_db *mc_db; + /** The database auxiliary record for this cursor */ + MDB_dbx *mc_dbx; + /** The @ref mt_dbflag for this database */ + unsigned char *mc_dbflag; + unsigned short mc_snum; /**< number of pushed pages */ + unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ +/** @defgroup mdb_cursor Cursor Flags + * @ingroup internal + * Cursor state flags. 
+ * @{ + */ +#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ +#define C_EOF 0x02 /**< No more data */ +#define C_SUB 0x04 /**< Cursor is a sub-cursor */ +#define C_DEL 0x08 /**< last op was a cursor_del */ +#define C_SPLITTING 0x20 /**< Cursor is in page_split */ +#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +/** @} */ + unsigned int mc_flags; /**< @ref mdb_cursor */ + MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ +}; + + /** Context for sorted-dup records. + * We could have gone to a fully recursive design, with arbitrarily + * deep nesting of sub-databases. But for now we only handle these + * levels - main DB, optional sub-DB, sorted-duplicate DB. + */ +typedef struct MDB_xcursor { + /** A sub-cursor for traversing the Dup DB */ + MDB_cursor mx_cursor; + /** The database record for this Dup DB */ + MDB_db mx_db; + /** The auxiliary DB record for this Dup DB */ + MDB_dbx mx_dbx; + /** The @ref mt_dbflag for this Dup DB */ + unsigned char mx_dbflag; +} MDB_xcursor; + + /** State of FreeDB old pages, stored in the MDB_env */ +typedef struct MDB_pgstate { + pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ +} MDB_pgstate; + + /** The database environment. */ +struct MDB_env { + HANDLE me_fd; /**< The main data file */ + HANDLE me_lfd; /**< The lock file */ + HANDLE me_mfd; /**< just for writing the meta pages */ + /** Failed to update the meta page. Probably an I/O error. */ +#define MDB_FATAL_ERROR 0x80000000U + /** Some fields are initialized. */ +#define MDB_ENV_ACTIVE 0x20000000U + /** me_txkey is set */ +#define MDB_ENV_TXKEY 0x10000000U + uint32_t me_flags; /**< @ref mdb_env */ + unsigned int me_psize; /**< DB page size, inited from me_os_psize */ + unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ + unsigned int me_maxreaders; /**< size of the reader table */ + unsigned int me_numreaders; /**< max numreaders set by this env */ + MDB_dbi me_numdbs; /**< number of DBs opened */ + MDB_dbi me_maxdbs; /**< size of the DB table */ + MDB_PID_T me_pid; /**< process ID of this env */ + char *me_path; /**< path to the DB files */ + char *me_map; /**< the memory map of the data file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ + MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ + void *me_pbuf; /**< scratch area for DUPSORT put() */ + MDB_txn *me_txn; /**< current write transaction */ + size_t me_mapsize; /**< size of the data memory map */ + off_t me_size; /**< current file size */ + pgno_t me_maxpg; /**< me_mapsize / me_psize */ + MDB_dbx *me_dbxs; /**< array of static DB info */ + uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ + unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */ + pthread_key_t me_txkey; /**< thread-key for readers */ + MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ +# define me_pglast me_pgstate.mf_pglast +# define me_pghead me_pgstate.mf_pghead + MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ + /** IDL of pages that became unused in a write txn */ + MDB_IDL me_free_pgs; + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ + MDB_ID2L me_dirty_list; + /** Max number of freelist items that can fit in a single overflow page */ + int me_maxfree_1pg; + /** Max size of a node on a page */ + unsigned int me_nodemax; +#if !(MDB_MAXKEYSIZE) + unsigned int me_maxkey; /**< max size of a key */ +#endif + int me_live_reader; /**< have liveness lock in reader table */ +#ifdef _WIN32 + int me_pidquery; /**< Used in OpenProcess */ + HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */ + HANDLE me_wmutex; +#elif defined(MDB_USE_POSIX_SEM) + sem_t *me_rmutex; /* Shared mutexes are not supported */ + sem_t *me_wmutex; +#endif + void *me_userctx; /**< User-settable context */ + MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ +}; + + /** Nested transaction */ +typedef struct MDB_ntxn { + MDB_txn mnt_txn; /**< the transaction */ + MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ +} MDB_ntxn; + + /** max number of pages to commit in one writev() call */ +#define MDB_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES +#undef MDB_COMMIT_PAGES +#define MDB_COMMIT_PAGES IOV_MAX +#endif + + /** max bytes to write in one call */ +#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) + + /** Check \b txn and \b dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi) \ + ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID)) + + /** Check for misused \b dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); +static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); +static int mdb_page_touch(MDB_cursor *mc); + +static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdb_page_search_root(MDB_cursor *mc, + MDB_val *key, int modify); +#define MDB_PS_MODIFY 1 +#define MDB_PS_ROOTONLY 2 +#define MDB_PS_FIRST 4 +#define MDB_PS_LAST 8 +static int mdb_page_search(MDB_cursor *mc, + MDB_val *key, int flags); +static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); + +#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ +static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, + pgno_t newpgno, unsigned int nflags); + +static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); +static int mdb_env_pick_meta(const MDB_env *env); +static int mdb_env_write_meta(MDB_txn *txn); +#if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) /* Drop unused excl arg */ +# define mdb_env_close0(env, excl) mdb_env_close1(env) +#endif +static void mdb_env_close0(MDB_env *env, int excl); + +static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static int mdb_node_add(MDB_cursor *mc, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); +static void mdb_node_del(MDB_cursor *mc, int ksize); +static void mdb_node_shrink(MDB_page *mp, indx_t indx); +static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst); +static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); +static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); +static size_t mdb_branch_size(MDB_env *env, MDB_val *key); + +static int mdb_rebalance(MDB_cursor *mc); +static int mdb_update_key(MDB_cursor *mc, MDB_val *key); + +static void mdb_cursor_pop(MDB_cursor *mc); +static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); + +static int mdb_cursor_del0(MDB_cursor *mc); +static int mdb_del0(MDB_txn 
*txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); +static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); +static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); +static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); +static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, + int *exactp); +static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); +static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); + +static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); +static void mdb_xcursor_init0(MDB_cursor *mc); +static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); + +static int mdb_drop0(MDB_cursor *mc, int subs); +static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); + +/** @cond */ +static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; +/** @endcond */ + +#ifdef _WIN32 +static SECURITY_DESCRIPTOR mdb_null_sd; +static SECURITY_ATTRIBUTES mdb_all_sa; +static int mdb_sec_inited; +#endif + +/** Return the library version info. */ +char * +mdb_version(int *major, int *minor, int *patch) +{ + if (major) *major = MDB_VERSION_MAJOR; + if (minor) *minor = MDB_VERSION_MINOR; + if (patch) *patch = MDB_VERSION_PATCH; + return MDB_VERSION_STRING; +} + +/** Table of descriptions for LMDB @ref errors */ +static char *const mdb_errstr[] = { + "MDB_KEYEXIST: Key/data pair already exists", + "MDB_NOTFOUND: No matching key/data pair found", + "MDB_PAGE_NOTFOUND: Requested page not found", + "MDB_CORRUPTED: Located page was wrong type", + "MDB_PANIC: Update of meta page failed", + "MDB_VERSION_MISMATCH: Database environment version mismatch", + "MDB_INVALID: File is not an LMDB file", + "MDB_MAP_FULL: Environment mapsize limit reached", + "MDB_DBS_FULL: Environment maxdbs limit reached", + "MDB_READERS_FULL: Environment maxreaders limit reached", + "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", + "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", + "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDB_BAD_TXN: Transaction cannot recover - it must be aborted", + "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", + "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", +}; + +char * +mdb_strerror(int err) +{ +#ifdef _WIN32 + /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. + * This works as long as no function between the call to mdb_strerror + * and the actual use of the message uses more than 4K of stack. + */ + char pad[4096]; + char buf[1024], *ptr = buf; +#endif + int i; + if (!err) + return ("Successful return: 0"); + + if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { + i = err - MDB_KEYEXIST; + return mdb_errstr[i]; + } + +#ifdef _WIN32 + /* These are the C-runtime error codes we use. The comment indicates + * their numeric value, and the Win32 error they would correspond to + * if the error actually came from a Win32 API. A major mess, we should + * have used LMDB-specific error codes for everything. 
+ */ + switch(err) { + case ENOENT: /* 2, FILE_NOT_FOUND */ + case EIO: /* 5, ACCESS_DENIED */ + case ENOMEM: /* 12, INVALID_ACCESS */ + case EACCES: /* 13, INVALID_DATA */ + case EBUSY: /* 16, CURRENT_DIRECTORY */ + case EINVAL: /* 22, BAD_COMMAND */ + case ENOSPC: /* 28, OUT_OF_PAPER */ + return strerror(err); + default: + ; + } + buf[0] = 0; + FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, 0, ptr, sizeof(buf), pad); + return ptr; +#else + return strerror(err); +#endif +} + +/** assert(3) variant in cursor context */ +#define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) +/** assert(3) variant in transaction context */ +#define mdb_tassert(mc, expr) mdb_assert0((txn)->mt_env, expr, #expr) +/** assert(3) variant in environment context */ +#define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) + +#ifndef NDEBUG +# define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ + mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) + +static void +mdb_assert_fail(MDB_env *env, const char *expr_txt, + const char *func, const char *file, int line) +{ + char buf[400]; + sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()", + file, line, expr_txt, func); + if (env->me_assert_func) + env->me_assert_func(env, buf); + fprintf(stderr, "%s\n", buf); + abort(); +} +#else +# define mdb_assert0(env, expr, expr_txt) ((void) 0) +#endif /* NDEBUG */ + +#if MDB_DEBUG +/** Return the page number of \b mp which may be sub-page, for debug output */ +static pgno_t +mdb_dbg_pgno(MDB_page *mp) +{ + pgno_t ret; + COPY_PGNO(ret, mp->mp_pgno); + return ret; +} + +/** Display a key in hexadecimal and return the address of the result. + * @param[in] key the key to display + * @param[in] buf the buffer to write into. Should always be #DKBUF. + * @return The key in hexadecimal form. + */ +char * +mdb_dkey(MDB_val *key, char *buf) +{ + char *ptr = buf; + unsigned char *c = key->mv_data; + unsigned int i; + + if (!key) + return ""; + + if (key->mv_size > DKBUF_MAXKEYSIZE) + return "MDB_MAXKEYSIZE"; + /* may want to make this a dynamic check: if the key is mostly + * printable characters, print it as-is instead of converting to hex. + */ +#if 1 + buf[0] = '\0'; + for (i=0; i<key->mv_size; i++) + ptr += sprintf(ptr, "%02x", *c++); +#else + sprintf(buf, "%.*s", key->mv_size, key->mv_data); +#endif + return buf; +} + +static const char * +mdb_leafnode_type(MDB_node *n) +{ + static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; + return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : + tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; +} + +/** Display all the keys in the page. */ +void +mdb_page_list(MDB_page *mp) +{ + pgno_t pgno = mdb_dbg_pgno(mp); + const char *type, *state = (mp->mp_flags & P_DIRTY) ?
", dirty" : ""; + MDB_node *node; + unsigned int i, nkeys, nsize, total = 0; + MDB_val key; + DKBUF; + + switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { + case P_BRANCH: type = "Branch page"; break; + case P_LEAF: type = "Leaf page"; break; + case P_LEAF|P_SUBP: type = "Sub-page"; break; + case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; + case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; + case P_OVERFLOW: + fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", + pgno, mp->mp_pages, state); + return; + case P_META: + fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", + pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); + return; + default: + fprintf(stderr, "Bad page %"Z"u flags 0x%u\n", pgno, mp->mp_flags); + return; + } + + nkeys = NUMKEYS(mp); + fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); + + for (i=0; imp_pad; + key.mv_data = LEAF2KEY(mp, i, nsize); + total += nsize; + fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); + continue; + } + node = NODEPTR(mp, i); + key.mv_size = node->mn_ksize; + key.mv_data = node->mn_data; + nsize = NODESIZE + key.mv_size; + if (IS_BRANCH(mp)) { + fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), + DKEY(&key)); + total += nsize; + } else { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += NODEDSZ(node); + total += nsize; + nsize += sizeof(indx_t); + fprintf(stderr, "key %d: nsize %d, %s%s\n", + i, nsize, DKEY(&key), mdb_leafnode_type(node)); + } + total = EVEN(total); + } + fprintf(stderr, "Total: header %d + contents %d + unused %d\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); +} + +void +mdb_cursor_chk(MDB_cursor *mc) +{ + unsigned int i; + MDB_node *node; + MDB_page *mp; + + if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return; + for (i=0; imc_top; i++) { + mp = mc->mc_pg[i]; + node = NODEPTR(mp, mc->mc_ki[i]); + if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) + printf("oops!\n"); + } + if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) + printf("ack!\n"); +} +#endif + +#if (MDB_DEBUG) > 2 +/** Count all the pages in each DB and in the freelist + * and make sure it matches the actual number of pages + * being used. + * All named DBs must be open for a correct count. 
+ */ +static void mdb_audit(MDB_txn *txn) +{ + MDB_cursor mc; + MDB_val key, data; + MDB_ID freecount, count; + MDB_dbi i; + int rc; + + freecount = 0; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + mdb_tassert(txn, rc == MDB_NOTFOUND); + + count = 0; + for (i = 0; i<txn->mt_numdbs; i++) { + MDB_xcursor mx; + if (!(txn->mt_dbflags[i] & DB_VALID)) + continue; + mdb_cursor_init(&mc, txn, i, &mx); + if (txn->mt_dbs[i].md_root == P_INVALID) + continue; + count += txn->mt_dbs[i].md_branch_pages + + txn->mt_dbs[i].md_leaf_pages + + txn->mt_dbs[i].md_overflow_pages; + if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); + for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { + unsigned j; + MDB_page *mp; + mp = mc.mc_pg[mc.mc_top]; + for (j=0; j<NUMKEYS(mp); j++) { + MDB_node *leaf = NODEPTR(mp, j); + if (leaf->mn_flags & F_SUBDATA) { + MDB_db db; + memcpy(&db, NODEDATA(leaf), sizeof(db)); + count += db.md_branch_pages + db.md_leaf_pages + + db.md_overflow_pages; + } + } + } + mdb_tassert(txn, rc == MDB_NOTFOUND); + } + } + if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) { + fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", + txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno); + } +} +#endif + +int +mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) +{ + return txn->mt_dbxs[dbi].md_cmp(a, b); +} + +int +mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) +{ + return txn->mt_dbxs[dbi].md_dcmp(a, b); +} + +/** Allocate memory for a page. + * Re-use old malloc'd pages first for singletons, otherwise just malloc. + */ +static MDB_page * +mdb_page_malloc(MDB_txn *txn, unsigned num) +{ + MDB_env *env = txn->mt_env; + MDB_page *ret = env->me_dpages; + size_t psize = env->me_psize, sz = psize, off; + /* For ! #MDB_NOMEMINIT, psize counts how much to init. + * For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. + */ + if (num == 1) { + if (ret) { + VGMEMP_ALLOC(env, ret, sz); + VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); + env->me_dpages = ret->mp_next; + return ret; + } + psize -= off = PAGEHDRSZ; + } else { + sz *= num; + off = sz - psize; + } + if ((ret = malloc(sz)) != NULL) { + VGMEMP_ALLOC(env, ret, sz); + if (!(env->me_flags & MDB_NOMEMINIT)) { + memset((char *)ret + off, 0, psize); + ret->mp_pad = 0; + } + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + return ret; +} +/** Free a single page. + * Saves single pages to a list, for future reuse. + * (This is not used for multi-page overflow pages.) + */ +static void +mdb_page_free(MDB_env *env, MDB_page *mp) +{ + mp->mp_next = env->me_dpages; + VGMEMP_FREE(env, mp); + env->me_dpages = mp; +} + +/** Free a dirty page */ +static void +mdb_dpage_free(MDB_env *env, MDB_page *dp) +{ + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdb_page_free(env, dp); + } else { + /* large pages just get freed directly */ + VGMEMP_FREE(env, dp); + free(dp); + } +} + +/** Return all dirty pages to dpage list */ +static void +mdb_dlist_free(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + mdb_dpage_free(env, dl[i].mptr); + } + dl[0].mid = 0; +} + +/** Loosen or free a single page.
+ * Saves single pages to a list for future reuse + * in this same txn. It has been pulled from the freeDB + * and already resides on the dirty list, but has been + * deleted. Use these pages first before pulling again + * from the freeDB. + * + * If the page wasn't dirtied in this txn, just add it + * to this txn's free list. + */ +static int +mdb_page_loose(MDB_cursor *mc, MDB_page *mp) +{ + int loose = 0; + pgno_t pgno = mp->mp_pgno; + MDB_txn *txn = mc->mc_txn; + + if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { + if (txn->mt_parent) { + MDB_ID2 *dl = txn->mt_u.dirty_list; + /* If txn has a parent, make sure the page is in our + * dirty list. + */ + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { /* bad cursor? */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + /* ok, it's ours */ + loose = 1; + } + } + } else { + /* no parent txn, so it's just ours */ + loose = 1; + } + } + if (loose) { + DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), + mp->mp_pgno)); + NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; + txn->mt_loose_pgs = mp; + txn->mt_loose_count++; + mp->mp_flags |= P_LOOSE; + } else { + int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); + if (rc) + return rc; + } + + return MDB_SUCCESS; +} + +/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. + * @param[in] mc A cursor handle for the current operation. + * @param[in] pflags Flags of the pages to update: + * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). + * @return 0 on success, non-zero on failure. + */ +static int +mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) +{ + enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m3; + MDB_xcursor *mx; + MDB_page *dp, *mp; + MDB_node *leaf; + unsigned i, j; + int rc = MDB_SUCCESS, level; + + /* Mark pages seen by cursors */ + if (mc->mc_flags & C_UNTRACK) + mc = NULL; /* will find mc in mt_cursors */ + for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { + for (; mc; mc=mc->mc_next) { + if (!(mc->mc_flags & C_INITIALIZED)) + continue; + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; + for (j=0; j<m3->mc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + } + mx = m3->mc_xcursor; + /* Proceed to mx if it is at a sub-database */ + if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (! (mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j-1]); + if (!(leaf->mn_flags & F_SUBDATA)) + break; + } + } + if (i == 0) + break; + } + + if (all) { + /* Mark dirty root pages */ + for (i=0; i<txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + break; + if ((dp->mp_flags & Mask) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } + } + } + + return rc; +} + +static int mdb_page_flush(MDB_txn *txn, int keep); + +/** Spill pages from the dirty list back to disk. + * This is intended to prevent running into #MDB_TXN_FULL situations, + * but note that they may still occur in a few cases: + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of #MDB_MULTIPLE items.
+ * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. TODO: we probably should do + * a preemptive spill during #mdb_txn_begin() of a child txn, if + * the parent's dirty_room is below a given threshold. + * + * Otherwise, if not using nested txns, it is expected that apps will + * not run into #MDB_TXN_FULL any more. The pages are flushed to disk + * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. + * If the txn never references them again, they can be left alone. + * If the txn only reads them, they can be used without any fuss. + * If the txn writes them again, they can be dirtied immediately without + * going thru all of the work of #mdb_page_touch(). Such references are + * handled by #mdb_page_unspill(). + * + * Also note, we never spill DB root pages, nor pages of active cursors, + * because we'll need these back again soon anyway. And in nested txns, + * we can't spill a page in a child txn if it was already spilled in a + * parent txn. That would alter the parent txns' data even though + * the child hasn't committed yet, and we'd have no way to undo it if + * the child aborted. + * + * @param[in] m0 cursor A cursor handle identifying the transaction and + * database for which we are checking space. + * @param[in] key For a put operation, the key being stored. + * @param[in] data For a put operation, the data being stored. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) +{ + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned int i, j, need; + int rc; + + if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; + + /* Estimate how much space this op will take */ + i = m0->mc_db->md_depth; + /* Named DBs also dirty the main DB */ + if (m0->mc_dbi > MAIN_DBI) + i += txn->mt_dbs[MAIN_DBI].md_depth; + /* For puts, roughly factor in the key+data size */ + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; /* double it for good measure */ + need = i; + + if (txn->mt_dirty_room > i) + return MDB_SUCCESS; + + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); + if (!txn->mt_spill_pgs) + return ENOMEM; + } else { + /* purge deleted slots */ + MDB_IDL sl = txn->mt_spill_pgs; + unsigned int num = sl[0]; + j=0; + for (i=1; i<=num; i++) { + if (!(sl[i] & 1)) + sl[++j] = sl[i]; + } + sl[0] = j; + } + + /* Preserve pages which may soon be dirtied again */ + if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) + goto done; + + /* Less aggressive spill - we originally spilled the entire dirty list, + * with a few exceptions for cursor pages and DB root pages. But this + * turns out to be a lot of wasted effort because in a large txn many + * of those pages will need to be used again. So now we spill only 1/8th + * of the dirty pages. Testing revealed this to be a good tradeoff, + * better than 1/2, 1/4, or 1/10. + */ + if (need < MDB_IDL_UM_MAX / 8) + need = MDB_IDL_UM_MAX / 8; + + /* Save the page IDs of all the pages we're flushing */ + /* flush from the tail forward, this saves a lot of shifting later on. */ + for (i=dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; + dp = dl[i].mptr; + if (dp->mp_flags & (P_LOOSE|P_KEEP)) + continue; + /* Can't spill twice, make sure it's not already in a parent's + * spill list. 
+ */ + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + j = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + if (tx2) + continue; + } + if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) + goto done; + need--; + } + mdb_midl_sort(txn->mt_spill_pgs); + + /* Flush the spilled part of dirty list */ + if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) + goto done; + + /* Reset any dirty pages we kept that page_flush didn't see */ + rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); + +done: + txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + return rc; +} + +/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ +static txnid_t +mdb_find_oldest(MDB_txn *txn) +{ + int i; + txnid_t mr, oldest = txn->mt_txnid - 1; + if (txn->mt_env->me_txns) { + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } + } + } + return oldest; +} + +/** Add a page to the txn's dirty list */ +static void +mdb_page_dirty(MDB_txn *txn, MDB_page *mp) +{ + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + + if (txn->mt_env->me_flags & MDB_WRITEMAP) { + insert = mdb_mid2l_append; + } else { + insert = mdb_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdb_tassert(txn, rc == 0); + txn->mt_dirty_room--; +} + +/** Allocate page numbers and memory for writing. Maintain me_pglast, + * me_pghead and mt_next_pgno. + * + * If there are free pages available from older transactions, they + * are re-used first. Otherwise allocate a new page at mt_next_pgno. + * Do not modify the freedB, just merge freeDB records into me_pghead[] + * and move me_pglast to say which records were consumed. Only this + * function can create me_pghead and move me_pglast/mt_next_pgno. + * @param[in] mc cursor A cursor handle identifying the transaction and + * database for which we are allocating. + * @param[in] num the number of pages to allocate. + * @param[out] mp Address of the allocated page(s). Requests for multiple pages + * will always be satisfied by a single contiguous chunk of memory. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) +{ +#ifdef MDB_PARANOID /* Seems like we can ignore this now */ + /* Get at most <Max_retries> more freeDB records once me_pghead + * has enough pages. If not enough, use new pages from the map. + * If <Paranoid> and mc is updating the freeDB, only get new + * records if me_pghead is empty. Then the freelist cannot play + * catch-up with itself by growing while trying to save it. + */ + enum { Paranoid = 1, Max_retries = 500 }; +#else + enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; +#endif + int rc, retry = num * 20; + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno, *mop = env->me_pghead; + unsigned i, j, mop_len = mop ?
mop[0] : 0, n2 = num-1; + MDB_page *np; + txnid_t oldest = 0, last; + MDB_cursor_op op; + MDB_cursor m2; + + /* If there are any loose pages, just use them */ + if (num == 1 && txn->mt_loose_pgs) { + np = txn->mt_loose_pgs; + txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); + txn->mt_loose_count--; + DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), + np->mp_pgno)); + *mp = np; + return MDB_SUCCESS; + } + + *mp = NULL; + + /* If our dirty list is already full, we can't do anything */ + if (txn->mt_dirty_room == 0) { + rc = MDB_TXN_FULL; + goto fail; + } + + for (op = MDB_FIRST;; op = MDB_NEXT) { + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl; + + /* Seek a big enough contiguous page range. Prefer + * pages at the tail, just truncating the list. + */ + if (mop_len > n2) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i-n2] == pgno+n2) + goto search_done; + } while (--i > n2); + if (--retry < 0) + break; + } + + if (op == MDB_FIRST) { /* 1st iteration */ + /* Prepare to fetch more and coalesce */ + oldest = mdb_find_oldest(txn); + last = env->me_pglast; + mdb_cursor_init(&m2, txn, FREE_DBI, NULL); + if (last) { + op = MDB_SET_RANGE; + key.mv_data = &last; /* will look up last+1 */ + key.mv_size = sizeof(last); + } + if (Paranoid && mc->mc_dbi == FREE_DBI) + retry = -1; + } + if (Paranoid && retry < 0 && mop_len) + break; + + last++; + /* Do not fetch more if the record will be too recent */ + if (oldest <= last) + break; + rc = mdb_cursor_get(&m2, &key, NULL, op); + if (rc) { + if (rc == MDB_NOTFOUND) + break; + goto fail; + } + last = *(txnid_t*)key.mv_data; + if (oldest <= last) + break; + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) + return rc; + + idl = (MDB_ID *) data.mv_data; + i = idl[0]; + if (!mop) { + if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { + rc = ENOMEM; + goto fail; + } + } else { + if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) + goto fail; + mop = env->me_pghead; + } + env->me_pglast = last; +#if (MDB_DEBUG) > 1 + DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", + last, txn->mt_dbs[FREE_DBI].md_root, i)); + for (j = i; j; j--) + DPRINTF(("IDL %"Z"u", idl[j])); +#endif + /* Merge in descending sorted order */ + mdb_midl_xmerge(mop, idl); + mop_len = mop[0]; + } + + /* Use new pages from the map when nothing suitable in the freeDB */ + i = 0; + pgno = txn->mt_next_pgno; + if (pgno + num >= env->me_maxpg) { + DPUTS("DB size maxed out"); + rc = MDB_MAP_FULL; + goto fail; + } + +search_done: + if (env->me_flags & MDB_WRITEMAP) { + np = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + if (!(np = mdb_page_malloc(txn, num))) { + rc = ENOMEM; + goto fail; + } + } + if (i) { + mop[0] = mop_len -= num; + /* Move any stragglers down */ + for (j = i-num; j < mop_len; ) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; + } + np->mp_pgno = pgno; + mdb_page_dirty(txn, np); + *mp = np; + + return MDB_SUCCESS; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Copy the used portions of a non-overflow page. + * @param[in] dst page to copy into + * @param[in] src page to copy from + * @param[in] psize size of a page + */ +static void +mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) +{ + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; + + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. 
+ */ + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper = (upper + PAGEBASE) & -Align; + memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); + memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), + psize - upper); + } else { + memcpy(dst, src, psize - unused); + } +} + +/** Pull a page off the txn's spill list, if present. + * If a page being referenced was spilled to disk in this txn, bring + * it back and make it dirty/writable again. + * @param[in] txn the transaction handle. + * @param[in] mp the page being referenced. It must not be dirty. + * @param[out] ret the writable page, if any. ret is unchanged if + * mp wasn't spilled. + */ +static int +mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) +{ + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + unsigned x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; + + for (tx2 = txn; tx2; tx2=tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; + else + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdb_page_malloc(txn, num); + if (!np) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdb_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + /* If in current txn, this page is no longer spilled. + * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. + */ + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else + txn->mt_spill_pgs[x] |= 1; + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits + */ + + mdb_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; + } + } + return MDB_SUCCESS; +} + +/** Touch a page: make it dirty and re-insert into tree with updated pgno. + * @param[in] mc cursor pointing to the page to be touched + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_touch(MDB_cursor *mc) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m2, *m3; + pgno_t pgno; + int rc; + + if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + if (txn->mt_flags & MDB_TXN_SPILLS) { + np = NULL; + rc = mdb_page_unspill(txn, mp, &np); + if (rc) + goto fail; + if (np) + goto done; + } + if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || + (rc = mdb_page_alloc(mc, 1, &np))) + goto fail; + pgno = np->mp_pgno; + DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), + mp->mp_pgno, pgno)); + mdb_cassert(mc, mp->mp_pgno != pgno); + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + /* Update the parent page, if any, to point to the new page */ + if (mc->mc_top) { + MDB_page *parent = mc->mc_pg[mc->mc_top-1]; + MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); + SETPGNO(node, pgno); + } else { + mc->mc_db->md_root = pgno; + } + } else if (txn->mt_parent && !IS_SUBP(mp)) { + MDB_ID2 mid, *dl = txn->mt_u.dirty_list; + pgno = mp->mp_pgno; + /* If txn has a parent, make sure the page is in our + * dirty list. + */ + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { /* bad cursor? 
*/ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + return 0; + } + } + mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); + /* No - copy it */ + np = mdb_page_malloc(txn, 1); + if (!np) + return ENOMEM; + mid.mid = pgno; + mid.mptr = np; + rc = mdb_mid2l_insert(dl, &mid); + mdb_cassert(mc, rc == 0); + } else { + return 0; + } + + mdb_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_flags |= P_DIRTY; + +done: + /* Adjust cursors pointing to mp */ + mc->mc_pg[mc->mc_top] = np; + m2 = txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_SUB) { + for (; m2; m2=m2->mc_next) { + m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (; m2; m2=m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + IS_LEAF(np) && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + { + MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); + if (!(leaf->mn_flags & F_SUBDATA)) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + } + } + } + return 0; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_env_sync(MDB_env *env, int force) +{ + int rc = 0; + if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { + if (env->me_flags & MDB_WRITEMAP) { + int flags = ((env->me_flags & MDB_MAPASYNC) && !force) + ? MS_ASYNC : MS_SYNC; + if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) + rc = ErrCode(); +#ifdef _WIN32 + else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); +#endif + } else { + if (MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); + } + } + return rc; +} + +/** Back up parent txn's cursors, then grab the originals for tracking */ +static int +mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) +{ + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; + + for (i = src->mt_numdbs; --i >= 0; ) { + if ((mc = src->mt_cursors[i]) != NULL) { + size = sizeof(MDB_cursor); + if (mc->mc_xcursor) + size += sizeof(MDB_xcursor); + for (; mc; mc = bk->mc_next) { + bk = malloc(size); + if (!bk) + return ENOMEM; + *bk = *mc; + mc->mc_backup = bk; + mc->mc_db = &dst->mt_dbs[i]; + /* Kill pointers into src - and dst to reduce abuse: The + * user may not use mc until dst ends. Otherwise we'd... + */ + mc->mc_txn = NULL; /* ...set this to dst */ + mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */ + if ((mx = mc->mc_xcursor) != NULL) { + *(MDB_xcursor *)(bk+1) = *mx; + mx->mx_cursor.mc_txn = NULL; /* ...and dst. */ + } + mc->mc_next = dst->mt_cursors[i]; + dst->mt_cursors[i] = mc; + } + } + } + return MDB_SUCCESS; +} + +/** Close this write txn's cursors, give parent txn's cursors back to parent. + * @param[in] txn the transaction handle. + * @param[in] merge true to keep changes to parent cursors, false to revert. + * @return 0 on success, non-zero on failure. 
+ */ +static void +mdb_cursors_close(MDB_txn *txn, unsigned merge) +{ + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; + + for (i = txn->mt_numdbs; --i >= 0; ) { + for (mc = cursors[i]; mc; mc = next) { + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + /* Commit changes to parent txn */ + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + /* Abort nested txn */ + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk+1); + } + mc = bk; + } + /* Only malloced cursors are permanently tracked. */ + free(mc); + } + cursors[i] = NULL; + } +} + +#if !(MDB_DEBUG) +#define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn) +#endif +static void +mdb_txn_reset0(MDB_txn *txn, const char *act); + +#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ +enum Pidlock_op { + Pidset, Pidcheck +}; +#else +enum Pidlock_op { + Pidset = F_SETLK, Pidcheck = F_GETLK +}; +#endif + +/** Set or check a pid lock. Set returns 0 on success. + * Check returns 0 if the process is certainly dead, nonzero if it may + * be alive (the lock exists or an error happened so we do not know). + * + * On Windows Pidset is a no-op, we merely check for the existence + * of the process with the given pid. On POSIX we use a single byte + * lock on the lockfile, set at an offset equal to the pid. + */ +static int +mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) +{ +#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ + int ret = 0; + HANDLE h; + if (op == Pidcheck) { + h = OpenProcess(env->me_pidquery, FALSE, pid); + /* No documented "no such process" code, but other program use this: */ + if (!h) + return ErrCode() != ERROR_INVALID_PARAMETER; + /* A process exists until all handles to it close. Has it exited? */ + ret = WaitForSingleObject(h, 0) != 0; + CloseHandle(h); + } + return ret; +#else + for (;;) { + int rc; + struct flock lock_info; + memset(&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = pid; + lock_info.l_len = 1; + if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { + if (op == F_GETLK && lock_info.l_type != F_UNLCK) + rc = -1; + } else if ((rc = ErrCode()) == EINTR) { + continue; + } + return rc; + } +#endif +} + +/** Common code for #mdb_txn_begin() and #mdb_txn_renew(). + * @param[in] txn the transaction handle to initialize + * @return 0 on success, non-zero on failure. + */ +static int +mdb_txn_renew0(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_txninfo *ti = env->me_txns; + MDB_meta *meta; + unsigned int i, nr; + uint16_t x; + int rc, new_notls = 0; + + /* Setup db info */ + txn->mt_numdbs = env->me_numdbs; + txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + + if (txn->mt_flags & MDB_TXN_RDONLY) { + if (!ti) { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + txn->mt_u.reader = NULL; + } else { + MDB_reader *r = (env->me_flags & MDB_NOTLS) ? 
txn->mt_u.reader : + pthread_getspecific(env->me_txkey); + if (r) { + if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) + return MDB_BAD_RSLOT; + } else { + MDB_PID_T pid = env->me_pid; + MDB_THR_T tid = pthread_self(); + + if (!env->me_live_reader) { + rc = mdb_reader_pid(env, Pidset, pid); + if (rc) + return rc; + env->me_live_reader = 1; + } + + LOCK_MUTEX_R(env); + nr = ti->mti_numreaders; + for (i=0; i<nr; i++) + if (ti->mti_readers[i].mr_pid == 0) + break; + if (i == env->me_maxreaders) { + UNLOCK_MUTEX_R(env); + return MDB_READERS_FULL; + } + ti->mti_readers[i].mr_pid = pid; + ti->mti_readers[i].mr_tid = tid; + if (i == nr) + ti->mti_numreaders = ++nr; + /* Save numreaders for un-mutexed mdb_env_close() */ + env->me_numreaders = nr; + UNLOCK_MUTEX_R(env); + + r = &ti->mti_readers[i]; + new_notls = (env->me_flags & MDB_NOTLS); + if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { + r->mr_pid = 0; + return rc; + } + } + txn->mt_txnid = r->mr_txnid = ti->mti_txnid; + txn->mt_u.reader = r; + meta = env->me_metas[txn->mt_txnid & 1]; + } + } else { + if (ti) { + LOCK_MUTEX_W(env); + + txn->mt_txnid = ti->mti_txnid; + meta = env->me_metas[txn->mt_txnid & 1]; + } else { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + } + txn->mt_txnid++; +#if MDB_DEBUG + if (txn->mt_txnid == mdb_debug_start) + mdb_debug = 1; +#endif + txn->mt_dirty_room = MDB_IDL_UM_MAX; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_free_pgs = env->me_free_pgs; + txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; + env->me_txn = txn; + memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int)); + } + + /* Copy the DB info and flags */ + memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); + + /* Moved to here to avoid a data race in read TXNs */ + txn->mt_next_pgno = meta->mm_last_pg+1; + + for (i=2; i<txn->mt_numdbs; i++) { + x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; + } + txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; + + if (env->me_maxpg < txn->mt_next_pgno) { + mdb_txn_reset0(txn, "renew0-mapfail"); + if (new_notls) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } + return MDB_MAP_RESIZED; + } + + return MDB_SUCCESS; +} + +int +mdb_txn_renew(MDB_txn *txn) +{ + int rc; + + if (!txn || txn->mt_dbxs) /* A reset txn has mt_dbxs==NULL */ + return EINVAL; + + if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } + + rc = mdb_txn_renew0(txn); + if (rc == MDB_SUCCESS) { + DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); + } + return rc; +} + +int +mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) +{ + MDB_txn *txn; + MDB_ntxn *ntxn; + int rc, size, tsize = sizeof(MDB_txn); + + if (env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } + if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY)) + return EACCES; + if (parent) { + /* Nested transactions: Max 1 child, write txns only, no writemap */ + if (parent->mt_child || + (flags & MDB_RDONLY) || + (parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) || + (env->me_flags & MDB_WRITEMAP)) + { + return (parent->mt_flags & MDB_TXN_RDONLY) ? 
EINVAL : MDB_BAD_TXN; + } + tsize = sizeof(MDB_ntxn); + } + size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1); + if (!(flags & MDB_RDONLY)) { + size += env->me_maxdbs * sizeof(MDB_cursor *); + /* child txns use parent's dbiseqs */ + if (!parent) + size += env->me_maxdbs * sizeof(unsigned int); + } + + if ((txn = calloc(1, size)) == NULL) { + DPRINTF(("calloc: %s", strerror(errno))); + return ENOMEM; + } + txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); + if (flags & MDB_RDONLY) { + txn->mt_flags |= MDB_TXN_RDONLY; + txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = env->me_dbiseqs; + } else { + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + if (parent) { + txn->mt_dbiseqs = parent->mt_dbiseqs; + txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs); + } else { + txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + } + } + txn->mt_env = env; + + if (parent) { + unsigned int i; + txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); + if (!txn->mt_u.dirty_list || + !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) + { + free(txn->mt_u.dirty_list); + free(txn); + return ENOMEM; + } + txn->mt_txnid = parent->mt_txnid; + txn->mt_dirty_room = parent->mt_dirty_room; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_spill_pgs = NULL; + txn->mt_next_pgno = parent->mt_next_pgno; + parent->mt_child = txn; + txn->mt_parent = parent; + txn->mt_numdbs = parent->mt_numdbs; + txn->mt_flags = parent->mt_flags; + txn->mt_dbxs = parent->mt_dbxs; + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + /* Copy parent's mt_dbflags, but clear DB_NEW */ + for (i=0; i<txn->mt_numdbs; i++) + txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; + rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); + if (env->me_pghead) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + if (!rc) + rc = mdb_cursor_shadow(parent, txn); + if (rc) + mdb_txn_reset0(txn, "beginchild-fail"); + } else { + rc = mdb_txn_renew0(txn); + } + if (rc) + free(txn); + else { + *ret = txn; + DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); + } + + return rc; +} + +MDB_env * +mdb_txn_env(MDB_txn *txn) +{ + if(!txn) return NULL; + return txn->mt_env; +} + +/** Export or close DBI handles opened in this txn. */ +static void +mdb_dbis_update(MDB_txn *txn, int keep) +{ + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; + + for (i = n; --i >= 2;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.mv_data; + if (ptr) { + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + env->me_dbiseqs[i]++; + free(ptr); + } + } + } + } + if (keep && env->me_numdbs < n) + env->me_numdbs = n; +} + +/** Common code for #mdb_txn_reset() and #mdb_txn_abort(). + * May be called twice for readonly txns: First reset it, then abort. 
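+ *
+ * Illustrative caller-side sketch (not part of this file) of the
+ * reset-then-abort lifecycle of a read-only txn; error checks omitted:
+ *
+ *	MDB_txn *rtxn;
+ *	mdb_txn_begin(env, NULL, MDB_RDONLY, &rtxn);
+ *	// ... reads ...
+ *	mdb_txn_reset(rtxn);   // drop the snapshot, keep the handle and reader slot
+ *	mdb_txn_renew(rtxn);   // re-arm the handle on a fresh snapshot
+ *	// ... more reads ...
+ *	mdb_txn_abort(rtxn);   // second pass through this common code, then free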
+ * @param[in] txn the transaction handle to reset + * @param[in] act why the transaction is being reset + */ +static void +mdb_txn_reset0(MDB_txn *txn, const char *act) +{ + MDB_env *env = txn->mt_env; + + /* Close any DBI handles opened in this txn */ + mdb_dbis_update(txn, 0); + + DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (txn->mt_u.reader) { + txn->mt_u.reader->mr_txnid = (txnid_t)-1; + if (!(env->me_flags & MDB_NOTLS)) + txn->mt_u.reader = NULL; /* txn does not own reader */ + } + txn->mt_numdbs = 0; /* close nothing if called again */ + txn->mt_dbxs = NULL; /* mark txn as reset */ + } else { + mdb_cursors_close(txn, 0); + + if (!(env->me_flags & MDB_WRITEMAP)) { + mdb_dlist_free(txn); + } + mdb_midl_free(env->me_pghead); + + if (txn->mt_parent) { + txn->mt_parent->mt_child = NULL; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + mdb_midl_free(txn->mt_free_pgs); + mdb_midl_free(txn->mt_spill_pgs); + free(txn->mt_u.dirty_list); + return; + } + + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; + env->me_pghead = NULL; + env->me_pglast = 0; + + env->me_txn = NULL; + /* The writer mutex was locked in mdb_txn_begin. */ + if (env->me_txns) + UNLOCK_MUTEX_W(env); + } +} + +void +mdb_txn_reset(MDB_txn *txn) +{ + if (txn == NULL) + return; + + /* This call is only valid for read-only txns */ + if (!(txn->mt_flags & MDB_TXN_RDONLY)) + return; + + mdb_txn_reset0(txn, "reset"); +} + +void +mdb_txn_abort(MDB_txn *txn) +{ + if (txn == NULL) + return; + + if (txn->mt_child) + mdb_txn_abort(txn->mt_child); + + mdb_txn_reset0(txn, "abort"); + /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */ + if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) + txn->mt_u.reader->mr_pid = 0; + + free(txn); +} + +/** Save the freelist as of this transaction to the freeDB. + * This changes the freelist. Keep trying until it stabilizes. + */ +static int +mdb_freelist_save(MDB_txn *txn) +{ + /* env->me_pghead[] can grow and shrink during this call. + * env->me_pglast and txn->mt_free_pgs[] can only grow. + * Page numbers cannot disappear from txn->mt_free_pgs[]. + */ + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; + + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + + if (env->me_pghead) { + /* Make sure first page of freeDB is touched and on freelist */ + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + if (!env->me_pghead && txn->mt_loose_pgs) { + /* Put loose page numbers in mt_free_pgs, since + * we may be unable to return them to me_pghead. + */ + MDB_page *mp = txn->mt_loose_pgs; + if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) + return rc; + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + } + + /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ + clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) + ? 
SSIZE_MAX : maxfree_1pg; + + for (;;) { + /* Come back here after each Put() in case freelist changed */ + MDB_val key, data; + pgno_t *pgs; + ssize_t j; + + /* If using records from freeDB which we have not yet + * deleted, delete them and any we reserved for me_pghead. + */ + while (pglast < env->me_pglast) { + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + return rc; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + mdb_tassert(txn, pglast <= env->me_pglast); + rc = mdb_cursor_del(&mc, 0); + if (rc) + return rc; + } + + /* Save the IDL of pages freed by this txn, to a single record */ + if (freecnt < txn->mt_free_pgs[0]) { + if (!freecnt) { + /* Make sure last page of freeDB is touched and on freelist */ + rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + free_pgs = txn->mt_free_pgs; + /* Write to last page of freeDB */ + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + /* Retry if mt_free_pgs[] grew during the Put() */ + free_pgs = txn->mt_free_pgs; + } while (freecnt < free_pgs[0]); + mdb_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); +#if (MDB_DEBUG) > 1 + { + unsigned int i = free_pgs[0]; + DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); + for (; i; i--) + DPRINTF(("IDL %"Z"u", free_pgs[i])); + } +#endif + continue; + } + + mop = env->me_pghead; + mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; + + /* Reserve records for me_pghead[]. Split it if multi-page, + * to avoid searching freeDB for a page range. Use keys in + * range [1,me_pglast]: Smaller than txnid of oldest reader. + */ + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + /* Keep current record (overflow page), add a new one */ + head_id--; + head_room = 0; + } + /* (Re)write {key = head_id, IDL length = head_room} */ + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + /* Overflow multi-page for part of me_pghead */ + head_room /= head_id; /* amortize page sizes */ + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + /* Rare case, not bothering to delete this record */ + head_room = 0; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + /* IDL is initially empty, zero out at least the length */ + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); + total_room += head_room; + } + + /* Return loose page numbers to me_pghead, though usually none are + * left at this point. The pages themselves remain in dirty_list. 
+ */ + if (txn->mt_loose_pgs) { + MDB_page *mp = txn->mt_loose_pgs; + unsigned count = txn->mt_loose_count; + MDB_IDL loose; + /* Room for loose pages + temp IDL with same */ + if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) + return rc; + mop = env->me_pghead; + loose = mop + MDB_IDL_ALLOCLEN(mop) - count; + for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) + loose[ ++count ] = mp->mp_pgno; + loose[0] = count; + mdb_midl_sort(loose); + mdb_midl_xmerge(mop, loose); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + mop_len = mop[0]; + } + + /* Fill in the reserved me_pghead records */ + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; + + mop += mop_len; + rc = mdb_cursor_first(&mc, &key, &data); + for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { + txnid_t id = *(txnid_t *)key.mv_data; + ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + MDB_ID save; + + mdb_tassert(txn, len >= 0 && id <= env->me_pglast); + key.mv_data = &id; + if (len > mop_len) { + len = mop_len; + data.mv_size = (len + 1) * sizeof(MDB_ID); + } + data.mv_data = mop -= len; + save = mop[0]; + mop[0] = len; + rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); + mop[0] = save; + if (rc || !(mop_len -= len)) + break; + } + } + return rc; +} + +/** Flush (some) dirty pages to the map, after clearing their dirty flag. + * @param[in] txn the transaction that's being committed + * @param[in] keep number of initial pages in dirty_list to keep dirty. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_flush(MDB_txn *txn, int keep) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize, j; + int i, pagecount = dl[0].mid, rc; + size_t size = 0, pos = 0; + pgno_t pgno = 0; + MDB_page *dp = NULL; +#ifdef _WIN32 + OVERLAPPED ov; +#else + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos = 0, wsize = 0, wres; + size_t next_pos = 1; /* impossible pos, so pos != next_pos */ + int n = 0; +#endif + + j = i = keep; + + if (env->me_flags & MDB_WRITEMAP) { + /* Clear dirty flags */ + while (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE|P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[++j] = dl[i]; + continue; + } + dp->mp_flags &= ~P_DIRTY; + } + goto done; + } + + /* Write the pages */ + for (;;) { + if (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE|P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[i].mid = 0; + continue; + } + pgno = dl[i].mid; + /* clear dirty flag */ + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) size *= dp->mp_pages; + } +#ifdef _WIN32 + else break; + + /* Windows actually supports scatter/gather I/O, but only on + * unbuffered file handles. Since we're relying on the OS page + * cache for all our data, that's self-defeating. So we just + * write pages one at a time. We use the ov structure to set + * the write offset, to at least save the overhead of a Seek + * system call. + */ + DPRINTF(("committing page %"Z"u", pgno)); + memset(&ov, 0, sizeof(ov)); + ov.Offset = pos & 0xffffffff; + ov.OffsetHigh = pos >> 16 >> 16; + if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { + rc = ErrCode(); + DPRINTF(("WriteFile: %d", rc)); + return rc; + } +#else + /* Write up to MDB_COMMIT_PAGES dirty pages at a time. 
*/ + if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { + if (n) { + /* Write previous page(s) */ +#ifdef MDB_USE_PWRITEV + wres = pwritev(env->me_fd, iov, n, wpos); +#else + if (n == 1) { + wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); + } else { + if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { + rc = ErrCode(); + DPRINTF(("lseek: %s", strerror(rc))); + return rc; + } + wres = writev(env->me_fd, iov, n); + } +#endif + if (wres != wsize) { + if (wres < 0) { + rc = ErrCode(); + DPRINTF(("Write error: %s", strerror(rc))); + } else { + rc = EIO; /* TODO: Use which error code? */ + DPUTS("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; + wpos = pos; + wsize = 0; + } + DPRINTF(("committing page %"Z"u", pgno)); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; +#endif /* _WIN32 */ + } + + /* MIPS has cache coherency issues, this is a no-op everywhere else + * Note: for any size >= on-chip cache size, entire on-chip cache is + * flushed. + */ + CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); + + for (i = keep; ++i <= pagecount; ) { + dp = dl[i].mptr; + /* This is a page we skipped above */ + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; + } + mdb_dpage_free(env, dp); + } + +done: + i--; + txn->mt_dirty_room += i - j; + dl[0].mid = j; + return MDB_SUCCESS; +} + +int +mdb_txn_commit(MDB_txn *txn) +{ + int rc; + unsigned int i; + MDB_env *env; + + if (txn == NULL || txn->mt_env == NULL) + return EINVAL; + + if (txn->mt_child) { + rc = mdb_txn_commit(txn->mt_child); + txn->mt_child = NULL; + if (rc) + goto fail; + } + + env = txn->mt_env; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + mdb_dbis_update(txn, 1); + txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */ + mdb_txn_abort(txn); + return MDB_SUCCESS; + } + + if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("error flag is set, can't commit"); + if (txn->mt_parent) + txn->mt_parent->mt_flags |= MDB_TXN_ERROR; + rc = MDB_BAD_TXN; + goto fail; + } + + if (txn->mt_parent) { + MDB_txn *parent = txn->mt_parent; + MDB_page **lp; + MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; + + /* Append our free list to parent's */ + rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + if (rc) + goto fail; + mdb_midl_free(txn->mt_free_pgs); + /* Failures after this must either undo the changes + * to the parent or set MDB_TXN_ERROR in the parent. + */ + + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + + /* Merge our cursors into parent's and close them */ + mdb_cursors_close(txn, 1); + + /* Update parent's DB table. 
*/ + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[0] = txn->mt_dbflags[0]; + parent->mt_dbflags[1] = txn->mt_dbflags[1]; + for (i=2; i<txn->mt_numdbs; i++) { + /* preserve parent's DB_NEW status */ + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + + dst = parent->mt_u.dirty_list; + src = txn->mt_u.dirty_list; + /* Remove anything in our dirty list from parent's spill list */ + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + /* Mark our dirty pages as deleted in parent spill list */ + for (i=0, len=src[0].mid; ++i <= len; ) { + MDB_ID pn = src[i].mid << 1; + while (pn > pspill[x]) + x--; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; + } + } + /* Squash deleted pagenums if we deleted any */ + for (x=y; ++x <= ps_len; ) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; + } + + /* Find len = length of merging our dirty list with parent's */ + x = dst[0].mid; + dst[0].mid = 0; /* simplify loops */ + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } + } + } else { /* Simplify the above for single-ancestor case */ + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + } + /* Merge our dirty list with parent's */ + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + mdb_tassert(txn, i == x); + dst[0].mid = len; + free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + /* TODO: Prevent failure here, so parent does not fail */ + rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (rc) + parent->mt_flags |= MDB_TXN_ERROR; + mdb_midl_free(txn->mt_spill_pgs); + mdb_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } + + /* Append our loose page list to parent's */ + for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) + ; + *lp = txn->mt_loose_pgs; + parent->mt_loose_count += txn->mt_loose_count; + + parent->mt_child = NULL; + mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + free(txn); + return rc; + } + + if (txn != env->me_txn) { + DPUTS("attempt to commit unknown transaction"); + rc = EINVAL; + goto fail; + } + + mdb_cursors_close(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && + !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) + goto done; + + DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); + + /* Update DB root pointers */ + if (txn->mt_numdbs > 2) { + MDB_cursor mc; + MDB_dbi i; + MDB_val data; + data.mv_size = sizeof(MDB_db); + + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + for (i = 2; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + if (TXN_DBI_CHANGED(txn, i)) { + rc = MDB_BAD_DBI; + goto fail; + } + data.mv_data = &txn->mt_dbs[i]; + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + if (rc) + goto fail; + } + } + } + + rc = mdb_freelist_save(txn); + if (rc) + goto fail; + + mdb_midl_free(env->me_pghead); + env->me_pghead = NULL; + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = 
txn->mt_free_pgs; + +#if (MDB_DEBUG) > 2 + mdb_audit(txn); +#endif + + if ((rc = mdb_page_flush(txn, 0)) || + (rc = mdb_env_sync(env, 0)) || + (rc = mdb_env_write_meta(txn))) + goto fail; + + /* Free P_LOOSE pages left behind in dirty_list */ + if (!(env->me_flags & MDB_WRITEMAP)) + mdb_dlist_free(txn); + +done: + env->me_pglast = 0; + env->me_txn = NULL; + mdb_dbis_update(txn, 1); + + if (env->me_txns) + UNLOCK_MUTEX_W(env); + free(txn); + + return MDB_SUCCESS; + +fail: + mdb_txn_abort(txn); + return rc; +} + +/** Read the environment parameters of a DB environment before + * mapping it into memory. + * @param[in] env the environment handle + * @param[out] meta address of where to store the meta information + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_env_read_header(MDB_env *env, MDB_meta *meta) +{ + MDB_metabuf pbuf; + MDB_page *p; + MDB_meta *m; + int i, rc, off; + enum { Size = sizeof(pbuf) }; + + /* We don't know the page size yet, so use a minimum value. + * Read both meta pages so we can use the latest one. + */ + + for (i=off=0; i<2; i++, off = meta->mm_psize) { +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; + if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) + rc = 0; +#else + rc = pread(env->me_fd, &pbuf, Size, off); +#endif + if (rc != Size) { + if (rc == 0 && off == 0) + return ENOENT; + rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; + DPRINTF(("read: %s", mdb_strerror(rc))); + return rc; + } + + p = (MDB_page *)&pbuf; + + if (!F_ISSET(p->mp_flags, P_META)) { + DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); + return MDB_INVALID; + } + + m = METADATA(p); + if (m->mm_magic != MDB_MAGIC) { + DPUTS("meta has invalid magic"); + return MDB_INVALID; + } + + if (m->mm_version != MDB_DATA_VERSION) { + DPRINTF(("database is version %u, expected version %u", + m->mm_version, MDB_DATA_VERSION)); + return MDB_VERSION_MISMATCH; + } + + if (off == 0 || m->mm_txnid > meta->mm_txnid) + *meta = *m; + } + return 0; +} + +static void ESECT +mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) +{ + meta->mm_magic = MDB_MAGIC; + meta->mm_version = MDB_DATA_VERSION; + meta->mm_mapsize = env->me_mapsize; + meta->mm_psize = env->me_psize; + meta->mm_last_pg = 1; + meta->mm_flags = env->me_flags & 0xffff; + meta->mm_flags |= MDB_INTEGERKEY; + meta->mm_dbs[0].md_root = P_INVALID; + meta->mm_dbs[1].md_root = P_INVALID; +} + +/** Write the environment parameters of a freshly created DB environment. + * @param[in] env the environment handle + * @param[out] meta address of where to store the meta information + * @return 0 on success, non-zero on failure. 
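+ *
+ * Illustrative sketch (not part of this file): a fresh environment gets two
+ * identical copies of *meta, one at page 0 and one at page 1. Later opens
+ * read both headers and keep whichever copy carries the higher mm_txnid,
+ * matching mdb_env_pick_meta() below:
+ *
+ *	MDB_meta *m0 = env->me_metas[0], *m1 = env->me_metas[1];
+ *	MDB_meta *newest = (m0->mm_txnid < m1->mm_txnid) ? m1 : m0;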
+ */ +static int ESECT +mdb_env_init_meta(MDB_env *env, MDB_meta *meta) +{ + MDB_page *p, *q; + int rc; + unsigned int psize; +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + ov.Offset = pos; \ + rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) +#else + int len; +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + len = pwrite(fd, ptr, size, pos); \ + rc = (len >= 0); } while(0) +#endif + + DPUTS("writing new meta page"); + + psize = env->me_psize; + + mdb_env_init_meta0(env, meta); + + p = calloc(2, psize); + p->mp_pgno = 0; + p->mp_flags = P_META; + *(MDB_meta *)METADATA(p) = *meta; + + q = (MDB_page *)((char *)p + psize); + q->mp_pgno = 1; + q->mp_flags = P_META; + *(MDB_meta *)METADATA(q) = *meta; + + DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); + if (!rc) + rc = ErrCode(); + else if ((unsigned) len == psize * 2) + rc = MDB_SUCCESS; + else + rc = ENOSPC; + free(p); + return rc; +} + +/** Update the environment info to commit a transaction. + * @param[in] txn the transaction that's being committed + * @return 0 on success, non-zero on failure. + */ +static int +mdb_env_write_meta(MDB_txn *txn) +{ + MDB_env *env; + MDB_meta meta, metab, *mp; + size_t mapsize; + off_t off; + int rc, len, toggle; + char *ptr; + HANDLE mfd; +#ifdef _WIN32 + OVERLAPPED ov; +#else + int r2; +#endif + + toggle = txn->mt_txnid & 1; + DPRINTF(("writing meta page %d for root page %"Z"u", + toggle, txn->mt_dbs[MAIN_DBI].md_root)); + + env = txn->mt_env; + mp = env->me_metas[toggle]; + mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; + /* Persist any increases of mapsize config */ + if (mapsize < env->me_mapsize) + mapsize = env->me_mapsize; + + if (env->me_flags & MDB_WRITEMAP) { + mp->mm_mapsize = mapsize; + mp->mm_dbs[0] = txn->mt_dbs[0]; + mp->mm_dbs[1] = txn->mt_dbs[1]; + mp->mm_last_pg = txn->mt_next_pgno - 1; + mp->mm_txnid = txn->mt_txnid; + if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + unsigned meta_size = env->me_psize; + rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + ptr = env->me_map; + if (toggle) { +#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ + if (meta_size < env->me_os_psize) + meta_size += meta_size; + else +#endif + ptr += meta_size; + } + if (MDB_MSYNC(ptr, meta_size, rc)) { + rc = ErrCode(); + goto fail; + } + } + goto done; + } + metab.mm_txnid = env->me_metas[toggle]->mm_txnid; + metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; + + meta.mm_mapsize = mapsize; + meta.mm_dbs[0] = txn->mt_dbs[0]; + meta.mm_dbs[1] = txn->mt_dbs[1]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; + + off = offsetof(MDB_meta, mm_mapsize); + ptr = (char *)&meta + off; + len = sizeof(MDB_meta) - off; + if (toggle) + off += env->me_psize; + off += PAGEHDRSZ; + + /* Write to the SYNC fd */ + mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? + env->me_fd : env->me_mfd; +#ifdef _WIN32 + { + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) + rc = -1; + } +#else + rc = pwrite(mfd, ptr, len, off); +#endif + if (rc != len) { + rc = rc < 0 ? ErrCode() : EIO; + DPUTS("write failed, disk error?"); + /* On a failure, the pagecache still contains the new data. + * Write some old data back, to prevent it from being used. + * Use the non-SYNC fd; we know it will fail anyway. 
+ */ + meta.mm_last_pg = metab.mm_last_pg; + meta.mm_txnid = metab.mm_txnid; +#ifdef _WIN32 + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + WriteFile(env->me_fd, ptr, len, NULL, &ov); +#else + r2 = pwrite(env->me_fd, ptr, len, off); + (void)r2; /* Silence warnings. We don't care about pwrite's return value */ +#endif +fail: + env->me_flags |= MDB_FATAL_ERROR; + return rc; + } + /* MIPS has cache coherency issues, this is a no-op everywhere else */ + CACHEFLUSH(env->me_map + off, len, DCACHE); +done: + /* Memory ordering issues are irrelevant; since the entire writer + * is wrapped by wmutex, all of these changes will become visible + * after the wmutex is unlocked. Since the DB is multi-version, + * readers will get consistent data regardless of how fresh or + * how stale their view of these values is. + */ + if (env->me_txns) + env->me_txns->mti_txnid = txn->mt_txnid; + + return MDB_SUCCESS; +} + +/** Check both meta pages to see which one is newer. + * @param[in] env the environment handle + * @return meta toggle (0 or 1). + */ +static int +mdb_env_pick_meta(const MDB_env *env) +{ + return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); +} + +int ESECT +mdb_env_create(MDB_env **env) +{ + MDB_env *e; + + e = calloc(1, sizeof(MDB_env)); + if (!e) + return ENOMEM; + + e->me_maxreaders = DEFAULT_READERS; + e->me_maxdbs = e->me_numdbs = 2; + e->me_fd = INVALID_HANDLE_VALUE; + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_mfd = INVALID_HANDLE_VALUE; +#ifdef MDB_USE_POSIX_SEM + e->me_rmutex = SEM_FAILED; + e->me_wmutex = SEM_FAILED; +#endif + e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); + VGMEMP_CREATE(e,0,0); + *env = e; + return MDB_SUCCESS; +} + +static int ESECT +mdb_env_map(MDB_env *env, void *addr) +{ + MDB_page *p; + unsigned int flags = env->me_flags; +#ifdef _WIN32 + int rc; + HANDLE mh; + LONG sizelo, sizehi; + size_t msize; + + if (flags & MDB_RDONLY) { + /* Don't set explicit map size, use whatever exists */ + msize = 0; + sizelo = 0; + sizehi = 0; + } else { + msize = env->me_mapsize; + sizelo = msize & 0xffffffff; + sizehi = msize >> 16 >> 16; /* only needed on Win64 */ + + /* Windows won't create mappings for zero length files. + * and won't map more than the file size. + * Just set the maxsize right now. + */ + if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo + || !SetEndOfFile(env->me_fd) + || SetFilePointer(env->me_fd, 0, NULL, 0) != 0) + return ErrCode(); + } + + mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? + PAGE_READWRITE : PAGE_READONLY, + sizehi, sizelo, NULL); + if (!mh) + return ErrCode(); + env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? + FILE_MAP_WRITE : FILE_MAP_READ, + 0, 0, msize, addr); + rc = env->me_map ? 0 : ErrCode(); + CloseHandle(mh); + if (rc) + return rc; +#else + int prot = PROT_READ; + if (flags & MDB_WRITEMAP) { + prot |= PROT_WRITE; + if (ftruncate(env->me_fd, env->me_mapsize) < 0) + return ErrCode(); + } + env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return ErrCode(); + } + + if (flags & MDB_NORDAHEAD) { + /* Turn off readahead. It's harmful when the DB is larger than RAM. 
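+ *
+ * Illustrative caller-side sketch (not part of this file); the flag is
+ * requested at open time, path and mode are placeholder values:
+ *
+ *	mdb_env_open(env, "/path/to/db", MDB_NORDAHEAD, 0664);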
*/ +#ifdef MADV_RANDOM + madvise(env->me_map, env->me_mapsize, MADV_RANDOM); +#else +#ifdef POSIX_MADV_RANDOM + posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); +#endif /* POSIX_MADV_RANDOM */ +#endif /* MADV_RANDOM */ + } +#endif /* _WIN32 */ + + /* Can happen because the address argument to mmap() is just a + * hint. mmap() can pick another, e.g. if the range is in use. + * The MAP_FIXED flag would prevent that, but then mmap could + * instead unmap existing pages to make room for the new map. + */ + if (addr && env->me_map != addr) + return EBUSY; /* TODO: Make a new MDB_* error code? */ + + p = (MDB_page *)env->me_map; + env->me_metas[0] = METADATA(p); + env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); + + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_mapsize(MDB_env *env, size_t size) +{ + /* If env is already open, caller is responsible for making + * sure there are no active txns. + */ + if (env->me_map) { + int rc; + void *old; + if (env->me_txn) + return EINVAL; + if (!size) + size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize; + else if (size < env->me_mapsize) { + /* If the configured size is smaller, make sure it's + * still big enough. Silently round up to minimum if not. + */ + size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize; + if (size < minsize) + size = minsize; + } + munmap(env->me_map, env->me_mapsize); + env->me_mapsize = size; + old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; + rc = mdb_env_map(env, old); + if (rc) + return rc; + } + env->me_mapsize = size; + if (env->me_psize) + env->me_maxpg = env->me_mapsize / env->me_psize; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) +{ + if (env->me_map) + return EINVAL; + env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */ + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) +{ + if (env->me_map || readers < 1) + return EINVAL; + env->me_maxreaders = readers; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) +{ + if (!env || !readers) + return EINVAL; + *readers = env->me_maxreaders; + return MDB_SUCCESS; +} + +/** Further setup required for opening an LMDB environment + */ +static int ESECT +mdb_env_open2(MDB_env *env) +{ + unsigned int flags = env->me_flags; + int i, newenv = 0, rc; + MDB_meta meta; + +#ifdef _WIN32 + /* See if we should use QueryLimited */ + rc = GetVersion(); + if ((rc & 0xff) > 5) + env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; + else + env->me_pidquery = PROCESS_QUERY_INFORMATION; +#endif /* _WIN32 */ + + memset(&meta, 0, sizeof(meta)); + + if ((i = mdb_env_read_header(env, &meta)) != 0) { + if (i != ENOENT) + return i; + DPUTS("new mdbenv"); + newenv = 1; + env->me_psize = env->me_os_psize; + if (env->me_psize > MAX_PAGESIZE) + env->me_psize = MAX_PAGESIZE; + } else { + env->me_psize = meta.mm_psize; + } + + /* Was a mapsize configured? */ + if (!env->me_mapsize) { + /* If this is a new environment, take the default, + * else use the size recorded in the existing env. + */ + env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize; + } else if (env->me_mapsize < meta.mm_mapsize) { + /* If the configured size is smaller, make sure it's + * still big enough. Silently round up to minimum if not. 
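+ *
+ * Illustrative caller-side sketch (not part of this file): the "configured
+ * size" here is whatever the caller requested before opening; the path and
+ * the 1 GiB figure are placeholder values, error checks omitted:
+ *
+ *	MDB_env *env;
+ *	mdb_env_create(&env);
+ *	mdb_env_set_mapsize(env, 1024UL * 1024 * 1024);   // request a 1 GiB map
+ *	mdb_env_open(env, "/path/to/db", 0, 0664);        // rounded up here if too small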
+ */ + size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; + if (env->me_mapsize < minsize) + env->me_mapsize = minsize; + } + + rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); + if (rc) + return rc; + + if (newenv) { + if (flags & MDB_FIXEDMAP) + meta.mm_address = env->me_map; + i = mdb_env_init_meta(env, &meta); + if (i != MDB_SUCCESS) { + return i; + } + } + + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) + - sizeof(indx_t); +#if !(MDB_MAXKEYSIZE) + env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); +#endif + env->me_maxpg = env->me_mapsize / env->me_psize; + +#if MDB_DEBUG + { + int toggle = mdb_env_pick_meta(env); + MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI]; + + DPRINTF(("opened database version %u, pagesize %u", + env->me_metas[0]->mm_version, env->me_psize)); + DPRINTF(("using meta page %d", toggle)); + DPRINTF(("depth: %u", db->md_depth)); + DPRINTF(("entries: %"Z"u", db->md_entries)); + DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); + DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); + DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); + DPRINTF(("root: %"Z"u", db->md_root)); + } +#endif + + return MDB_SUCCESS; +} + + +/** Release a reader thread's slot in the reader lock table. + * This function is called automatically when a thread exits. + * @param[in] ptr This points to the slot in the reader lock table. + */ +static void +mdb_env_reader_dest(void *ptr) +{ + MDB_reader *reader = ptr; + + reader->mr_pid = 0; +} + +#ifdef _WIN32 +/** Junk for arranging thread-specific callbacks on Windows. This is + * necessarily platform and compiler-specific. Windows supports up + * to 1088 keys. Let's assume nobody opens more than 64 environments + * in a single process, for now. They can override this if needed. + */ +#ifndef MAX_TLS_KEYS +#define MAX_TLS_KEYS 64 +#endif +static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS]; +static int mdb_tls_nkeys; + +static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) +{ + int i; + switch(reason) { + case DLL_PROCESS_ATTACH: break; + case DLL_THREAD_ATTACH: break; + case DLL_THREAD_DETACH: + for (i=0; i<mdb_tls_nkeys; i++) { + MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]); + if (r) { + mdb_env_reader_dest(r); + } + } + break; + case DLL_PROCESS_DETACH: break; + } +} +#endif /* _WIN32 */ + +/** Downgrade the exclusive lock on the region back to shared */ +static int ESECT +mdb_env_share_locks(MDB_env *env, int *excl) +{ + int rc = 0, toggle = mdb_env_pick_meta(env); + + env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; + +#ifdef _WIN32 + { + OVERLAPPED ov; + /* First acquire a shared lock. The Unlock will + * then release the existing exclusive lock. + */ + memset(&ov, 0, sizeof(ov)); + if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + rc = ErrCode(); + } else { + UnlockFile(env->me_lfd, 0, 0, 1, 0); + *excl = 0; + } + } +#else + { + struct flock lock_info; + /* The shared lock replaces the existing lock */ + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_RDLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = ErrCode()) == EINTR) ; + *excl = rc ? -1 : 0; /* error may mean we lost the lock */ + } +#endif + + return rc; +} + +/** Try to get exclusive lock, otherwise shared. + * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
+ */ +static int ESECT +mdb_env_excl_lock(MDB_env *env, int *excl) +{ + int rc = 0; +#ifdef _WIN32 + if (LockFile(env->me_lfd, 0, 0, 1, 0)) { + *excl = 1; + } else { + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + *excl = 0; + } else { + rc = ErrCode(); + } + } +#else + struct flock lock_info; + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = ErrCode()) == EINTR) ; + if (!rc) { + *excl = 1; + } else +# ifdef MDB_USE_POSIX_SEM + if (*excl < 0) /* always true when !MDB_USE_POSIX_SEM */ +# endif + { + lock_info.l_type = F_RDLCK; + while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && + (rc = ErrCode()) == EINTR) ; + if (rc == 0) + *excl = 0; + } +#endif + return rc; +} + +#ifdef MDB_USE_HASH +/* + * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code + * + * @(#) $Revision: 5.1 $ + * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ + * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html + * + *** + * + * Please do not copyright this code. This code is in the public domain. + * + * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO + * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + * + * By: + * chongo /\oo/\ + * http://www.isthe.com/chongo/ + * + * Share and Enjoy! :-) + */ + +typedef unsigned long long mdb_hash_t; +#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) + +/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * @param[in] val value to hash + * @param[in] hval initial value for hash + * @return 64 bit hash + * + * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the + * hval arg on the first call. + */ +static mdb_hash_t +mdb_hash_val(MDB_val *val, mdb_hash_t hval) +{ + unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ + unsigned char *end = s + val->mv_size; + /* + * FNV-1a hash each octet of the string + */ + while (s < end) { + /* xor the bottom with the current octet */ + hval ^= (mdb_hash_t)*s++; + + /* multiply by the 64 bit FNV magic prime mod 2^64 */ + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + /* return our new hash value */ + return hval; +} + +/** Hash the string and output the encoded hash. + * This uses modified RFC1924 Ascii85 encoding to accommodate systems with + * very short name limits. We don't care about the encoding being reversible, + * we just want to preserve as many bits of the input as possible in a + * small printable string. 
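+ *
+ * Illustrative sketch (not part of this file): the 64-bit FNV-1a value is
+ * split into two 32-bit halves and each half is packed into 5 base-85
+ * digits, giving 10 printable characters plus a terminating NUL:
+ *
+ *	char name[11];
+ *	mdb_hash_enc(&val, name);   // used below to build mutex/semaphore names
+ *	// name[0..4] encode the low half of the hash, name[5..9] the high half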
+ * @param[in] str string to hash + * @param[out] encbuf an array of 11 chars to hold the hash + */ +static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; + +static void +mdb_pack85(unsigned long l, char *out) +{ + int i; + + for (i=0; i<5; i++) { + *out++ = mdb_a85[l % 85]; + l /= 85; + } +} + +static void +mdb_hash_enc(MDB_val *val, char *encbuf) +{ + mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); + + mdb_pack85(h, encbuf); + mdb_pack85(h>>32, encbuf+5); + encbuf[10] = '\0'; +} +#endif + +/** Open and/or initialize the lock region for the environment. + * @param[in] env The LMDB environment. + * @param[in] lpath The pathname of the file used for the lock region. + * @param[in] mode The Unix permissions for the file, if we create it. + * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive + * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) +{ +#ifdef _WIN32 +# define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT +#else +# define MDB_ERRCODE_ROFS EROFS +#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ +# define MDB_CLOEXEC O_CLOEXEC +#else + int fdflags; +# define MDB_CLOEXEC 0 +#endif +#endif + int rc; + off_t size, rsize; + +#ifdef _WIN32 + env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, + FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, + FILE_ATTRIBUTE_NORMAL, NULL); +#else + env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); +#endif + if (env->me_lfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { + return MDB_SUCCESS; + } + goto fail_errno; + } +#if ! ((MDB_CLOEXEC) || defined(_WIN32)) + /* Lose record locks when exec*() */ + if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(env->me_lfd, F_SETFD, fdflags); +#endif + + if (!(env->me_flags & MDB_NOTLS)) { + rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); + if (rc) + goto fail; + env->me_flags |= MDB_ENV_TXKEY; +#ifdef _WIN32 + /* Windows TLS callbacks need help finding their TLS info. */ + if (mdb_tls_nkeys >= MAX_TLS_KEYS) { + rc = MDB_TLS_FULL; + goto fail; + } + mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; +#endif + } + + /* Try to get exclusive lock. If we succeed, then + * nobody is using the lock region and we should initialize it. 
+ */ + if ((rc = mdb_env_excl_lock(env, excl))) goto fail; + +#ifdef _WIN32 + size = GetFileSize(env->me_lfd, NULL); +#else + size = lseek(env->me_lfd, 0, SEEK_END); + if (size == -1) goto fail_errno; +#endif + rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); + if (size < rsize && *excl > 0) { +#ifdef _WIN32 + if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize + || !SetEndOfFile(env->me_lfd)) + goto fail_errno; +#else + if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; +#endif + } else { + rsize = size; + size = rsize - sizeof(MDB_txninfo); + env->me_maxreaders = size/sizeof(MDB_reader) + 1; + } + { +#ifdef _WIN32 + HANDLE mh; + mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, + 0, 0, NULL); + if (!mh) goto fail_errno; + env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); + CloseHandle(mh); + if (!env->me_txns) goto fail_errno; +#else + void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, + env->me_lfd, 0); + if (m == MAP_FAILED) goto fail_errno; + env->me_txns = m; +#endif + } + if (*excl > 0) { +#ifdef _WIN32 + BY_HANDLE_FILE_INFORMATION stbuf; + struct { + DWORD volume; + DWORD nhigh; + DWORD nlow; + } idbuf; + MDB_val val; + char encbuf[11]; + + if (!mdb_sec_inited) { + InitializeSecurityDescriptor(&mdb_null_sd, + SECURITY_DESCRIPTOR_REVISION); + SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); + mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); + mdb_all_sa.bInheritHandle = FALSE; + mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; + mdb_sec_inited = 1; + } + if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; + idbuf.volume = stbuf.dwVolumeSerialNumber; + idbuf.nhigh = stbuf.nFileIndexHigh; + idbuf.nlow = stbuf.nFileIndexLow; + val.mv_data = &idbuf; + val.mv_size = sizeof(idbuf); + mdb_hash_enc(&val, encbuf); + sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); + sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); + env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); + if (!env->me_rmutex) goto fail_errno; + env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); + if (!env->me_wmutex) goto fail_errno; +#elif defined(MDB_USE_POSIX_SEM) + struct stat stbuf; + struct { + dev_t dev; + ino_t ino; + } idbuf; + MDB_val val; + char encbuf[11]; + +#if defined(__NetBSD__) +#define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ +#endif + if (fstat(env->me_lfd, &stbuf)) goto fail_errno; + idbuf.dev = stbuf.st_dev; + idbuf.ino = stbuf.st_ino; + val.mv_data = &idbuf; + val.mv_size = sizeof(idbuf); + mdb_hash_enc(&val, encbuf); +#ifdef MDB_SHORT_SEMNAMES + encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ +#endif + sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); + sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); + /* Clean up after a previous run, if needed: Try to + * remove both semaphores before doing anything else. 
+ */ + sem_unlink(env->me_txns->mti_rmname); + sem_unlink(env->me_txns->mti_wmname); + env->me_rmutex = sem_open(env->me_txns->mti_rmname, + O_CREAT|O_EXCL, mode, 1); + if (env->me_rmutex == SEM_FAILED) goto fail_errno; + env->me_wmutex = sem_open(env->me_txns->mti_wmname, + O_CREAT|O_EXCL, mode, 1); + if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#else /* MDB_USE_POSIX_SEM */ + pthread_mutexattr_t mattr; + + if ((rc = pthread_mutexattr_init(&mattr)) + || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) + || (rc = pthread_mutex_init(&env->me_txns->mti_mutex, &mattr)) + || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr))) + goto fail; + pthread_mutexattr_destroy(&mattr); +#endif /* _WIN32 || MDB_USE_POSIX_SEM */ + + env->me_txns->mti_magic = MDB_MAGIC; + env->me_txns->mti_format = MDB_LOCK_FORMAT; + env->me_txns->mti_txnid = 0; + env->me_txns->mti_numreaders = 0; + + } else { + if (env->me_txns->mti_magic != MDB_MAGIC) { + DPUTS("lock region has invalid magic"); + rc = MDB_INVALID; + goto fail; + } + if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { + DPRINTF(("lock region has format+version 0x%x, expected 0x%x", + env->me_txns->mti_format, MDB_LOCK_FORMAT)); + rc = MDB_VERSION_MISMATCH; + goto fail; + } + rc = ErrCode(); + if (rc && rc != EACCES && rc != EAGAIN) { + goto fail; + } +#ifdef _WIN32 + env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); + if (!env->me_rmutex) goto fail_errno; + env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); + if (!env->me_wmutex) goto fail_errno; +#elif defined(MDB_USE_POSIX_SEM) + env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); + if (env->me_rmutex == SEM_FAILED) goto fail_errno; + env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); + if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#endif + } + return MDB_SUCCESS; + +fail_errno: + rc = ErrCode(); +fail: + return rc; +} + + /** The name of the lock file in the DB environment */ +#define LOCKNAME "/lock.mdb" + /** The name of the data file in the DB environment */ +#define DATANAME "/data.mdb" + /** The suffix of the lock file when no subdir is used */ +#define LOCKSUFF "-lock" + /** Only a subset of the @ref mdb_env flags can be changed + * at runtime. Changing other flags requires closing the + * environment and re-opening it with the new flags. 
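+ *
+ * Illustrative caller-side sketch (not part of this file), assuming the
+ * usual mdb_env_set_flags() entry point: a CHANGEABLE flag such as
+ * MDB_NOSYNC can be toggled on a live environment, a CHANGELESS one cannot:
+ *
+ *	mdb_env_set_flags(env, MDB_NOSYNC, 1);   // allowed at runtime
+ *	mdb_env_set_flags(env, MDB_NOSYNC, 0);
+ *	// switching e.g. MDB_WRITEMAP requires close + reopen with the new flags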
+ */ +#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \ + MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) + +#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) +# error "Persistent DB flags & env flags overlap, but both go in mm_flags" +#endif + +int ESECT +mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) +{ + int oflags, rc, len, excl = -1; + char *lpath, *dpath; + + if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) + return EINVAL; + + len = strlen(path); + if (flags & MDB_NOSUBDIR) { + rc = len + sizeof(LOCKSUFF) + len + 1; + } else { + rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); + } + lpath = malloc(rc); + if (!lpath) + return ENOMEM; + if (flags & MDB_NOSUBDIR) { + dpath = lpath + len + sizeof(LOCKSUFF); + sprintf(lpath, "%s" LOCKSUFF, path); + strcpy(dpath, path); + } else { + dpath = lpath + len + sizeof(LOCKNAME); + sprintf(lpath, "%s" LOCKNAME, path); + sprintf(dpath, "%s" DATANAME, path); + } + + rc = MDB_SUCCESS; + flags |= env->me_flags; + if (flags & MDB_RDONLY) { + /* silently ignore WRITEMAP when we're only getting read access */ + flags &= ~MDB_WRITEMAP; + } else { + if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } + env->me_flags = flags |= MDB_ENV_ACTIVE; + if (rc) + goto leave; + + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { + rc = ENOMEM; + goto leave; + } + + /* For RDONLY, get lockfile after we know datafile exists */ + if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { + rc = mdb_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + } + +#ifdef _WIN32 + if (F_ISSET(flags, MDB_RDONLY)) { + oflags = GENERIC_READ; + len = OPEN_EXISTING; + } else { + oflags = GENERIC_READ|GENERIC_WRITE; + len = OPEN_ALWAYS; + } + mode = FILE_ATTRIBUTE_NORMAL; + env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, + NULL, len, mode, NULL); +#else + if (F_ISSET(flags, MDB_RDONLY)) + oflags = O_RDONLY; + else + oflags = O_RDWR | O_CREAT; + + env->me_fd = open(dpath, oflags, mode); +#endif + if (env->me_fd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + + if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { + rc = mdb_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + } + + if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { + if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { + env->me_mfd = env->me_fd; + } else { + /* Synchronous fd for meta writes. Needed even with + * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. 
+ */ +#ifdef _WIN32 + len = OPEN_EXISTING; + env->me_mfd = CreateFile(dpath, oflags, + FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, + mode | FILE_FLAG_WRITE_THROUGH, NULL); +#else + oflags &= ~O_CREAT; + env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode); +#endif + if (env->me_mfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + } + DPRINTF(("opened dbenv %p", (void *) env)); + if (excl > 0) { + rc = mdb_env_share_locks(env, &excl); + if (rc) + goto leave; + } + if (!((flags & MDB_RDONLY) || + (env->me_pbuf = calloc(1, env->me_psize)))) + rc = ENOMEM; + } + +leave: + if (rc) { + mdb_env_close0(env, excl); + } + free(lpath); + return rc; +} + +/** Destroy resources from mdb_env_open(), clear our readers & DBIs */ +static void ESECT +mdb_env_close0(MDB_env *env, int excl) +{ + int i; + + if (!(env->me_flags & MDB_ENV_ACTIVE)) + return; + + /* Doing this here since me_dbxs may not exist during mdb_env_close */ + for (i = env->me_maxdbs; --i > MAIN_DBI; ) + free(env->me_dbxs[i].md_name.mv_data); + + free(env->me_pbuf); + free(env->me_dbiseqs); + free(env->me_dbflags); + free(env->me_dbxs); + free(env->me_path); + free(env->me_dirty_list); + mdb_midl_free(env->me_free_pgs); + + if (env->me_flags & MDB_ENV_TXKEY) { + pthread_key_delete(env->me_txkey); +#ifdef _WIN32 + /* Delete our key from the global list */ + for (i=0; i<mdb_tls_nkeys; i++) + if (mdb_tls_keys[i] == env->me_txkey) { + mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; + mdb_tls_nkeys--; + break; + } +#endif + } + + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); + } + if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) + (void) close(env->me_mfd); + if (env->me_fd != INVALID_HANDLE_VALUE) + (void) close(env->me_fd); + if (env->me_txns) { + MDB_PID_T pid = env->me_pid; + /* Clearing readers is done in this function because + * me_txkey with its destructor must be disabled first. + */ + for (i = env->me_numreaders; --i >= 0; ) + if (env->me_txns->mti_readers[i].mr_pid == pid) + env->me_txns->mti_readers[i].mr_pid = 0; +#ifdef _WIN32 + if (env->me_rmutex) { + CloseHandle(env->me_rmutex); + if (env->me_wmutex) CloseHandle(env->me_wmutex); + } + /* Windows automatically destroys the mutexes when + * the last handle closes. + */ +#elif defined(MDB_USE_POSIX_SEM) + if (env->me_rmutex != SEM_FAILED) { + sem_close(env->me_rmutex); + if (env->me_wmutex != SEM_FAILED) + sem_close(env->me_wmutex); + /* If we have the filelock: If we are the + * only remaining user, clean up semaphores. + */ + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) { + sem_unlink(env->me_txns->mti_rmname); + sem_unlink(env->me_txns->mti_wmname); + } + } +#endif + munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); + } + if (env->me_lfd != INVALID_HANDLE_VALUE) { +#ifdef _WIN32 + if (excl >= 0) { + /* Unlock the lockfile. Windows would have unlocked it + * after closing anyway, but not necessarily at once. 
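+ *
+ * Illustrative caller-side sketch (not part of this file): all of the
+ * cleanup in mdb_env_close0() is normally reached through the public call
+ * below, which frees cached dirty pages, unmaps the data file, and closes
+ * the data, meta, and lock file descriptors:
+ *
+ *	mdb_env_close(env);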
+ */ + UnlockFile(env->me_lfd, 0, 0, 1, 0); + } +#endif + (void) close(env->me_lfd); + } + + env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); +} + + +void ESECT +mdb_env_close(MDB_env *env) +{ + MDB_page *dp; + + if (env == NULL) + return; + + VGMEMP_DESTROY(env); + while ((dp = env->me_dpages) != NULL) { + VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dpages = dp->mp_next; + free(dp); + } + + mdb_env_close0(env, 0); + free(env); +} + +/** Compare two items pointing at aligned size_t's */ +static int +mdb_cmp_long(const MDB_val *a, const MDB_val *b) +{ + return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : + *(size_t *)a->mv_data > *(size_t *)b->mv_data; +} + +/** Compare two items pointing at aligned unsigned int's */ +static int +mdb_cmp_int(const MDB_val *a, const MDB_val *b) +{ + return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : + *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; +} + +/** Compare two items pointing at unsigned ints of unknown alignment. + * Nodes and keys are guaranteed to be 2-byte aligned. + */ +static int +mdb_cmp_cint(const MDB_val *a, const MDB_val *b) +{ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short *u, *c; + int x; + + u = (unsigned short *) ((char *) a->mv_data + a->mv_size); + c = (unsigned short *) ((char *) b->mv_data + a->mv_size); + do { + x = *--u - *--c; + } while(!x && u > (unsigned short *)a->mv_data); + return x; +#else + unsigned short *u, *c, *end; + int x; + + end = (unsigned short *) ((char *) a->mv_data + a->mv_size); + u = (unsigned short *)a->mv_data; + c = (unsigned short *)b->mv_data; + do { + x = *u++ - *c++; + } while(!x && u < end); + return x; +#endif +} + +/** Compare two items pointing at size_t's of unknown alignment. */ +#ifdef MISALIGNED_OK +# define mdb_cmp_clong mdb_cmp_long +#else +# define mdb_cmp_clong mdb_cmp_cint +#endif + +/** Compare two items lexically */ +static int +mdb_cmp_memn(const MDB_val *a, const MDB_val *b) +{ + int diff; + ssize_t len_diff; + unsigned int len; + + len = a->mv_size; + len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; + if (len_diff > 0) { + len = b->mv_size; + len_diff = 1; + } + + diff = memcmp(a->mv_data, b->mv_data, len); + return diff ? diff : len_diff<0 ? -1 : len_diff; +} + +/** Compare two items in reverse byte order */ +static int +mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) +{ + const unsigned char *p1, *p2, *p1_lim; + ssize_t len_diff; + int diff; + + p1_lim = (const unsigned char *)a->mv_data; + p1 = (const unsigned char *)a->mv_data + a->mv_size; + p2 = (const unsigned char *)b->mv_data + b->mv_size; + + len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; + if (len_diff > 0) { + p1_lim += len_diff; + len_diff = 1; + } + + while (p1 > p1_lim) { + diff = *--p1 - *--p2; + if (diff) + return diff; + } + return len_diff<0 ? -1 : len_diff; +} + +/** Search for key within a page, using binary search. + * Returns the smallest entry larger or equal to the key. + * If exactp is non-null, stores whether the found entry was an exact match + * in *exactp (1 or 0). + * Updates the cursor index with the index of the found entry. + * If no entry larger or equal to the key is found, returns NULL. 
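+ *
+ * For illustration only (assuming the default lexical comparator; the keys
+ * are made up): on a leaf page holding {"apple", "cherry", "plum"}, a
+ * search for "banana" stops on "cherry" with *exactp == 0, a search for
+ * "cherry" stops there with *exactp == 1, and a search for "zebra" returns
+ * NULL because no entry is greater than or equal to the key.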
+ */ +static MDB_node * +mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) +{ + unsigned int i = 0, nkeys; + int low, high; + int rc = 0; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NULL; + MDB_val nodekey; + MDB_cmp_func *cmp; + DKBUF; + + nkeys = NUMKEYS(mp); + + DPRINTF(("searching %u keys in %s %spage %"Z"u", + nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp))); + + low = IS_LEAF(mp) ? 0 : 1; + high = nkeys - 1; + cmp = mc->mc_dbx->md_cmp; + + /* Branch pages have no data, so if using integer keys, + * alignment is guaranteed. Use faster mdb_cmp_int. + */ + if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { + if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) + cmp = mdb_cmp_long; + else + cmp = mdb_cmp_int; + } + + if (IS_LEAF2(mp)) { + nodekey.mv_size = mc->mc_db->md_pad; + node = NODEPTR(mp, 0); /* fake */ + while (low <= high) { + i = (low + high) >> 1; + nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); + rc = cmp(key, &nodekey); + DPRINTF(("found leaf index %u [%s], rc = %i", + i, DKEY(&nodekey), rc)); + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } else { + while (low <= high) { + i = (low + high) >> 1; + + node = NODEPTR(mp, i); + nodekey.mv_size = NODEKSZ(node); + nodekey.mv_data = NODEKEY(node); + + rc = cmp(key, &nodekey); +#if MDB_DEBUG + if (IS_LEAF(mp)) + DPRINTF(("found leaf index %u [%s], rc = %i", + i, DKEY(&nodekey), rc)); + else + DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", + i, DKEY(&nodekey), NODEPGNO(node), rc)); +#endif + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } + + if (rc > 0) { /* Found entry is less than the key. */ + i++; /* Skip to get the smallest entry larger than key. */ + if (!IS_LEAF2(mp)) + node = NODEPTR(mp, i); + } + if (exactp) + *exactp = (rc == 0 && nkeys > 0); + /* store the key index */ + mc->mc_ki[mc->mc_top] = i; + if (i >= nkeys) + /* There is no entry larger or equal to the key. */ + return NULL; + + /* nodeptr is fake for LEAF2 */ + return node; +} + +#if 0 +static void +mdb_cursor_adjust(MDB_cursor *mc, func) +{ + MDB_cursor *m2; + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { + func(mc, m2); + } + } +} +#endif + +/** Pop a page off the top of the cursor's stack. */ +static void +mdb_cursor_pop(MDB_cursor *mc) +{ + if (mc->mc_snum) { +#if MDB_DEBUG + MDB_page *top = mc->mc_pg[mc->mc_top]; +#endif + mc->mc_snum--; + if (mc->mc_snum) + mc->mc_top--; + + DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, + DDBI(mc), (void *) mc)); + } +} + +/** Push a page onto the top of the cursor's stack. */ +static int +mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) +{ + DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, + DDBI(mc), (void *) mc)); + + if (mc->mc_snum >= CURSOR_STACK) { + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CURSOR_FULL; + } + + mc->mc_top = mc->mc_snum++; + mc->mc_pg[mc->mc_top] = mp; + mc->mc_ki[mc->mc_top] = 0; + + return MDB_SUCCESS; +} + +/** Find the address of the page corresponding to a given page number. + * @param[in] txn the transaction for this access. + * @param[in] pgno the page number for the page to retrieve. + * @param[out] ret address of a pointer where the page's address will be stored. + * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. + * @return 0 on success, non-zero on failure. 
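+ *
+ * The callers below all follow the same minimal pattern (shown here only
+ * as a sketch):
+ *
+ *	MDB_page *mp;
+ *	int rc = mdb_page_get(txn, pgno, &mp, NULL);
+ *	if (rc != MDB_SUCCESS)
+ *		return rc;	// MDB_PAGE_NOTFOUND when pgno is past mt_next_pgno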
+ */ +static int +mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) +{ + MDB_env *env = txn->mt_env; + MDB_page *p = NULL; + int level; + + if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { + MDB_txn *tx2 = txn; + level = 1; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + unsigned x; + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). + */ + if (tx2->mt_spill_pgs) { + MDB_ID pn = pgno << 1; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + goto done; + } + } + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + level++; + } while ((tx2 = tx2->mt_parent) != NULL); + } + + if (pgno < txn->mt_next_pgno) { + level = 0; + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + DPRINTF(("page %"Z"u not found", pgno)); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_NOTFOUND; + } + +done: + *ret = p; + if (lvl) + *lvl = level; + return MDB_SUCCESS; +} + +/** Finish #mdb_page_search() / #mdb_page_search_lowest(). + * The cursor is at the root page, set up the rest of it. + */ +static int +mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int rc; + DKBUF; + + while (IS_BRANCH(mp)) { + MDB_node *node; + indx_t i; + + DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); + mdb_cassert(mc, NUMKEYS(mp) > 1); + DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); + + if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { + i = 0; + if (flags & MDB_PS_LAST) + i = NUMKEYS(mp) - 1; + } else { + int exact; + node = mdb_node_search(mc, key, &exact); + if (node == NULL) + i = NUMKEYS(mp) - 1; + else { + i = mc->mc_ki[mc->mc_top]; + if (!exact) { + mdb_cassert(mc, i > 0); + i--; + } + } + DPRINTF(("following index %u for key [%s]", i, DKEY(key))); + } + + mdb_cassert(mc, i < NUMKEYS(mp)); + node = NODEPTR(mp, i); + + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) + return rc; + + mc->mc_ki[mc->mc_top] = i; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + + if (flags & MDB_PS_MODIFY) { + if ((rc = mdb_page_touch(mc)) != 0) + return rc; + mp = mc->mc_pg[mc->mc_top]; + } + } + + if (!IS_LEAF(mp)) { + DPRINTF(("internal error, index points to a %02X page!?", + mp->mp_flags)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + + DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, + key ? DKEY(key) : "null")); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + return MDB_SUCCESS; +} + +/** Search for the lowest key under the current branch page. + * This just bypasses a NUMKEYS check in the current page + * before calling mdb_page_search_root(), because the callers + * are all in situations where the current page is known to + * be underfilled. 
+ */ +static int +mdb_page_search_lowest(MDB_cursor *mc) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NODEPTR(mp, 0); + int rc; + + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) + return rc; + + mc->mc_ki[mc->mc_top] = 0; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); +} + +/** Search for the page a given key should be in. + * Push it and its parent pages on the cursor stack. + * @param[in,out] mc the cursor for this operation. + * @param[in] key the key to search for, or NULL for first/last page. + * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB + * are touched (updated with new page numbers). + * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. + * This is used by #mdb_cursor_first() and #mdb_cursor_last(). + * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) +{ + int rc; + pgno_t root; + + /* Make sure the txn is still viable, then find the root from + * the txn's db table and set it as the root of the cursor's stack. + */ + if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("transaction has failed, must abort"); + return MDB_BAD_TXN; + } else { + /* Make sure we're using an up-to-date root */ + if (*mc->mc_dbflag & DB_STALE) { + MDB_cursor mc2; + if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) + return MDB_BAD_DBI; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); + if (rc) + return rc; + { + MDB_val data; + int exact = 0; + uint16_t flags; + MDB_node *leaf = mdb_node_search(&mc2, + &mc->mc_dbx->md_name, &exact); + if (!exact) + return MDB_NOTFOUND; + rc = mdb_node_read(mc->mc_txn, leaf, &data); + if (rc) + return rc; + memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), + sizeof(uint16_t)); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. + */ + if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) + return MDB_INCOMPATIBLE; + memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); + } + *mc->mc_dbflag &= ~DB_STALE; + } + root = mc->mc_db->md_root; + + if (root == P_INVALID) { /* Tree is empty. */ + DPUTS("tree is empty"); + return MDB_NOTFOUND; + } + } + + mdb_cassert(mc, root > 1); + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) + if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) + return rc; + + mc->mc_snum = 1; + mc->mc_top = 0; + + DPRINTF(("db %d root page %"Z"u has flags 0x%X", + DDBI(mc), root, mc->mc_pg[0]->mp_flags)); + + if (flags & MDB_PS_MODIFY) { + if ((rc = mdb_page_touch(mc))) + return rc; + } + + if (flags & MDB_PS_ROOTONLY) + return MDB_SUCCESS; + + return mdb_page_search_root(mc, key, flags); +} + +static int +mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) +{ + MDB_txn *txn = mc->mc_txn; + pgno_t pg = mp->mp_pgno; + unsigned x = 0, ovpages = mp->mp_pages; + MDB_env *env = txn->mt_env; + MDB_IDL sl = txn->mt_spill_pgs; + MDB_ID pn = pg << 1; + int rc; + + DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); + /* If the page is dirty or on the spill list we just acquired it, + * so we should give it back to our current free list, if any. + * Otherwise put it onto the list of pages we freed in this txn. + * + * Won't create me_pghead: me_pglast must be inited along with it. 
+ * Unsupported in nested txns: They would need to hide the page + * range in ancestor txns' dirty and spilled lists. + */ + if (env->me_pghead && + !txn->mt_parent && + ((mp->mp_flags & P_DIRTY) || + (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) + { + unsigned i, j; + pgno_t *mop; + MDB_ID2 *dl, ix, iy; + rc = mdb_midl_need(&env->me_pghead, ovpages); + if (rc) + return rc; + if (!(mp->mp_flags & P_DIRTY)) { + /* This page is no longer spilled */ + if (x == sl[0]) + sl[0]--; + else + sl[x] |= 1; + goto release; + } + /* Remove from dirty list */ + dl = txn->mt_u.dirty_list; + x = dl[0].mid--; + for (ix = dl[x]; ix.mptr != mp; ix = iy) { + if (x > 1) { + x--; + iy = dl[x]; + dl[x] = ix; + } else { + mdb_cassert(mc, x > 1); + j = ++(dl[0].mid); + dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + } + if (!(env->me_flags & MDB_WRITEMAP)) + mdb_dpage_free(env, mp); +release: + /* Insert in me_pghead */ + mop = env->me_pghead; + j = mop[0] + ovpages; + for (i = mop[0]; i && mop[i] < pg; i--) + mop[j--] = mop[i]; + while (j>i) + mop[j--] = pg++; + mop[0] += ovpages; + } else { + rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); + if (rc) + return rc; + } + mc->mc_db->md_overflow_pages -= ovpages; + return 0; +} + +/** Return the data associated with a given node. + * @param[in] txn The transaction for this operation. + * @param[in] leaf The node being read. + * @param[out] data Updated to point to the node's data. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) +{ + MDB_page *omp; /* overflow page */ + pgno_t pgno; + int rc; + + if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + data->mv_size = NODEDSZ(leaf); + data->mv_data = NODEDATA(leaf); + return MDB_SUCCESS; + } + + /* Read overflow data. + */ + data->mv_size = NODEDSZ(leaf); + memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); + if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { + DPRINTF(("read overflow page %"Z"u failed", pgno)); + return rc; + } + data->mv_data = METADATA(omp); + + return MDB_SUCCESS; +} + +int +mdb_get(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data) +{ + MDB_cursor mc; + MDB_xcursor mx; + int exact = 0; + DKBUF; + + DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); + + if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); +} + +/** Find a sibling for a page. + * Replaces the page at the top of the cursor's stack with the + * specified sibling, if one exists. + * @param[in] mc The cursor for this operation. + * @param[in] move_right Non-zero if the right sibling is requested, + * otherwise the left sibling. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_cursor_sibling(MDB_cursor *mc, int move_right) +{ + int rc; + MDB_node *indx; + MDB_page *mp; + + if (mc->mc_snum < 2) { + return MDB_NOTFOUND; /* root has no siblings */ + } + + mdb_cursor_pop(mc); + DPRINTF(("parent page is page %"Z"u, index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); + + if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { + DPRINTF(("no more keys left, moving to %s sibling", + move_right ? 
"right" : "left")); + if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { + /* undo cursor_pop before returning */ + mc->mc_top++; + mc->mc_snum++; + return rc; + } + } else { + if (move_right) + mc->mc_ki[mc->mc_top]++; + else + mc->mc_ki[mc->mc_top]--; + DPRINTF(("just moving to %s index key %u", + move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); + } + mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + + indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { + /* mc will be inconsistent if caller does mc_snum++ as above */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + return rc; + } + + mdb_cursor_push(mc, mp); + if (!move_right) + mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; + + return MDB_SUCCESS; +} + +/** Move the cursor to the next data item. */ +static int +mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) +{ + MDB_page *mp; + MDB_node *leaf; + int rc; + + if (mc->mc_flags & C_EOF) { + return MDB_NOTFOUND; + } + + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_NEXT || op == MDB_NEXT_DUP) { + rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + } + } + + DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + if (mc->mc_flags & C_DEL) + goto skip; + + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { + DPUTS("=====> move to next sibling page"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]++; + +skip: + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the previous data item. 
*/ +static int +mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) +{ + MDB_page *mp; + MDB_node *leaf; + int rc; + + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { + rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) { + MDB_GET_KEY(leaf, key); + mc->mc_flags &= ~C_EOF; + } + return rc; + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_PREV_DUP) + return MDB_NOTFOUND; + } + } + } + + DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + + if (mc->mc_ki[mc->mc_top] == 0) { + DPUTS("=====> move to prev sibling page"); + if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]--; + + mc->mc_flags &= ~C_EOF; + + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Set the cursor on a specific data item. */ +static int +mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op, int *exactp) +{ + int rc; + MDB_page *mp; + MDB_node *leaf = NULL; + DKBUF; + + if (key->mv_size == 0) + return MDB_BAD_VALSIZE; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + /* See if we're already on the right page */ + if (mc->mc_flags & C_INITIALIZED) { + MDB_val nodekey; + + mp = mc->mc_pg[mc->mc_top]; + if (!NUMKEYS(mp)) { + mc->mc_ki[mc->mc_top] = 0; + return MDB_NOTFOUND; + } + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_size = mc->mc_db->md_pad; + nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, 0); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* Probably happens rarely, but first node on the page + * was the one we wanted. 
+			 */
+			mc->mc_ki[mc->mc_top] = 0;
+			if (exactp)
+				*exactp = 1;
+			goto set1;
+		}
+		if (rc > 0) {
+			unsigned int i;
+			unsigned int nkeys = NUMKEYS(mp);
+			if (nkeys > 1) {
+				if (mp->mp_flags & P_LEAF2) {
+					nodekey.mv_data = LEAF2KEY(mp,
+						nkeys-1, nodekey.mv_size);
+				} else {
+					leaf = NODEPTR(mp, nkeys-1);
+					MDB_GET_KEY2(leaf, nodekey);
+				}
+				rc = mc->mc_dbx->md_cmp(key, &nodekey);
+				if (rc == 0) {
+					/* last node was the one we wanted */
+					mc->mc_ki[mc->mc_top] = nkeys-1;
+					if (exactp)
+						*exactp = 1;
+					goto set1;
+				}
+				if (rc < 0) {
+					if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
+						/* This is definitely the right page, skip search_page */
+						if (mp->mp_flags & P_LEAF2) {
+							nodekey.mv_data = LEAF2KEY(mp,
+								mc->mc_ki[mc->mc_top], nodekey.mv_size);
+						} else {
+							leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
+							MDB_GET_KEY2(leaf, nodekey);
+						}
+						rc = mc->mc_dbx->md_cmp(key, &nodekey);
+						if (rc == 0) {
+							/* current node was the one we wanted */
+							if (exactp)
+								*exactp = 1;
+							goto set1;
+						}
+					}
+					rc = 0;
+					goto set2;
+				}
+			}
+			/* If any parents have right-sibs, search.
+			 * Otherwise, there's nothing further.
+			 */
+			for (i=0; i<mc->mc_top; i++)
+				if (mc->mc_ki[i] <
+					NUMKEYS(mc->mc_pg[i])-1)
+					break;
+			if (i == mc->mc_top) {
+				/* There are no other pages */
+				mc->mc_ki[mc->mc_top] = nkeys;
+				return MDB_NOTFOUND;
+			}
+		}
+		if (!mc->mc_top) {
+			/* There are no other pages */
+			mc->mc_ki[mc->mc_top] = 0;
+			if (op == MDB_SET_RANGE && !exactp) {
+				rc = 0;
+				goto set1;
+			} else
+				return MDB_NOTFOUND;
+		}
+	}
+
+	rc = mdb_page_search(mc, key, 0);
+	if (rc != MDB_SUCCESS)
+		return rc;
+
+	mp = mc->mc_pg[mc->mc_top];
+	mdb_cassert(mc, IS_LEAF(mp));
+
+set2:
+	leaf = mdb_node_search(mc, key, exactp);
+	if (exactp != NULL && !*exactp) {
+		/* MDB_SET specified and not an exact match.
*/ + return MDB_NOTFOUND; + } + + if (leaf == NULL) { + DPUTS("===> inexact leaf not found, goto sibling"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) + return rc; /* no entries matched */ + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, 0); + } + +set1: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + if (IS_LEAF2(mp)) { + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; + } else { + ex2p = NULL; + } + rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); + if (rc != MDB_SUCCESS) + return rc; + } + } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + MDB_val d2; + if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS) + return rc; + rc = mc->mc_dbx->md_dcmp(data, &d2); + if (rc) { + if (op == MDB_GET_BOTH || rc > 0) + return MDB_NOTFOUND; + rc = 0; + *data = d2; + } + + } else { + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + /* The key already matches in all other cases */ + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) + MDB_GET_KEY(leaf, key); + DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); + + return rc; +} + +/** Move the cursor to the first item in the database. */ +static int +mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + mc->mc_ki[mc->mc_top] = 0; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the last item in the database. 
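+ *
+ * Together with mdb_cursor_prev() this backs the usual reverse scan via the
+ * public mdb_cursor_get() entry point; a minimal sketch (cursor obtained
+ * from mdb_cursor_open(), error handling trimmed):
+ *
+ *	MDB_val k, d;
+ *	int rc;
+ *	for (rc = mdb_cursor_get(mc, &k, &d, MDB_LAST); rc == MDB_SUCCESS;
+ *	     rc = mdb_cursor_get(mc, &k, &d, MDB_PREV)) {
+ *		// consume k and d here
+ *	}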
*/ +static int +mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + if (!(mc->mc_flags & C_EOF)) { + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + } + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_flags |= C_INITIALIZED|C_EOF; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +int +mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) +{ + int rc; + int exact = 0; + int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); + + if (mc == NULL) + return EINVAL; + + if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + switch (op) { + case MDB_GET_CURRENT: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + } else { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + rc = MDB_NOTFOUND; + break; + } + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } else { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_flags & C_DEL) + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); + } else { + rc = mdb_node_read(mc->mc_txn, leaf, data); + } + } + } + } + break; + case MDB_GET_BOTH: + case MDB_GET_BOTH_RANGE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + /* FALLTHRU */ + case MDB_SET: + case MDB_SET_KEY: + case MDB_SET_RANGE: + if (key == NULL) { + rc = EINVAL; + } else { + rc = mdb_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? 
NULL : &exact); + } + break; + case MDB_GET_MULTIPLE: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = MDB_SUCCESS; + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || + (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + break; + goto fetchm; + case MDB_NEXT_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (rc == MDB_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDB_cursor *mx; +fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * + mx->mc_db->md_pad; + data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_NEXT: + case MDB_NEXT_DUP: + case MDB_NEXT_NODUP: + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, op); + break; + case MDB_PREV: + case MDB_PREV_DUP: + case MDB_PREV_NODUP: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + break; + mc->mc_flags |= C_INITIALIZED; + mc->mc_ki[mc->mc_top]++; + } + rc = mdb_cursor_prev(mc, key, data, op); + break; + case MDB_FIRST: + rc = mdb_cursor_first(mc, key, data); + break; + case MDB_FIRST_DUP: + mfunc = mdb_cursor_first; + mmove: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + { + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_GET_KEY(leaf, key); + if (data) + rc = mdb_node_read(mc->mc_txn, leaf, data); + break; + } + } + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDB_LAST: + rc = mdb_cursor_last(mc, key, data); + break; + case MDB_LAST_DUP: + mfunc = mdb_cursor_last; + goto mmove; + default: + DPRINTF(("unhandled/unimplemented cursor operation %u", op)); + rc = EINVAL; + break; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + return rc; +} + +/** Touch all the pages in the cursor stack. Set mc_top. + * Makes sure all the pages are writable, before attempting a write operation. + * @param[in] mc The cursor to operate on. 
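+ *
+ * In effect this is the copy-on-write step: before mdb_cursor_put() or
+ * mdb_cursor_del() modify a leaf, every page on the path from the root down
+ * to that leaf is replaced with a dirty, writable copy via mdb_page_touch().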
+ */ +static int +mdb_cursor_touch(MDB_cursor *mc) +{ + int rc = MDB_SUCCESS; + + if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { + MDB_cursor mc2; + MDB_xcursor mcx; + if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) + return MDB_BAD_DBI; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); + if (rc) + return rc; + *mc->mc_dbflag |= DB_DIRTY; + } + mc->mc_top = 0; + if (mc->mc_snum) { + do { + rc = mdb_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum-1; + } + return rc; +} + +/** Do not spill pages to disk if txn is getting full, may fail instead */ +#define MDB_NOSPILL 0x8000 + +int +mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, + unsigned int flags) +{ + enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */ + MDB_env *env; + MDB_node *leaf = NULL; + MDB_page *fp, *mp; + uint16_t fp_flags; + MDB_val xdata, *rdata, dkey, olddata; + MDB_db dummy; + int do_sub = 0, insert_key, insert_data; + unsigned int mcount = 0, dcount = 0, nospill; + size_t nsize; + int rc, rc2; + unsigned int nflags; + DKBUF; + + if (mc == NULL || key == NULL) + return EINVAL; + + env = mc->mc_txn->mt_env; + + /* Check this first so counter will always be zero on any + * early failures. + */ + if (flags & MDB_MULTIPLE) { + dcount = data[1].mv_size; + data[1].mv_size = 0; + if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) + return MDB_INCOMPATIBLE; + } + + nospill = flags & MDB_NOSPILL; + flags &= ~MDB_NOSPILL; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (key->mv_size-1 >= ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; + +#if SIZE_MAX > MAXDATASIZE + if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) + return MDB_BAD_VALSIZE; +#else + if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; +#endif + + DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", + DDBI(mc), DKEY(key), key ? 
key->mv_size : 0, data->mv_size)); + + dkey.mv_size = 0; + + if (flags == MDB_CURRENT) { + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + rc = MDB_SUCCESS; + } else if (mc->mc_db->md_root == P_INVALID) { + /* new database, cursor has nothing to point to */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + rc = MDB_NO_ROOT; + } else { + int exact = 0; + MDB_val d2; + if (flags & MDB_APPEND) { + MDB_val k2; + rc = mdb_cursor_last(mc, &k2, &d2); + if (rc == 0) { + rc = mc->mc_dbx->md_cmp(key, &k2); + if (rc > 0) { + rc = MDB_NOTFOUND; + mc->mc_ki[mc->mc_top]++; + } else { + /* new key is <= last key */ + rc = MDB_KEYEXIST; + } + } + } else { + rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); + } + if ((flags & MDB_NOOVERWRITE) && rc == 0) { + DPRINTF(("duplicate key [%s]", DKEY(key))); + *data = d2; + return MDB_KEYEXIST; + } + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + /* Cursor is positioned, check for room in the dirty list */ + if (!nospill) { + if (flags & MDB_MULTIPLE) { + rdata = &xdata; + xdata.mv_size = data->mv_size * dcount; + } else { + rdata = data; + } + if ((rc2 = mdb_page_spill(mc, key, rdata))) + return rc2; + } + + if (rc == MDB_NO_ROOT) { + MDB_page *np; + /* new database, write a root leaf page */ + DPUTS("allocating new root leaf page"); + if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { + return rc2; + } + mdb_cursor_push(mc, np); + mc->mc_db->md_root = np->mp_pgno; + mc->mc_db->md_depth++; + *mc->mc_dbflag |= DB_DIRTY; + if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) + == MDB_DUPFIXED) + np->mp_flags |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; + } else { + /* make sure all cursor pages are writable */ + rc2 = mdb_cursor_touch(mc); + if (rc2) + return rc2; + } + + insert_key = insert_data = rc; + if (insert_key) { + /* The key does not exist */ + DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + LEAFSIZE(key, data) > env->me_nodemax) + { + /* Too big for a node, insert in sub-DB. Set up an empty + * "old sub-page" for prep_subDB to expand to a full page. + */ + fp_flags = P_LEAF|P_DIRTY; + fp = env->me_pbuf; + fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ + fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); + olddata.mv_size = PAGEHDRSZ; + goto prep_subDB; + } + } else { + /* there's only a key anyway, so this is a no-op */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + char *ptr; + unsigned int ksize = mc->mc_db->md_pad; + if (key->mv_size != ksize) + return MDB_BAD_VALSIZE; + ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); +fix_parent: + /* if overwriting slot 0 of leaf, need to + * update branch key if there is a parent page + */ + if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + unsigned short top = mc->mc_top; + mc->mc_top--; + /* slot 0 is always an empty key, find real slot */ + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) + mc->mc_top--; + if (mc->mc_ki[mc->mc_top]) + rc2 = mdb_update_key(mc, key); + else + rc2 = MDB_SUCCESS; + mc->mc_top = top; + if (rc2) + return rc2; + } + return MDB_SUCCESS; + } + +more: + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + olddata.mv_size = NODEDSZ(leaf); + olddata.mv_data = NODEDATA(leaf); + + /* DB has dups? */ + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + /* Prepare (sub-)page/sub-DB to accept the new item, + * if needed. fp: old sub-page or a header faking + * it. 
mp: new (sub-)page. offset: growth in page
+			 * size. xdata: node data with new page or DB.
+			 */
+			unsigned i, offset = 0;
+			mp = fp = xdata.mv_data = env->me_pbuf;
+			mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
+
+			/* Was a single item before, must convert now */
+			if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
+				/* Just overwrite the current item */
+				if (flags == MDB_CURRENT)
+					goto current;
+
+#if UINT_MAX < SIZE_MAX
+				if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
+					mc->mc_dbx->md_dcmp = mdb_cmp_clong;
+#endif
+				/* does data match? */
+				if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
+					if (flags & MDB_NODUPDATA)
+						return MDB_KEYEXIST;
+					/* overwrite it */
+					goto current;
+				}
+
+				/* Back up original data item */
+				dkey.mv_size = olddata.mv_size;
+				dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);
+
+				/* Make sub-page header for the dup items, with dummy body */
+				fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
+				fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
+				xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
+				if (mc->mc_db->md_flags & MDB_DUPFIXED) {
+					fp->mp_flags |= P_LEAF2;
+					fp->mp_pad = data->mv_size;
+					xdata.mv_size += 2 * data->mv_size;	/* leave space for 2 more */
+				} else {
+					xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
+						(dkey.mv_size & 1) + (data->mv_size & 1);
+				}
+				fp->mp_upper = xdata.mv_size - PAGEBASE;
+				olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
+			} else if (leaf->mn_flags & F_SUBDATA) {
+				/* Data is on sub-DB, just store it */
+				flags |= F_DUPDATA|F_SUBDATA;
+				goto put_sub;
+			} else {
+				/* Data is on sub-page */
+				fp = olddata.mv_data;
+				switch (flags) {
+				default:
+					if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
+						offset = EVEN(NODESIZE + sizeof(indx_t) +
+							data->mv_size);
+						break;
+					}
+					offset = fp->mp_pad;
+					if (SIZELEFT(fp) < offset) {
+						offset *= 4; /* space for 4 more */
+						break;
+					}
+					/* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
+				case MDB_CURRENT:
+					fp->mp_flags |= P_DIRTY;
+					COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
+					mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
+					flags |= F_DUPDATA;
+					goto put_sub;
+				}
+				xdata.mv_size = olddata.mv_size + offset;
+			}
+
+			fp_flags = fp->mp_flags;
+			if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
+				/* Too big for a sub-page, convert to sub-DB */
+				fp_flags &= ~P_SUBP;
+prep_subDB:
+				if (mc->mc_db->md_flags & MDB_DUPFIXED) {
+					fp_flags |= P_LEAF2;
+					dummy.md_pad = fp->mp_pad;
+					dummy.md_flags = MDB_DUPFIXED;
+					if (mc->mc_db->md_flags & MDB_INTEGERDUP)
+						dummy.md_flags |= MDB_INTEGERKEY;
+				} else {
+					dummy.md_pad = 0;
+					dummy.md_flags = 0;
+				}
+				dummy.md_depth = 1;
+				dummy.md_branch_pages = 0;
+				dummy.md_leaf_pages = 1;
+				dummy.md_overflow_pages = 0;
+				dummy.md_entries = NUMKEYS(fp);
+				xdata.mv_size = sizeof(MDB_db);
+				xdata.mv_data = &dummy;
+				if ((rc = mdb_page_alloc(mc, 1, &mp)))
+					return rc;
+				offset = env->me_psize - olddata.mv_size;
+				flags |= F_DUPDATA|F_SUBDATA;
+				dummy.md_root = mp->mp_pgno;
+			}
+			if (mp != fp) {
+				mp->mp_flags = fp_flags | P_DIRTY;
+				mp->mp_pad = fp->mp_pad;
+				mp->mp_lower = fp->mp_lower;
+				mp->mp_upper = fp->mp_upper + offset;
+				if (fp_flags & P_LEAF2) {
+					memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
+				} else {
+					memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
+						olddata.mv_size - fp->mp_upper - PAGEBASE);
+					for (i=0; i<NUMKEYS(fp); i++)
+						mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
+				}
+			}
+
+			rdata = &xdata;
+			flags |= F_DUPDATA;
+			do_sub = 1;
+			if (!insert_key)
+				mdb_node_del(mc, 0);
+			goto new_sub;
+		}
+current:
+
/* overflow page overwrites need special handling */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + + memcpy(&pg, olddata.mv_data, sizeof(pg)); + if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) + return rc2; + ovpages = omp->mp_pages; + + /* Is the ov page large enough? */ + if (ovpages >= dpages) { + if (!(omp->mp_flags & P_DIRTY) && + (level || (env->me_flags & MDB_WRITEMAP))) + { + rc = mdb_page_unspill(mc->mc_txn, omp, &omp); + if (rc) + return rc; + level = 0; /* dirty in this txn or clean */ + } + /* Is it dirty? */ + if (omp->mp_flags & P_DIRTY) { + /* yes, overwrite it. Note in this case we don't + * bother to try shrinking the page if the new data + * is smaller than the overflow threshold. + */ + if (level > 1) { + /* It is writable only in a parent txn */ + size_t sz = (size_t) env->me_psize * ovpages, off; + MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); + MDB_ID2 id2; + if (!np) + return ENOMEM; + id2.mid = pg; + id2.mptr = np; + rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + mdb_cassert(mc, rc2 == 0); + if (!(flags & MDB_RESERVE)) { + /* Copy end of page, adjusting alignment so + * compiler may copy words instead of bytes. + */ + off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); + memcpy((size_t *)((char *)np + off), + (size_t *)((char *)omp + off), sz - off); + sz = PAGEHDRSZ; + } + memcpy(np, omp, sz); /* Copy beginning of page */ + omp = np; + } + SETDSZ(leaf, data->mv_size); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(omp); + else + memcpy(METADATA(omp), data->mv_data, data->mv_size); + return MDB_SUCCESS; + } + } + if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; + } else if (data->mv_size == olddata.mv_size) { + /* same size, just replace it. Note that we could + * also reuse this node if the new data is smaller, + * but instead we opt to shrink the node in that case. + */ + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = olddata.mv_data; + else if (!(mc->mc_flags & C_SUB)) + memcpy(olddata.mv_data, data->mv_data, data->mv_size); + else { + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); + goto fix_parent; + } + return MDB_SUCCESS; + } + mdb_node_del(mc, 0); + } + + rdata = data; + +new_sub: + nflags = flags & NODE_ADD_FLAGS; + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); + if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { + if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) + nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ + if (!insert_key) + nflags |= MDB_SPLIT_REPLACE; + rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); + } else { + /* There is room already in this leaf page. */ + rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); + if (rc == 0 && insert_key) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { + m3->mc_ki[i]++; + } + } + } + } + + if (rc == MDB_SUCCESS) { + /* Now store the actual data in the child DB. Note that we're + * storing the user data in the keys field, so there are strict + * size limits on dupdata. 
The actual data fields of the child + * DB are all zero size. + */ + if (do_sub) { + int xflags; + size_t ecount; +put_sub: + xdata.mv_size = 0; + xdata.mv_data = ""; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (flags & MDB_CURRENT) { + xflags = MDB_CURRENT|MDB_NOSPILL; + } else { + mdb_xcursor_init1(mc, leaf); + xflags = (flags & MDB_NODUPDATA) ? + MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; + } + /* converted, write the original data first */ + if (dkey.mv_size) { + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + if (rc) + goto bad_sub; + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { + mdb_xcursor_init1(m2, leaf); + } + } + } + /* we've done our job */ + dkey.mv_size = 0; + } + ecount = mc->mc_xcursor->mx_db.md_entries; + if (flags & MDB_APPENDDUP) + xflags |= MDB_APPEND; + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + if (flags & F_SUBDATA) { + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } + insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; + } + /* Increment count unless we just replaced an existing item. */ + if (insert_data) + mc->mc_db->md_entries++; + if (insert_key) { + /* Invalidate txn if we created an empty sub-DB */ + if (rc) + goto bad_sub; + /* If we succeeded and the key didn't exist before, + * make sure the cursor is marked valid. + */ + mc->mc_flags |= C_INITIALIZED; + } + if (flags & MDB_MULTIPLE) { + if (!rc) { + mcount++; + /* let caller know how many succeeded, if any */ + data[1].mv_size = mcount; + if (mcount < dcount) { + data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + insert_key = insert_data = 0; + goto more; + } + } + } + return rc; +bad_sub: + if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ + rc = MDB_CORRUPTED; + } + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_cursor_del(MDB_cursor *mc, unsigned int flags) +{ + MDB_node *leaf; + MDB_page *mp; + int rc; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + + if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) + return rc; + + rc = mdb_cursor_touch(mc); + if (rc) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + if (IS_LEAF2(mp)) + goto del_key; + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (flags & MDB_NODUPDATA) { + /* mdb_cursor_del0() will subtract the final entry */ + mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; + } else { + if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); + if (rc) + return rc; + /* If sub-DB still has entries, we're done */ + if (mc->mc_xcursor->mx_db.md_entries) { + if (leaf->mn_flags & F_SUBDATA) { + /* update subDB info */ + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } else { + MDB_cursor *m2; + /* shrink fake page */ + mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + /* fix other sub-DB cursors pointed at this fake page */ + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (m2->mc_pg[mc->mc_top] == mp && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + } + mc->mc_db->md_entries--; + mc->mc_flags |= C_DEL; + return rc; + } + /* otherwise fall thru and delete the sub-DB */ + } + + if (leaf->mn_flags & F_SUBDATA) { + /* add all the child DB's pages to the free list */ + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + goto fail; + } + } + + /* add overflow pages to free list */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + + memcpy(&pg, NODEDATA(leaf), sizeof(pg)); + if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || + (rc = mdb_ovpage_free(mc, omp))) + goto fail; + } + +del_key: + return mdb_cursor_del0(mc); + +fail: + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Allocate and initialize new pages for a database. + * @param[in] mc a cursor on the database being added to. + * @param[in] flags flags defining what type of page is being allocated. + * @param[in] num the number of pages to allocate. This is usually 1, + * unless allocating overflow pages for a large record. + * @param[out] mp Address of a page, or NULL on failure. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) +{ + MDB_page *np; + int rc; + + if ((rc = mdb_page_alloc(mc, num, &np))) + return rc; + DPRINTF(("allocated new mpage %"Z"u, page size %u", + np->mp_pgno, mc->mc_txn->mt_env->me_psize)); + np->mp_flags = flags | P_DIRTY; + np->mp_lower = (PAGEHDRSZ-PAGEBASE); + np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; + + if (IS_BRANCH(np)) + mc->mc_db->md_branch_pages++; + else if (IS_LEAF(np)) + mc->mc_db->md_leaf_pages++; + else if (IS_OVERFLOW(np)) { + mc->mc_db->md_overflow_pages += num; + np->mp_pages = num; + } + *mp = np; + + return 0; +} + +/** Calculate the size of a leaf node. + * The size depends on the environment's page size; if a data item + * is too large it will be put onto an overflow page and the node + * size will only include the key and not the data. 
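+ * As a rough illustration (exact thresholds depend on the page size and
+ * build): a 10-byte key with a 16-byte value costs the node header plus
+ * 10 + 16 bytes in the leaf, while the same key with a 1 MB value costs
+ * only the header plus 10 bytes plus a pgno_t, since the value itself is
+ * written to overflow pages.
+ *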
Sizes are always + * rounded up to an even number of bytes, to guarantee 2-byte alignment + * of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @param[in] data The data for the node. + * @return The number of bytes needed to store the node. + */ +static size_t +mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) +{ + size_t sz; + + sz = LEAFSIZE(key, data); + if (sz > env->me_nodemax) { + /* put on overflow page */ + sz -= data->mv_size - sizeof(pgno_t); + } + + return EVEN(sz + sizeof(indx_t)); +} + +/** Calculate the size of a branch node. + * The size should depend on the environment's page size but since + * we currently don't support spilling large keys onto overflow + * pages, it's simply the size of the #MDB_node header plus the + * size of the key. Sizes are always rounded up to an even number + * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @return The number of bytes needed to store the node. + */ +static size_t +mdb_branch_size(MDB_env *env, MDB_val *key) +{ + size_t sz; + + sz = INDXSIZE(key); + if (sz > env->me_nodemax) { + /* put on overflow page */ + /* not implemented */ + /* sz -= key->size - sizeof(pgno_t); */ + } + + return sz + sizeof(indx_t); +} + +/** Add a node to the page pointed to by the cursor. + * @param[in] mc The cursor for this operation. + * @param[in] indx The index on the page where the new node should be added. + * @param[in] key The key for the new node. + * @param[in] data The data for the new node, if any. + * @param[in] pgno The page number, if adding a branch node. + * @param[in] flags Flags for the node. + * @return 0 on success, non-zero on failure. Possible errors are: + *
+ *	- ENOMEM - failed to allocate overflow pages for the node.
+ *	- MDB_PAGE_FULL - there is insufficient room in the page. This error
+ *	  should never happen since all callers already calculate the
+ *	  page's free space before calling this function.
+ */ +static int +mdb_node_add(MDB_cursor *mc, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags) +{ + unsigned int i; + size_t node_size = NODESIZE; + ssize_t room; + indx_t ofs; + MDB_node *node; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_page *ofp = NULL; /* overflow page */ + DKBUF; + + mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); + + DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", + IS_LEAF(mp) ? "leaf" : "branch", + IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, + key ? key->mv_size : 0, key ? DKEY(key) : "null")); + + if (IS_LEAF2(mp)) { + /* Move higher keys up one slot. */ + int ksize = mc->mc_db->md_pad, dif; + char *ptr = LEAF2KEY(mp, indx, ksize); + dif = NUMKEYS(mp) - indx; + if (dif > 0) + memmove(ptr+ksize, ptr, dif*ksize); + /* insert new key */ + memcpy(ptr, key->mv_data, ksize); + + /* Just using these for counting */ + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + return MDB_SUCCESS; + } + + room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); + if (key != NULL) + node_size += key->mv_size; + if (IS_LEAF(mp)) { + mdb_cassert(mc, data); + if (F_ISSET(flags, F_BIGDATA)) { + /* Data already on overflow page. */ + node_size += sizeof(pgno_t); + } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { + int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); + int rc; + /* Put data on overflow page. */ + DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", + data->mv_size, node_size+data->mv_size)); + node_size = EVEN(node_size + sizeof(pgno_t)); + if ((ssize_t)node_size > room) + goto full; + if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) + return rc; + DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); + flags |= F_BIGDATA; + goto update; + } else { + node_size += data->mv_size; + } + } + node_size = EVEN(node_size); + if ((ssize_t)node_size > room) + goto full; + +update: + /* Move higher pointers up one slot. */ + for (i = NUMKEYS(mp); i > indx; i--) + mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + + /* Adjust free space offsets. */ + ofs = mp->mp_upper - node_size; + mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); + mp->mp_ptrs[indx] = ofs; + mp->mp_upper = ofs; + mp->mp_lower += sizeof(indx_t); + + /* Write the node data. */ + node = NODEPTR(mp, indx); + node->mn_ksize = (key == NULL) ? 
0 : key->mv_size; + node->mn_flags = flags; + if (IS_LEAF(mp)) + SETDSZ(node,data->mv_size); + else + SETPGNO(node,pgno); + + if (key) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + if (IS_LEAF(mp)) { + mdb_cassert(mc, key); + if (ofp == NULL) { + if (F_ISSET(flags, F_BIGDATA)) + memcpy(node->mn_data + key->mv_size, data->mv_data, + sizeof(pgno_t)); + else if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = node->mn_data + key->mv_size; + else + memcpy(node->mn_data + key->mv_size, data->mv_data, + data->mv_size); + } else { + memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno, + sizeof(pgno_t)); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(ofp); + else + memcpy(METADATA(ofp), data->mv_data, data->mv_size); + } + } + + return MDB_SUCCESS; + +full: + DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + mdb_dbg_pgno(mp), NUMKEYS(mp))); + DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); + DPRINTF(("node size = %"Z"u", node_size)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_FULL; +} + +/** Delete the specified node from a page. + * @param[in] mc Cursor pointing to the node to delete. + * @param[in] ksize The size of a node. Only used if the page is + * part of a #MDB_DUPFIXED database. + */ +static void +mdb_node_del(MDB_cursor *mc, int ksize) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + indx_t indx = mc->mc_ki[mc->mc_top]; + unsigned int sz; + indx_t i, j, numkeys, ptr; + MDB_node *node; + char *base; + + DPRINTF(("delete node %u on %s page %"Z"u", indx, + IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); + numkeys = NUMKEYS(mp); + mdb_cassert(mc, indx < numkeys); + + if (IS_LEAF2(mp)) { + int x = numkeys - 1 - indx; + base = LEAF2KEY(mp, indx, ksize); + if (x) + memmove(base, base + ksize, x * ksize); + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += ksize - sizeof(indx_t); + return; + } + + node = NODEPTR(mp, indx); + sz = NODESIZE + node->mn_ksize; + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + sz += sizeof(pgno_t); + else + sz += NODEDSZ(node); + } + sz = EVEN(sz); + + ptr = mp->mp_ptrs[indx]; + for (i = j = 0; i < numkeys; i++) { + if (i != indx) { + mp->mp_ptrs[j] = mp->mp_ptrs[i]; + if (mp->mp_ptrs[i] < ptr) + mp->mp_ptrs[j] += sz; + j++; + } + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + sz, base, ptr - mp->mp_upper); + + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += sz; +} + +/** Compact the main page after deleting a node on a subpage. + * @param[in] mp The main page to operate on. + * @param[in] indx The index of the subpage on the main page. 
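+ *
+ * Sketch of the effect: the sub-page is moved up by its own free space,
+ * SIZELEFT(sp); the node's data size shrinks by that delta; and the nodes
+ * stored below it on the main page slide up to close the gap.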
+ */ +static void +mdb_node_shrink(MDB_page *mp, indx_t indx) +{ + MDB_node *node; + MDB_page *sp, *xp; + char *base; + int nsize, delta; + indx_t i, numkeys, ptr; + + node = NODEPTR(mp, indx); + sp = (MDB_page *)NODEDATA(node); + delta = SIZELEFT(sp); + xp = (MDB_page *)((char *)sp + delta); + + /* shift subpage upward */ + if (IS_LEAF2(sp)) { + nsize = NUMKEYS(sp) * sp->mp_pad; + if (nsize & 1) + return; /* do not make the node uneven-sized */ + memmove(METADATA(xp), METADATA(sp), nsize); + } else { + int i; + numkeys = NUMKEYS(sp); + for (i=numkeys-1; i>=0; i--) + xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; + } + xp->mp_upper = sp->mp_lower; + xp->mp_lower = sp->mp_lower; + xp->mp_flags = sp->mp_flags; + xp->mp_pad = sp->mp_pad; + COPY_PGNO(xp->mp_pgno, mp->mp_pgno); + + nsize = NODEDSZ(node) - delta; + SETDSZ(node, nsize); + + /* shift lower nodes upward */ + ptr = mp->mp_ptrs[indx]; + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] += delta; + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node)); + mp->mp_upper += delta; +} + +/** Initial setup of a sorted-dups cursor. + * Sorted duplicates are implemented as a sub-database for the given key. + * The duplicate data items are actually keys of the sub-database. + * Operations on the duplicate data items are performed using a sub-cursor + * initialized when the sub-database is first accessed. This function does + * the preliminary setup of the sub-cursor, filling in the fields that + * depend only on the parent DB. + * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. + */ +static void +mdb_xcursor_init0(MDB_cursor *mc) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + mx->mx_cursor.mc_xcursor = NULL; + mx->mx_cursor.mc_txn = mc->mc_txn; + mx->mx_cursor.mc_db = &mx->mx_db; + mx->mx_cursor.mc_dbx = &mx->mx_dbx; + mx->mx_cursor.mc_dbi = mc->mc_dbi; + mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + mx->mx_dbx.md_name.mv_size = 0; + mx->mx_dbx.md_name.mv_data = NULL; + mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; + mx->mx_dbx.md_dcmp = NULL; + mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; +} + +/** Final setup of a sorted-dups cursor. + * Sets up the fields that depend on the data from the main cursor. + * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. + * @param[in] node The data containing the #MDB_db record for the + * sorted-dup database. 
+ */ +static void +mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + if (node->mn_flags & F_SUBDATA) { + memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); + mx->mx_cursor.mc_pg[0] = 0; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + } else { + MDB_page *fp = NODEDATA(node); + mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad; + mx->mx_db.md_flags = 0; + mx->mx_db.md_depth = 1; + mx->mx_db.md_branch_pages = 0; + mx->mx_db.md_leaf_pages = 1; + mx->mx_db.md_overflow_pages = 0; + mx->mx_db.md_entries = NUMKEYS(fp); + COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; + mx->mx_cursor.mc_pg[0] = fp; + mx->mx_cursor.mc_ki[0] = 0; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + mx->mx_db.md_flags = MDB_DUPFIXED; + mx->mx_db.md_pad = fp->mp_pad; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + mx->mx_db.md_flags |= MDB_INTEGERKEY; + } + } + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); + mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ +#if UINT_MAX < SIZE_MAX + if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) + mx->mx_dbx.md_cmp = mdb_cmp_clong; +#endif +} + +/** Initialize a cursor for a given transaction and database. */ +static void +mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) +{ + mc->mc_next = NULL; + mc->mc_backup = NULL; + mc->mc_dbi = dbi; + mc->mc_txn = txn; + mc->mc_db = &txn->mt_dbs[dbi]; + mc->mc_dbx = &txn->mt_dbxs[dbi]; + mc->mc_dbflag = &txn->mt_dbflags[dbi]; + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_pg[0] = 0; + mc->mc_flags = 0; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + mdb_tassert(txn, mx != NULL); + mc->mc_xcursor = mx; + mdb_xcursor_init0(mc); + } else { + mc->mc_xcursor = NULL; + } + if (*mc->mc_dbflag & DB_STALE) { + mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); + } +} + +int +mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) +{ + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); + + if (!ret || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + /* Allow read access to the freelist */ + if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EINVAL; + + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); + + if ((mc = malloc(size)) != NULL) { + mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + } else { + return ENOMEM; + } + + *ret = mc; + + return MDB_SUCCESS; +} + +int +mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) +{ + if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi)) + return EINVAL; + + if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; +} + +/* Return the count of duplicate data items for the current key */ +int +mdb_cursor_count(MDB_cursor *mc, size_t *countp) +{ + MDB_node *leaf; + + if (mc == NULL || countp == NULL) + return EINVAL; + + if (mc->mc_xcursor == NULL) + return MDB_INCOMPATIBLE; + + if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (!mc->mc_snum || (mc->mc_flags & C_EOF)) + return 
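/* Usage sketch for the cursor API defined above: position a cursor on a key
 * and ask how many duplicates it carries.  Assumes an open environment `env`
 * and a DUPSORT database handle `dbi` obtained elsewhere; error handling is
 * abbreviated. */
static int
example_count_dups(MDB_env *env, MDB_dbi dbi, MDB_val *key)
{
	MDB_txn *txn;
	MDB_cursor *cur = NULL;
	MDB_val data;
	size_t count = 0;
	int rc;

	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		return rc;
	rc = mdb_cursor_open(txn, dbi, &cur);
	if (rc == MDB_SUCCESS)
		rc = mdb_cursor_get(cur, key, &data, MDB_SET_KEY);
	if (rc == MDB_SUCCESS)
		rc = mdb_cursor_count(cur, &count);   /* number of dups for this key */
	mdb_cursor_close(cur);
	mdb_txn_abort(txn);                       /* read-only txn, nothing to commit */
	return rc;
}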
MDB_NOTFOUND; + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + *countp = 1; + } else { + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + return EINVAL; + + *countp = mc->mc_xcursor->mx_db.md_entries; + } + return MDB_SUCCESS; +} + +void +mdb_cursor_close(MDB_cursor *mc) +{ + if (mc && !mc->mc_backup) { + /* remove from txn, if tracked */ + if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + } + free(mc); + } +} + +MDB_txn * +mdb_cursor_txn(MDB_cursor *mc) +{ + if (!mc) return NULL; + return mc->mc_txn; +} + +MDB_dbi +mdb_cursor_dbi(MDB_cursor *mc) +{ + return mc->mc_dbi; +} + +/** Replace the key for a branch node with a new key. + * @param[in] mc Cursor pointing to the node to operate on. + * @param[in] key The new key to use. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_update_key(MDB_cursor *mc, MDB_val *key) +{ + MDB_page *mp; + MDB_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + indx_t ptr, i, numkeys, indx; + DKBUF; + + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; +#if MDB_DEBUG + { + MDB_val k2; + char kbuf2[DKBUF_MAXKEYSIZE*2+1]; + k2.mv_data = NODEKEY(node); + k2.mv_size = node->mn_ksize; + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", + indx, ptr, + mdb_dkey(&k2, kbuf2), + DKEY(key), + mp->mp_pgno)); + } +#endif + + /* Sizes must be 2-byte aligned. */ + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; + + /* Shift node contents if EVEN(key length) changed. */ + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + pgno_t pgno; + /* not enough space left, do a delete and split */ + DPRINTF(("Not enough room, delta = %d, splitting...", delta)); + pgno = NODEPGNO(node); + mdb_node_del(mc, 0); + return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + } + + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mp->mp_upper -= delta; + + node = NODEPTR(mp, indx); + } + + /* But even if no shift was needed, update ksize */ + if (node->mn_ksize != key->mv_size) + node->mn_ksize = key->mv_size; + + if (key->mv_size) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + return MDB_SUCCESS; +} + +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); + +/** Move a node from csrc to cdst. + */ +static int +mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) +{ + MDB_node *srcnode; + MDB_val key, data; + pgno_t srcpg; + MDB_cursor mn; + int rc; + unsigned short flags; + + DKBUF; + + /* Mark src and dst as dirty. 
*/ + if ((rc = mdb_page_touch(csrc)) || + (rc = mdb_page_touch(cdst))) + return rc; + + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); + data.mv_size = 0; + data.mv_data = NULL; + srcpg = 0; + flags = 0; + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); + mdb_cassert(csrc, !((size_t)srcnode & 1)); + srcpg = NODEPGNO(srcnode); + flags = srcnode->mn_flags; + if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned int snum = csrc->mc_snum; + MDB_node *s2; + /* must find the lowest key below src */ + rc = mdb_page_search_lowest(csrc); + if (rc) + return rc; + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; + csrc->mc_top = snum; + } else { + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { + unsigned int snum = cdst->mc_snum; + MDB_node *s2; + MDB_val bkey; + /* must find the lowest key below dst */ + mdb_cursor_copy(cdst, &mn); + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + bkey.mv_size = mn.mc_db->md_pad; + bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + bkey.mv_size = NODEKSZ(s2); + bkey.mv_data = NODEKEY(s2); + } + mn.mc_snum = snum--; + mn.mc_top = snum; + mn.mc_ki[snum] = 0; + rc = mdb_update_key(&mn, &bkey); + if (rc) + return rc; + } + + DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", + IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", + csrc->mc_ki[csrc->mc_top], + DKEY(&key), + csrc->mc_pg[csrc->mc_top]->mp_pgno, + cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); + + /* Add the node to the destination page. + */ + rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); + if (rc != MDB_SUCCESS) + return rc; + + /* Delete the node from the source page. + */ + mdb_node_del(csrc, key.mv_size); + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mp = csrc->mc_pg[csrc->mc_top]; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == + csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + } + } + } + + /* Update the parent separators. 
+ */ + if (csrc->mc_ki[csrc->mc_top] == 0) { + if (csrc->mc_ki[csrc->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for source page %"Z"u to [%s]", + csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(csrc, &mn); + mn.mc_snum--; + mn.mc_top--; + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) + return rc; + } + if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; + nullkey.mv_size = 0; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdb_update_key(csrc, &nullkey); + csrc->mc_ki[csrc->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + if (cdst->mc_ki[cdst->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for destination page %"Z"u to [%s]", + cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(cdst, &mn); + mn.mc_snum--; + mn.mc_top--; + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) + return rc; + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { + MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; + nullkey.mv_size = 0; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdb_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + return MDB_SUCCESS; +} + +/** Merge one page into another. + * The nodes from the page pointed to by \b csrc will + * be copied to the page pointed to by \b cdst and then + * the \b csrc page will be freed. + * @param[in] csrc Cursor pointing to the source page. + * @param[in] cdst Cursor pointing to the destination page. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) +{ + MDB_page *psrc, *pdst; + MDB_node *srcnode; + MDB_val key, data; + unsigned nkeys; + int rc; + indx_t i, j; + + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + + DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); + + mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ + mdb_cassert(csrc, cdst->mc_snum > 1); + + /* Mark dst as dirty. */ + if ((rc = mdb_page_touch(cdst))) + return rc; + + /* Move all nodes from src to dst. 
+ */ + j = nkeys = NUMKEYS(pdst); + if (IS_LEAF2(psrc)) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = METADATA(psrc); + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); + if (rc != MDB_SUCCESS) + return rc; + key.mv_data = (char *)key.mv_data + key.mv_size; + } + } else { + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + srcnode = NODEPTR(psrc, i); + if (i == 0 && IS_BRANCH(psrc)) { + MDB_cursor mn; + MDB_node *s2; + mdb_cursor_copy(csrc, &mn); + /* must find the lowest key below src */ + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + key.mv_size = mn.mc_db->md_pad; + key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + } else { + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + } + + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); + if (rc != MDB_SUCCESS) + return rc; + } + } + + DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", + pdst->mp_pgno, NUMKEYS(pdst), + (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); + + /* Unlink the src page from parent and add to free list. + */ + csrc->mc_top--; + mdb_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + key.mv_size = 0; + rc = mdb_update_key(csrc, &key); + if (rc) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + psrc = csrc->mc_pg[csrc->mc_top]; + /* If not operating on FreeDB, allow this page to be reused + * in this txn. Otherwise just add to free list. + */ + rc = mdb_page_loose(csrc, psrc); + if (rc) + return rc; + if (IS_LEAF(psrc)) + csrc->mc_db->md_leaf_pages--; + else + csrc->mc_db->md_branch_pages--; + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_snum < csrc->mc_snum) continue; + if (m3->mc_pg[csrc->mc_top] == psrc) { + m3->mc_pg[csrc->mc_top] = pdst; + m3->mc_ki[csrc->mc_top] += nkeys; + } + } + } + { + unsigned int snum = cdst->mc_snum; + uint16_t depth = cdst->mc_db->md_depth; + mdb_cursor_pop(cdst); + rc = mdb_rebalance(cdst); + /* Did the tree shrink? */ + if (depth > cdst->mc_db->md_depth) + snum--; + cdst->mc_snum = snum; + cdst->mc_top = snum-1; + } + return rc; +} + +/** Copy the contents of a cursor. + * @param[in] csrc The cursor to copy from. + * @param[out] cdst The cursor to copy to. + */ +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) +{ + unsigned int i; + + cdst->mc_txn = csrc->mc_txn; + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; + + for (i=0; imc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } +} + +/** Rebalance the tree after a delete operation. + * @param[in] mc Cursor pointing to the page where rebalancing + * should begin. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_rebalance(MDB_cursor *mc) +{ + MDB_node *node; + int rc; + unsigned int ptop, minkeys; + MDB_cursor mn; + indx_t oldki; + + minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); + DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", + IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); + + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); + return MDB_SUCCESS; + } + + if (mc->mc_snum < 2) { + MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + DPUTS("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } + if (NUMKEYS(mp) == 0) { + DPUTS("tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mc->mc_db->md_leaf_pages = 0; + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + /* Adjust cursors pointing to mp */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; + } + } + } + } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + int i; + DPUTS("collapsing root page!"); + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); + rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); + if (rc) + return rc; + mc->mc_db->md_depth--; + mc->mc_db->md_branch_pages--; + mc->mc_ki[0] = mc->mc_ki[1]; + for (i = 1; imc_db->md_depth; i++) { + mc->mc_pg[i] = mc->mc_pg[i+1]; + mc->mc_ki[i] = mc->mc_ki[i+1]; + } + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum--; + m3->mc_top--; + for (i=0; imc_snum; i++) { + m3->mc_pg[i] = m3->mc_pg[i+1]; + m3->mc_ki[i] = m3->mc_ki[i+1]; + } + } + } + } + } else + DPUTS("root page doesn't need rebalancing"); + return MDB_SUCCESS; + } + + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. + */ + ptop = mc->mc_top-1; + mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); + + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. + */ + + /* Find neighbors. + */ + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + + oldki = mc->mc_ki[mc->mc_top]; + if (mc->mc_ki[ptop] == 0) { + /* We're the leftmost leaf in our parent. + */ + DPUTS("reading right neighbor"); + mn.mc_ki[ptop]++; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + } else { + /* There is at least one neighbor to the left. 
+ */ + DPUTS("reading left neighbor"); + mn.mc_ki[ptop]--; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; + mc->mc_ki[mc->mc_top] = 0; + } + + DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", + mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); + + /* If the neighbor page is above threshold and has enough keys, + * move one key from it. Otherwise we should try to merge them. + * (A branch page must never have less than 2 keys.) + */ + minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { + rc = mdb_node_move(&mn, mc); + if (mc->mc_ki[ptop]) { + oldki++; + } + } else { + if (mc->mc_ki[ptop] == 0) { + rc = mdb_page_merge(&mn, mc); + } else { + oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + rc = mdb_page_merge(mc, &mn); + mdb_cursor_copy(&mn, mc); + } + mc->mc_flags &= ~C_EOF; + } + mc->mc_ki[mc->mc_top] = oldki; + return rc; +} + +/** Complete a delete operation started by #mdb_cursor_del(). */ +static int +mdb_cursor_del0(MDB_cursor *mc) +{ + int rc; + MDB_page *mp; + indx_t ki; + unsigned int nkeys; + + ki = mc->mc_ki[mc->mc_top]; + mdb_node_del(mc, mc->mc_db->md_pad); + mc->mc_db->md_entries--; + rc = mdb_rebalance(mc); + + if (rc == MDB_SUCCESS) { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + + /* if mc points past last node in page, find next sibling */ + if (mc->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(mc, 1); + if (rc == MDB_NOTFOUND) { + mc->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + } + } + + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= ki) { + m3->mc_flags |= C_DEL; + if (m3->mc_ki[mc->mc_top] > ki) + m3->mc_ki[mc->mc_top]--; + else if (mc->mc_db->md_flags & MDB_DUPSORT) + m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF; + } + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + } + } + } + } + mc->mc_flags |= C_DEL; + } + + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_del(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data) +{ + if (!key || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? 
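/* Sketch of how the data argument to mdb_del() behaves in practice: on a
 * plain database the data pointer is ignored and the whole key is removed,
 * while on an MDB_DUPSORT database a non-NULL data deletes just that one
 * duplicate.  `env` and `dbi` are assumed to come from elsewhere; error
 * handling is abbreviated. */
static int
example_delete(MDB_env *env, MDB_dbi dbi, MDB_val *key, MDB_val *dup_or_null)
{
	MDB_txn *txn;
	int rc = mdb_txn_begin(env, NULL, 0, &txn);
	if (rc)
		return rc;
	rc = mdb_del(txn, dbi, key, dup_or_null);
	if (rc == MDB_SUCCESS)
		rc = mdb_txn_commit(txn);
	else
		mdb_txn_abort(txn);
	return rc;
}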
EACCES : MDB_BAD_TXN; + + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + /* must ignore any data */ + data = NULL; + } + + return mdb_del0(txn, dbi, key, data, 0); +} + +static int +mdb_del0(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned flags) +{ + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata, *xdata; + int rc, exact = 0; + DKBUF; + + DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); + + mdb_cursor_init(&mc, txn, dbi, &mx); + + if (data) { + op = MDB_GET_BOTH; + rdata = *data; + xdata = &rdata; + } else { + op = MDB_SET; + xdata = NULL; + flags |= MDB_NODUPDATA; + } + rc = mdb_cursor_set(&mc, key, xdata, op, &exact); + if (rc == 0) { + /* let mdb_page_split know about this cursor if needed: + * delete will trigger a rebalance; if it needs to move + * a node from one page to another, it will have to + * update the parent's separator key(s). If the new sepkey + * is larger than the current one, the parent page may + * run out of space, triggering a split. We need this + * cursor to be consistent until the end of the rebalance. + */ + mc.mc_flags |= C_UNTRACK; + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_del(&mc, flags); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; +} + +/** Split a page and insert a new node. + * @param[in,out] mc Cursor pointing to the page and desired insertion index. + * The cursor will be updated to point to the actual page and index where + * the node got inserted after the split. + * @param[in] newkey The key for the newly inserted node. + * @param[in] newdata The data for the newly inserted node. + * @param[in] newpgno The page number, if the new node is a branch node. + * @param[in] nflags The #NODE_ADD_FLAGS for the new node. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, + unsigned int nflags) +{ + unsigned int flags; + int rc = MDB_SUCCESS, new_root = 0, did_split = 0; + indx_t newindx; + pgno_t pgno = 0; + int i, j, split_indx, nkeys, pmax; + MDB_env *env = mc->mc_txn->mt_env; + MDB_node *node; + MDB_val sepkey, rkey, xdata, *rdata = &xdata; + MDB_page *copy = NULL; + MDB_page *mp, *rp, *pp; + int ptop; + MDB_cursor mn; + DKBUF; + + mp = mc->mc_pg[mc->mc_top]; + newindx = mc->mc_ki[mc->mc_top]; + nkeys = NUMKEYS(mp); + + DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, + DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); + + /* Create a right sibling. */ + if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) + return rc; + DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); + + if (mc->mc_snum < 2) { + if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) + goto done; + /* shift current top to make room for new parent */ + mc->mc_pg[1] = mc->mc_pg[0]; + mc->mc_ki[1] = mc->mc_ki[0]; + mc->mc_pg[0] = pp; + mc->mc_ki[0] = 0; + mc->mc_db->md_root = pp->mp_pgno; + DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); + mc->mc_db->md_depth++; + new_root = 1; + + /* Add left (implicit) pointer. 
*/ + if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { + /* undo the pre-push */ + mc->mc_pg[0] = mc->mc_pg[1]; + mc->mc_ki[0] = mc->mc_ki[1]; + mc->mc_db->md_root = mp->mp_pgno; + mc->mc_db->md_depth--; + goto done; + } + mc->mc_snum = 2; + mc->mc_top = 1; + ptop = 0; + } else { + ptop = mc->mc_top-1; + DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); + } + + mc->mc_flags |= C_SPLITTING; + mdb_cursor_copy(mc, &mn); + mn.mc_pg[mn.mc_top] = rp; + mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; + + if (nflags & MDB_APPEND) { + mn.mc_ki[mn.mc_top] = 0; + sepkey = *newkey; + split_indx = newindx; + nkeys = 0; + } else { + + split_indx = (nkeys+1) / 2; + + if (IS_LEAF2(rp)) { + char *split, *ins; + int x; + unsigned int lsize, rsize, ksize; + /* Move half of the keys to the right sibling */ + x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_pad; + split = LEAF2KEY(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mp->mp_lower -= lsize; + rp->mp_lower += lsize; + mp->mp_upper += rsize - lsize; + rp->mp_upper -= rsize - lsize; + sepkey.mv_size = ksize; + if (newindx == split_indx) { + sepkey.mv_data = newkey->mv_data; + } else { + sepkey.mv_data = split; + } + if (x<0) { + ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(rp->mp_ptrs, split, rsize); + sepkey.mv_data = rp->mp_ptrs; + memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->mv_data, ksize); + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + } else { + if (x) + memcpy(rp->mp_ptrs, split, x * ksize); + ins = LEAF2KEY(rp, x, ksize); + memcpy(ins, newkey->mv_data, ksize); + memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); + rp->mp_lower += sizeof(indx_t); + rp->mp_upper -= ksize - sizeof(indx_t); + mc->mc_ki[mc->mc_top] = x; + mc->mc_pg[mc->mc_top] = rp; + } + } else { + int psize, nsize, k; + /* Maximum free space in an empty page */ + pmax = env->me_psize - PAGEHDRSZ; + if (IS_LEAF(mp)) + nsize = mdb_leaf_size(env, newkey, newdata); + else + nsize = mdb_branch_size(env, newkey); + nsize = EVEN(nsize); + + /* grab a page to hold a temporary copy */ + copy = mdb_page_malloc(mc->mc_txn, 1); + if (copy == NULL) { + rc = ENOMEM; + goto done; + } + copy->mp_pgno = mp->mp_pgno; + copy->mp_flags = mp->mp_flags; + copy->mp_lower = (PAGEHDRSZ-PAGEBASE); + copy->mp_upper = env->me_psize - PAGEBASE; + + /* prepare to insert */ + for (i=0, j=0; imp_ptrs[j++] = 0; + } + copy->mp_ptrs[j++] = mp->mp_ptrs[i]; + } + + /* When items are relatively large the split point needs + * to be checked, because being off-by-one will make the + * difference between success or failure in mdb_node_add. + * + * It's also relevant if a page happens to be laid out + * such that one half of its nodes are all "small" and + * the other half of its nodes are "large." If the new + * item is also "large" and falls on the half with + * "large" nodes, it also may not fit. + * + * As a final tweak, if the new item goes on the last + * spot on the page (and thus, onto the new page), bias + * the split so the new page is emptier than the old page. + * This yields better packing during sequential inserts. + */ + if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { + /* Find split point */ + psize = 0; + if (newindx <= split_indx || newindx >= nkeys) { + i = 0; j = 1; + k = newindx >= nkeys ? 
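/* The scan below accumulates per-node sizes from one end of the page until
 * the running total would no longer fit, and splits there.  A stripped-down
 * model of that idea (fixed direction, illustrative names only, without the
 * vendored append/new-index biases): */
static unsigned
toy_split_index(const unsigned *node_sizes, unsigned nkeys, unsigned page_capacity)
{
	unsigned i, used = 0;
	for (i = 0; i < nkeys; i++) {
		used += node_sizes[i];
		if (used > page_capacity)
			break;            /* node i no longer fits on the left page */
	}
	return i;                 /* nodes [i..nkeys) move to the right sibling */
}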
nkeys : split_indx+2; + } else { + i = nkeys; j = -1; + k = split_indx-1; + } + for (; i!=k; i+=j) { + if (i == newindx) { + psize += nsize; + node = NULL; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + } + psize = EVEN(psize); + } + if (psize > pmax || i == k-j) { + split_indx = i + (j<0); + break; + } + } + } + if (split_indx == newindx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } + } + } + + DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); + + /* Copy separator key to the parent. + */ + if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { + mn.mc_snum--; + mn.mc_top--; + did_split = 1; + rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); + if (rc) + goto done; + + /* root split? */ + if (mn.mc_snum == mc->mc_snum) { + mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; + mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; + mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; + mc->mc_snum++; + mc->mc_top++; + ptop++; + } + /* Right page might now have changed parent. + * Check if left page also changed parent. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; imc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + if (mn.mc_ki[ptop]) { + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } else { + /* find right page's left sibling */ + mc->mc_ki[ptop] = mn.mc_ki[ptop]; + mdb_cursor_sibling(mc, 0); + } + } + } else { + mn.mc_top--; + rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); + mn.mc_top++; + } + mc->mc_flags ^= C_SPLITTING; + if (rc != MDB_SUCCESS) { + goto done; + } + if (nflags & MDB_APPEND) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[mc->mc_top] = 0; + rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); + if (rc) + goto done; + for (i=0; imc_top; i++) + mc->mc_ki[i] = mn.mc_ki[i]; + } else if (!IS_LEAF2(mp)) { + /* Move nodes */ + mc->mc_pg[mc->mc_top] = rp; + i = split_indx; + j = 0; + do { + if (i == newindx) { + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(mp)) { + rdata = newdata; + } else + pgno = newpgno; + flags = nflags; + /* Update index for the new key. */ + mc->mc_ki[mc->mc_top] = j; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(mp)) { + xdata.mv_data = NODEDATA(node); + xdata.mv_size = NODEDSZ(node); + rdata = &xdata; + } else + pgno = NODEPGNO(node); + flags = node->mn_flags; + } + + if (!IS_LEAF(mp) && j == 0) { + /* First branch index doesn't need key data. 
*/ + rkey.mv_size = 0; + } + + rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); + if (rc) + goto done; + if (i == nkeys) { + i = 0; + j = 0; + mc->mc_pg[mc->mc_top] = copy; + } else { + i++; + j++; + } + } while (i != split_indx); + + nkeys = NUMKEYS(copy); + for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; + mp->mp_lower = copy->mp_lower; + mp->mp_upper = copy->mp_upper; + memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), + env->me_psize - copy->mp_upper - PAGEBASE); + + /* reset back to original page */ + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + if (nflags & MDB_RESERVE) { + node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + } + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + int fixup = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_flags & C_SPLITTING) + continue; + if (new_root) { + int k; + /* root split */ + for (k=m3->mc_top; k>=0; k--) { + m3->mc_ki[k+1] = m3->mc_ki[k]; + m3->mc_pg[k+1] = m3->mc_pg[k]; + } + if (m3->mc_ki[0] >= split_indx) { + m3->mc_ki[0] = 1; + } else { + m3->mc_ki[0] = 0; + } + m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum++; + m3->mc_top++; + } + if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) + m3->mc_ki[mc->mc_top]++; + if (m3->mc_ki[mc->mc_top] >= fixup) { + m3->mc_pg[mc->mc_top] = rp; + m3->mc_ki[mc->mc_top] -= fixup; + m3->mc_ki[ptop] = mn.mc_ki[ptop]; + } + } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && + m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { + m3->mc_ki[ptop]++; + } + } + } + DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); + +done: + if (copy) /* tmp page */ + mdb_page_free(env, copy); + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_put(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned int flags) +{ + MDB_cursor mc; + MDB_xcursor mx; + + if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) + return EINVAL; + + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_put(&mc, key, data, flags); +} + +#ifndef MDB_WBUF +#define MDB_WBUF (1024*1024) +#endif + + /** State needed for a compacting copy. */ +typedef struct mdb_copy { + pthread_mutex_t mc_mutex; + pthread_cond_t mc_cond; + char *mc_wbuf[2]; + char *mc_over[2]; + MDB_env *mc_env; + MDB_txn *mc_txn; + int mc_wlen[2]; + int mc_olen[2]; + pgno_t mc_next_pgno; + HANDLE mc_fd; + int mc_status; + volatile int mc_new; + int mc_toggle; + +} mdb_copy; + + /** Dedicated writer thread for compacting copy. 
*/ +static THREAD_RET ESECT +mdb_env_copythr(void *arg) +{ + mdb_copy *my = arg; + char *ptr; + int toggle = 0, wsize, rc; +#ifdef _WIN32 + DWORD len; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + int len; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#endif + + pthread_mutex_lock(&my->mc_mutex); + my->mc_new = 0; + pthread_cond_signal(&my->mc_cond); + for(;;) { + while (!my->mc_new) + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + if (my->mc_new < 0) { + my->mc_new = 0; + break; + } + my->mc_new = 0; + wsize = my->mc_wlen[toggle]; + ptr = my->mc_wbuf[toggle]; +again: + while (wsize > 0) { + DO_WRITE(rc, my->mc_fd, ptr, wsize, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + if (rc) { + my->mc_status = rc; + break; + } + /* If there's an overflow page tail, write it too */ + if (my->mc_olen[toggle]) { + wsize = my->mc_olen[toggle]; + ptr = my->mc_over[toggle]; + my->mc_olen[toggle] = 0; + goto again; + } + my->mc_wlen[toggle] = 0; + toggle ^= 1; + pthread_cond_signal(&my->mc_cond); + } + pthread_cond_signal(&my->mc_cond); + pthread_mutex_unlock(&my->mc_mutex); + return (THREAD_RET)0; +#undef DO_WRITE +} + + /** Tell the writer thread there's a buffer ready to write */ +static int ESECT +mdb_env_cthr_toggle(mdb_copy *my, int st) +{ + int toggle = my->mc_toggle ^ 1; + pthread_mutex_lock(&my->mc_mutex); + if (my->mc_status) { + pthread_mutex_unlock(&my->mc_mutex); + return my->mc_status; + } + while (my->mc_new == 1) + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + my->mc_new = st; + my->mc_toggle = toggle; + pthread_cond_signal(&my->mc_cond); + pthread_mutex_unlock(&my->mc_mutex); + return 0; +} + + /** Depth-first tree traversal for compacting copy. */ +static int ESECT +mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) +{ + MDB_cursor mc; + MDB_txn *txn = my->mc_txn; + MDB_node *ni; + MDB_page *mo, *mp, *leaf; + char *buf, *ptr; + int rc, toggle; + unsigned int i; + + /* Empty DB, nothing to do */ + if (*pg == P_INVALID) + return MDB_SUCCESS; + + mc.mc_snum = 1; + mc.mc_top = 0; + mc.mc_txn = txn; + + rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL); + if (rc) + return rc; + rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); + if (rc) + return rc; + + /* Make cursor pages writable */ + buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); + if (buf == NULL) + return ENOMEM; + + for (i=0; imc_env->me_psize); + mc.mc_pg[i] = (MDB_page *)ptr; + ptr += my->mc_env->me_psize; + } + + /* This is writable space for a leaf page. Usually not needed. 
*/ + leaf = (MDB_page *)ptr; + + toggle = my->mc_toggle; + while (mc.mc_snum > 0) { + unsigned n; + mp = mc.mc_pg[mc.mc_top]; + n = NUMKEYS(mp); + + if (IS_LEAF(mp)) { + if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdb_page_get(txn, pg, &omp, NULL); + if (rc) + goto done; + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + memcpy(mo, omp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno; + my->mc_next_pgno += omp->mp_pages; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (omp->mp_pages > 1) { + my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); + my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t)); + } else if (ni->mn_flags & F_SUBDATA) { + MDB_db db; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&db, NODEDATA(ni), sizeof(db)); + my->mc_toggle = toggle; + rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); + if (rc) + goto done; + toggle = my->mc_toggle; + memcpy(NODEDATA(ni), &db, sizeof(db)); + } + } + } + } else { + mc.mc_ki[mc.mc_top]++; + if (mc.mc_ki[mc.mc_top] < n) { + pgno_t pg; +again: + ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); + pg = NODEPGNO(ni); + rc = mdb_page_get(txn, pg, &mp, NULL); + if (rc) + goto done; + mc.mc_top++; + mc.mc_snum++; + mc.mc_ki[mc.mc_top] = 0; + if (IS_BRANCH(mp)) { + /* Whenever we advance to a sibling branch page, + * we must proceed all the way down to its first leaf. + */ + mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); + goto again; + } else + mc.mc_pg[mc.mc_top] = mp; + continue; + } + } + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + mdb_page_copy(mo, mp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno++; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (mc.mc_top) { + /* Update parent if there is one */ + ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); + SETPGNO(ni, mo->mp_pgno); + mdb_cursor_pop(&mc); + } else { + /* Otherwise we're done */ + *pg = mo->mp_pgno; + break; + } + } +done: + free(buf); + return rc; +} + + /** Copy environment with compaction. 
*/ +static int ESECT +mdb_env_copyfd1(MDB_env *env, HANDLE fd) +{ + MDB_meta *mm; + MDB_page *mp; + mdb_copy my; + MDB_txn *txn = NULL; + pthread_t thr; + int rc; + +#ifdef _WIN32 + my.mc_mutex = CreateMutex(NULL, FALSE, NULL); + my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL); + my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); + if (my.mc_wbuf[0] == NULL) + return errno; +#else + pthread_mutex_init(&my.mc_mutex, NULL); + pthread_cond_init(&my.mc_cond, NULL); +#ifdef HAVE_MEMALIGN + my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); + if (my.mc_wbuf[0] == NULL) + return errno; +#else + rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2); + if (rc) + return rc; +#endif +#endif + memset(my.mc_wbuf[0], 0, MDB_WBUF*2); + my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; + my.mc_wlen[0] = 0; + my.mc_wlen[1] = 0; + my.mc_olen[0] = 0; + my.mc_olen[1] = 0; + my.mc_next_pgno = 2; + my.mc_status = 0; + my.mc_new = 1; + my.mc_toggle = 0; + my.mc_env = env; + my.mc_fd = fd; + THREAD_CREATE(thr, mdb_env_copythr, &my); + + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + mp = (MDB_page *)my.mc_wbuf[0]; + memset(mp, 0, 2*env->me_psize); + mp->mp_pgno = 0; + mp->mp_flags = P_META; + mm = (MDB_meta *)METADATA(mp); + mdb_env_init_meta0(env, mm); + mm->mm_address = env->me_metas[0]->mm_address; + + mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); + mp->mp_pgno = 1; + mp->mp_flags = P_META; + *(MDB_meta *)METADATA(mp) = *mm; + mm = (MDB_meta *)METADATA(mp); + + /* Count the number of free pages, subtract from lastpg to find + * number of active pages + */ + { + MDB_ID freecount = 0; + MDB_cursor mc; + MDB_val key, data; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + freecount += txn->mt_dbs[0].md_branch_pages + + txn->mt_dbs[0].md_leaf_pages + + txn->mt_dbs[0].md_overflow_pages; + + /* Set metapage 1 */ + mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; + mm->mm_dbs[1] = txn->mt_dbs[1]; + mm->mm_dbs[1].md_root = mm->mm_last_pg; + mm->mm_txnid = 1; + } + my.mc_wlen[0] = env->me_psize * 2; + my.mc_txn = txn; + pthread_mutex_lock(&my.mc_mutex); + while(my.mc_new) + pthread_cond_wait(&my.mc_cond, &my.mc_mutex); + pthread_mutex_unlock(&my.mc_mutex); + rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0); + if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) + rc = mdb_env_cthr_toggle(&my, 1); + mdb_env_cthr_toggle(&my, -1); + pthread_mutex_lock(&my.mc_mutex); + while(my.mc_new) + pthread_cond_wait(&my.mc_cond, &my.mc_mutex); + pthread_mutex_unlock(&my.mc_mutex); + THREAD_FINISH(thr); + + mdb_txn_abort(txn); +#ifdef _WIN32 + CloseHandle(my.mc_cond); + CloseHandle(my.mc_mutex); + _aligned_free(my.mc_wbuf[0]); +#else + pthread_cond_destroy(&my.mc_cond); + pthread_mutex_destroy(&my.mc_mutex); + free(my.mc_wbuf[0]); +#endif + return rc; +} + + /** Copy environment as-is. */ +static int ESECT +mdb_env_copyfd0(MDB_env *env, HANDLE fd) +{ + MDB_txn *txn = NULL; + int rc; + size_t wsize; + char *ptr; +#ifdef _WIN32 + DWORD len, w2; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + ssize_t len; + size_t w2; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#endif + + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. 
+ */ + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + if (env->me_txns) { + /* We must start the actual read txn after blocking writers */ + mdb_txn_reset0(txn, "reset-stage1"); + + /* Temporarily block writers until we snapshot the meta pages */ + LOCK_MUTEX_W(env); + + rc = mdb_txn_renew0(txn); + if (rc) { + UNLOCK_MUTEX_W(env); + goto leave; + } + } + + wsize = env->me_psize * 2; + ptr = env->me_map; + w2 = wsize; + while (w2 > 0) { + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + /* Non-blocking or async handles are not supported */ + rc = EIO; + break; + } + } + if (env->me_txns) + UNLOCK_MUTEX_W(env); + + if (rc) + goto leave; + + w2 = txn->mt_next_pgno * env->me_psize; +#ifdef WIN32 + { + LARGE_INTEGER fsize; + GetFileSizeEx(env->me_fd, &fsize); + if (w2 > fsize.QuadPart) + w2 = fsize.QuadPart; + } +#else + { + struct stat st; + fstat(env->me_fd, &st); + if (w2 > (size_t)st.st_size) + w2 = st.st_size; + } +#endif + wsize = w2 - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + +leave: + mdb_txn_abort(txn); + return rc; +} + +int ESECT +mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) +{ + if (flags & MDB_CP_COMPACT) + return mdb_env_copyfd1(env, fd); + else + return mdb_env_copyfd0(env, fd); +} + +int ESECT +mdb_env_copyfd(MDB_env *env, HANDLE fd) +{ + return mdb_env_copyfd2(env, fd, 0); +} + +int ESECT +mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) +{ + int rc, len; + char *lpath; + HANDLE newfd = INVALID_HANDLE_VALUE; + + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); + } + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. 
+ */ +#ifdef _WIN32 + newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); +#else + newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); +#endif + if (newfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + + if (env->me_psize >= env->me_os_psize) { +#ifdef O_DIRECT + /* Set O_DIRECT if the file system supports it */ + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); +#endif +#ifdef F_NOCACHE /* __APPLE__ */ + rc = fcntl(newfd, F_NOCACHE, 1); + if (rc) { + rc = ErrCode(); + goto leave; + } +#endif + } + + rc = mdb_env_copyfd2(env, newfd, flags); + +leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd != INVALID_HANDLE_VALUE) + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = ErrCode(); + + return rc; +} + +int ESECT +mdb_env_copy(MDB_env *env, const char *path) +{ + return mdb_env_copy2(env, path, 0); +} + +int ESECT +mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) +{ + if ((flag & CHANGEABLE) != flag) + return EINVAL; + if (onoff) + env->me_flags |= flag; + else + env->me_flags &= ~flag; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_flags(MDB_env *env, unsigned int *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_flags; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_userctx(MDB_env *env, void *ctx) +{ + if (!env) + return EINVAL; + env->me_userctx = ctx; + return MDB_SUCCESS; +} + +void * ESECT +mdb_env_get_userctx(MDB_env *env) +{ + return env ? env->me_userctx : NULL; +} + +int ESECT +mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) +{ + if (!env) + return EINVAL; +#ifndef NDEBUG + env->me_assert_func = func; +#endif + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_path(MDB_env *env, const char **arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_path; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_fd; + return MDB_SUCCESS; +} + +/** Common code for #mdb_stat() and #mdb_env_stat(). + * @param[in] env the environment to operate in. + * @param[in] db the #MDB_db record containing the stats to return. + * @param[out] arg the address of an #MDB_stat structure to receive the stats. + * @return 0, this function always succeeds. + */ +static int ESECT +mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) +{ + arg->ms_psize = env->me_psize; + arg->ms_depth = db->md_depth; + arg->ms_branch_pages = db->md_branch_pages; + arg->ms_leaf_pages = db->md_leaf_pages; + arg->ms_overflow_pages = db->md_overflow_pages; + arg->ms_entries = db->md_entries; + + return MDB_SUCCESS; +} + +int ESECT +mdb_env_stat(MDB_env *env, MDB_stat *arg) +{ + int toggle; + + if (env == NULL || arg == NULL) + return EINVAL; + + toggle = mdb_env_pick_meta(env); + + return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg); +} + +int ESECT +mdb_env_info(MDB_env *env, MDB_envinfo *arg) +{ + int toggle; + + if (env == NULL || arg == NULL) + return EINVAL; + + toggle = mdb_env_pick_meta(env); + arg->me_mapaddr = env->me_metas[toggle]->mm_address; + arg->me_mapsize = env->me_mapsize; + arg->me_maxreaders = env->me_maxreaders; + + /* me_numreaders may be zero if this process never used any readers. Use + * the shared numreader count if it exists. + */ + arg->me_numreaders = env->me_txns ? 
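/* Usage sketch for the copy and stat entry points above: take a compacting
 * backup, then report a few figures.  Assumes an open `env` and a writable
 * destination directory path supplied by the caller; error handling is
 * abbreviated. */
static int
example_backup_and_stat(MDB_env *env, const char *backup_dir)
{
	MDB_stat st;
	MDB_envinfo info;
	int rc = mdb_env_copy2(env, backup_dir, MDB_CP_COMPACT); /* omits free pages */
	if (rc)
		return rc;
	rc = mdb_env_stat(env, &st);
	if (rc == MDB_SUCCESS)
		rc = mdb_env_info(env, &info);
	if (rc == MDB_SUCCESS)
		printf("depth %u, %lu entries, last page %lu\n",
			st.ms_depth, (unsigned long)st.ms_entries,
			(unsigned long)info.me_last_pgno);
	return rc;
}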
env->me_txns->mti_numreaders : env->me_numreaders; + + arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; + arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; + return MDB_SUCCESS; +} + +/** Set the default comparison functions for a database. + * Called immediately after a database is opened to set the defaults. + * The user can then override them with #mdb_set_compare() or + * #mdb_set_dupsort(). + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + */ +static void +mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) +{ + uint16_t f = txn->mt_dbs[dbi].md_flags; + + txn->mt_dbxs[dbi].md_cmp = + (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : + (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn; + + txn->mt_dbxs[dbi].md_dcmp = + !(f & MDB_DUPSORT) ? 0 : + ((f & MDB_INTEGERDUP) + ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) + : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); +} + +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) +{ + MDB_val key, data; + MDB_dbi i; + MDB_cursor mc; + MDB_db dummy; + int rc, dbflag, exact; + unsigned int unused = 0, seq; + size_t len; + + if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, FREE_DBI); + } + + if ((flags & VALID_FLAGS) != flags) + return EINVAL; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + /* main DB? */ + if (!name) { + *dbi = MAIN_DBI; + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + /* make sure flag changes get committed */ + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } + } + mdb_default_cmp(txn, MAIN_DBI); + return MDB_SUCCESS; + } + + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, MAIN_DBI); + } + + /* Is the DB already open? */ + len = strlen(name); + for (i=2; imt_numdbs; i++) { + if (!txn->mt_dbxs[i].md_name.mv_size) { + /* Remember this free slot */ + if (!unused) unused = i; + continue; + } + if (len == txn->mt_dbxs[i].md_name.mv_size && + !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + *dbi = i; + return MDB_SUCCESS; + } + } + + /* If no free slot and max hit, fail */ + if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) + return MDB_DBS_FULL; + + /* Cannot mix named databases with some mainDB flags */ + if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) + return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; + + /* Find the DB info */ + dbflag = DB_NEW|DB_VALID; + exact = 0; + key.mv_size = len; + key.mv_data = (void *)name; + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (rc == MDB_SUCCESS) { + /* make sure this is actually a DB */ + MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (!(node->mn_flags & F_SUBDATA)) + return MDB_INCOMPATIBLE; + } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { + /* Create if requested */ + data.mv_size = sizeof(MDB_db); + data.mv_data = &dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.md_root = P_INVALID; + dummy.md_flags = flags & PERSISTENT_FLAGS; + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); + dbflag |= DB_DIRTY; + } + + /* OK, got info, add to table */ + if (rc == MDB_SUCCESS) { + unsigned int slot = unused ? 
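/* Usage sketch for mdb_dbi_open() above: named databases need a maxdbs
 * budget before the environment is opened, and the handle is obtained
 * inside a transaction.  The path and database name here are placeholders;
 * error handling is abbreviated. */
static int
example_open_named_db(const char *path, MDB_env **envp, MDB_dbi *dbip)
{
	MDB_env *env;
	MDB_txn *txn;
	int rc = mdb_env_create(&env);
	if (rc)
		return rc;
	mdb_env_set_maxdbs(env, 4);                 /* room for a few named DBs */
	rc = mdb_env_open(env, path, 0, 0664);
	if (rc == MDB_SUCCESS)
		rc = mdb_txn_begin(env, NULL, 0, &txn);
	if (rc == MDB_SUCCESS) {
		rc = mdb_dbi_open(txn, "example", MDB_CREATE, dbip);
		if (rc == MDB_SUCCESS)
			rc = mdb_txn_commit(txn);       /* persists the new DB record */
		else
			mdb_txn_abort(txn);
	}
	if (rc)
		mdb_env_close(env);
	else
		*envp = env;
	return rc;
}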
unused : txn->mt_numdbs; + txn->mt_dbxs[slot].md_name.mv_data = strdup(name); + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_rel = NULL; + txn->mt_dbflags[slot] = dbflag; + /* txn-> and env-> are the same in read txns, use + * tmp variable to avoid undefined assignment + */ + seq = ++txn->mt_env->me_dbiseqs[slot]; + txn->mt_dbiseqs[slot] = seq; + + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + *dbi = slot; + mdb_default_cmp(txn, slot); + if (!unused) { + txn->mt_numdbs++; + } + } + + return rc; +} + +int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) +{ + if (!arg || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + if (txn->mt_dbflags[dbi] & DB_STALE) { + MDB_cursor mc; + MDB_xcursor mx; + /* Stale, must read the DB's root. cursor_init does it for us. */ + mdb_cursor_init(&mc, txn, dbi, &mx); + } + return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); +} + +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) +{ + char *ptr; + if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) + return; + ptr = env->me_dbxs[dbi].md_name.mv_data; + /* If there was no name, this was already closed */ + if (ptr) { + env->me_dbxs[dbi].md_name.mv_data = NULL; + env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbflags[dbi] = 0; + env->me_dbiseqs[dbi]++; + free(ptr); + } +} + +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) +{ + /* We could return the flags for the FREE_DBI too but what's the point? */ + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + return MDB_SUCCESS; +} + +/** Add all the DB's pages to the free list. + * @param[in] mc Cursor on the DB to free. + * @param[in] subs non-Zero to check for sub-DBs in this DB. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_drop0(MDB_cursor *mc, int subs) +{ + int rc; + + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc == MDB_SUCCESS) { + MDB_txn *txn = mc->mc_txn; + MDB_node *ni; + MDB_cursor mx; + unsigned int i; + + /* LEAF2 pages have no nodes, cannot have sub-DBs */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) + mdb_cursor_pop(mc); + + mdb_cursor_copy(mc, &mx); + while (mc->mc_snum > 0) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + unsigned n = NUMKEYS(mp); + if (IS_LEAF(mp)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdb_page_get(txn, pg, &omp, NULL); + if (rc != 0) + goto done; + mdb_cassert(mc, IS_OVERFLOW(omp)); + rc = mdb_midl_append_range(&txn->mt_free_pgs, + pg, omp->mp_pages); + if (rc) + goto done; + } else if (subs && (ni->mn_flags & F_SUBDATA)) { + mdb_xcursor_init1(mc, ni); + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + goto done; + } + } + } else { + if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) + goto done; + for (i=0; imt_free_pgs, pg); + } + } + if (!mc->mc_top) + break; + mc->mc_ki[mc->mc_top] = i; + rc = mdb_cursor_sibling(mc, 1); + if (rc) { + if (rc != MDB_NOTFOUND) + goto done; + /* no more siblings, go back to beginning + * of previous level. 
+ */ + mdb_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (i=1; imc_snum; i++) { + mc->mc_ki[i] = 0; + mc->mc_pg[i] = mx.mc_pg[i]; + } + } + } + /* free it */ + rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); +done: + if (rc) + txn->mt_flags |= MDB_TXN_ERROR; + } else if (rc == MDB_NOTFOUND) { + rc = MDB_SUCCESS; + } + return rc; +} + +int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) +{ + MDB_cursor *mc, *m2; + int rc; + + if ((unsigned)del > 1 || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; + + if (dbi > MAIN_DBI && TXN_DBI_CHANGED(txn, dbi)) + return MDB_BAD_DBI; + + rc = mdb_cursor_open(txn, dbi, &mc); + if (rc) + return rc; + + rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + /* Invalidate the dropped DB's cursors */ + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); + if (rc) + goto leave; + + /* Can't delete the main DB */ + if (del && dbi > MAIN_DBI) { + rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, 0); + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; + mdb_dbi_close(txn->mt_env, dbi); + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + } else { + /* reset the DB record, mark it dirty */ + txn->mt_dbflags[dbi] |= DB_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; + + txn->mt_flags |= MDB_TXN_DIRTY; + } +leave: + mdb_cursor_close(mc); + return rc; +} + +int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) +{ + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + txn->mt_dbxs[dbi].md_cmp = cmp; + return MDB_SUCCESS; +} + +int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) +{ + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + txn->mt_dbxs[dbi].md_dcmp = cmp; + return MDB_SUCCESS; +} + +int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) +{ + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + txn->mt_dbxs[dbi].md_rel = rel; + return MDB_SUCCESS; +} + +int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) +{ + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) + return EINVAL; + + txn->mt_dbxs[dbi].md_relctx = ctx; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_maxkeysize(MDB_env *env) +{ + return ENV_MAXKEY(env); +} + +int ESECT +mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) +{ + unsigned int i, rdrs; + MDB_reader *mr; + char buf[64]; + int rc = 0, first = 1; + + if (!env || !func) + return -1; + if (!env->me_txns) { + return func("(no reader locks)\n", ctx); + } + rdrs = env->me_txns->mti_numreaders; + mr = env->me_txns->mti_readers; + for (i=0; i> 1; + cursor = base + pivot + 1; + val = pid - ids[cursor]; + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + /* found, so it's a duplicate */ + return -1; + } + } + + if( val > 0 ) { + ++cursor; + } + ids[0]++; + for (n = ids[0]; n > cursor; n--) + ids[n] = ids[n-1]; + ids[n] = pid; + return 0; +} + +int ESECT +mdb_reader_check(MDB_env *env, int *dead) +{ + unsigned int i, j, rdrs; + MDB_reader *mr; + MDB_PID_T *pids, pid; + int count = 0; + + if (!env) + return EINVAL; + if (dead) + *dead = 0; + if (!env->me_txns) + return MDB_SUCCESS; + rdrs = env->me_txns->mti_numreaders; + pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); + if 
(!pids) + return ENOMEM; + pids[0] = 0; + mr = env->me_txns->mti_readers; + for (i=0; ime_pid) { + pid = mr[i].mr_pid; + if (mdb_pid_insert(pids, pid) == 0) { + if (!mdb_reader_pid(env, Pidcheck, pid)) { + LOCK_MUTEX_R(env); + /* Recheck, a new process may have reused pid */ + if (!mdb_reader_pid(env, Pidcheck, pid)) { + for (j=i; j. + * + * Copyright 2000-2014 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include +#include +#include +#include +#include +#include "midl.h" + +/** @defgroup internal LMDB Internals + * @{ + */ +/** @defgroup idls ID List Management + * @{ + */ +#define CMP(x,y) ( (x) < (y) ? -1 : (x) > (y) ) + +unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( ids[cursor], id ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +#if 0 /* superseded by append/sort */ +int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) +{ + unsigned x, i; + + x = mdb_midl_search( ids, id ); + assert( x > 0 ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0] && ids[x] == id ) { + /* duplicate */ + assert(0); + return -1; + } + + if ( ++ids[0] >= MDB_IDL_DB_MAX ) { + /* no room */ + --ids[0]; + return -2; + + } else { + /* insert id */ + for (i=ids[0]; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = id; + } + + return 0; +} +#endif + +MDB_IDL mdb_midl_alloc(int num) +{ + MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID)); + if (ids) { + *ids++ = num; + *ids = 0; + } + return ids; +} + +void mdb_midl_free(MDB_IDL ids) +{ + if (ids) + free(ids-1); +} + +int mdb_midl_shrink( MDB_IDL *idp ) +{ + MDB_IDL ids = *idp; + if (*(--ids) > MDB_IDL_UM_MAX && + (ids = realloc(ids, (MDB_IDL_UM_MAX+1) * sizeof(MDB_ID)))) + { + *ids++ = MDB_IDL_UM_MAX; + *idp = ids; + return 1; + } + return 0; +} + +static int mdb_midl_grow( MDB_IDL *idp, int num ) +{ + MDB_IDL idn = *idp-1; + /* grow it */ + idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); + if (!idn) + return ENOMEM; + *idn++ += num; + *idp = idn; + return 0; +} + +int mdb_midl_need( MDB_IDL *idp, unsigned num ) +{ + MDB_IDL ids = *idp; + num += ids[0]; + if (num > ids[-1]) { + num = (num + num/4 + (256 + 2)) & -256; + if (!(ids = realloc(ids-1, num * sizeof(MDB_ID)))) + return ENOMEM; + *ids++ = num - 2; + *idp = ids; + } + return 0; +} + +int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) +{ + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] >= ids[-1]) { + if (mdb_midl_grow(idp, MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0]++; + ids[ids[0]] = id; + return 0; +} + +int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ) +{ + MDB_IDL ids = *idp; + /* Too big? 
*/ + if (ids[0] + app[0] >= ids[-1]) { + if (mdb_midl_grow(idp, app[0])) + return ENOMEM; + ids = *idp; + } + memcpy(&ids[ids[0]+1], &app[1], app[0] * sizeof(MDB_ID)); + ids[0] += app[0]; + return 0; +} + +int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ) +{ + MDB_ID *ids = *idp, len = ids[0]; + /* Too big? */ + if (len + n > ids[-1]) { + if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0] = len + n; + ids += len; + while (n) + ids[n--] = id++; + return 0; +} + +void mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge ) +{ + MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i+j, total = k; + idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ + old_id = idl[j]; + while (i) { + merge_id = merge[i--]; + for (; old_id < merge_id; old_id = idl[--j]) + idl[k--] = old_id; + idl[k--] = merge_id; + } + idl[0] = total; +} + +/* Quicksort + Insertion sort for small arrays */ + +#define SMALL 8 +#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } + +void +mdb_midl_sort( MDB_IDL ids ) +{ + /* Max possible depth of int-indexed tree * 2 items/level */ + int istack[sizeof(int)*CHAR_BIT * 2]; + int i,j,k,l,ir,jstack; + MDB_ID a, itmp; + + ir = (int)ids[0]; + l = 1; + jstack = 0; + for(;;) { + if (ir - l < SMALL) { /* Insertion sort */ + for (j=l+1;j<=ir;j++) { + a = ids[j]; + for (i=j-1;i>=1;i--) { + if (ids[i] >= a) break; + ids[i+1] = ids[i]; + } + ids[i+1] = a; + } + if (jstack == 0) break; + ir = istack[jstack--]; + l = istack[jstack--]; + } else { + k = (l + ir) >> 1; /* Choose median of left, center, right */ + MIDL_SWAP(ids[k], ids[l+1]); + if (ids[l] < ids[ir]) { + MIDL_SWAP(ids[l], ids[ir]); + } + if (ids[l+1] < ids[ir]) { + MIDL_SWAP(ids[l+1], ids[ir]); + } + if (ids[l] < ids[l+1]) { + MIDL_SWAP(ids[l], ids[l+1]); + } + i = l+1; + j = ir; + a = ids[l+1]; + for(;;) { + do i++; while(ids[i] > a); + do j--; while(ids[j] < a); + if (j < i) break; + MIDL_SWAP(ids[i],ids[j]); + } + ids[l+1] = ids[j]; + ids[j] = a; + jstack += 2; + if (ir-i+1 >= j-l) { + istack[jstack] = ir; + istack[jstack-1] = i; + ir = j-1; + } else { + istack[jstack] = j-1; + istack[jstack-1] = l; + l = i; + } + } + } +} + +unsigned mdb_mid2l_search( MDB_ID2L ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( id, ids[cursor].mid ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ) +{ + unsigned x, i; + + x = mdb_mid2l_search( ids, id->mid ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0].mid && ids[x].mid == id->mid ) { + /* duplicate */ + return -1; + } + + if ( ids[0].mid >= MDB_IDL_UM_MAX ) { + /* too big */ + return -2; + + } else { + /* insert id */ + ids[0].mid++; + for (i=(unsigned)ids[0].mid; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = *id; + } + + return 0; +} + +int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) +{ + /* Too big? 
*/ + if (ids[0].mid >= MDB_IDL_UM_MAX) { + return -2; + } + ids[0].mid++; + ids[ids[0].mid] = *id; + return 0; +} + +/** @} */ +/** @} */ diff --git a/vendor/github.com/szferi/gomdb/midl.h b/vendor/github.com/szferi/gomdb/midl.h new file mode 100644 index 000000000000..a7f25026ce53 --- /dev/null +++ b/vendor/github.com/szferi/gomdb/midl.h @@ -0,0 +1,186 @@ +/** @file midl.h + * @brief LMDB ID List header file. + * + * This file was originally part of back-bdb but has been + * modified for use in libmdb. Most of the macros defined + * in this file are unused, just left over from the original. + * + * This file is only used internally in libmdb and its definitions + * are not exposed publicly. + */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 2000-2014 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#ifndef _MDB_MIDL_H_ +#define _MDB_MIDL_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @defgroup internal LMDB Internals + * @{ + */ + +/** @defgroup idls ID List Management + * @{ + */ + /** A generic unsigned ID number. These were entryIDs in back-bdb. + * Preferably it should have the same size as a pointer. + */ +typedef size_t MDB_ID; + + /** An IDL is an ID List, a sorted array of IDs. The first + * element of the array is a counter for how many actual + * IDs are in the list. In the original back-bdb code, IDLs are + * sorted in ascending order. For libmdb IDLs are sorted in + * descending order. + */ +typedef MDB_ID *MDB_IDL; + +/* IDL sizes - likely should be even bigger + * limiting factors: sizeof(ID), thread stack size + */ +#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDB_IDL_DB_SIZE (1< +#include +#include "lmdb.h" +*/ +import "C" + +import ( + "math" + "runtime" + "unsafe" +) + +// DBIOpen Database Flags +const ( + REVERSEKEY = C.MDB_REVERSEKEY // use reverse string keys + DUPSORT = C.MDB_DUPSORT // use sorted duplicates + INTEGERKEY = C.MDB_INTEGERKEY // numeric keys in native byte order. The keys must all be of the same size. + DUPFIXED = C.MDB_DUPFIXED // with DUPSORT, sorted dup items have fixed size + INTEGERDUP = C.MDB_INTEGERDUP // with DUPSORT, dups are numeric in native byte order + REVERSEDUP = C.MDB_REVERSEDUP // with DUPSORT, use reverse string dups + CREATE = C.MDB_CREATE // create DB if not already existing +) + +// put flags +const ( + NODUPDATA = C.MDB_NODUPDATA + NOOVERWRITE = C.MDB_NOOVERWRITE + RESERVE = C.MDB_RESERVE + APPEND = C.MDB_APPEND + APPENDDUP = C.MDB_APPENDDUP +) + +// Txn is Opaque structure for a transaction handle. +// All database operations require a transaction handle. +// Transactions may be read-only or read-write. 
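
Editor's note (not part of the patch): the DBI/put flag constants above and the Txn type declared just below make up the whole read/write path of this vendored cgo binding around the LMDB C code earlier in the diff. Below is a minimal, hedged sketch of how that API is typically driven. It uses only calls that appear in this file (BeginTxn, DBIOpen, Put, Get, Commit, Abort and the CREATE/RDONLY flags); environment construction lives in env.go, which is not shown in this hunk, so it is left out here, and the package, function, database and key names are purely illustrative.

package gomdbexample

import (
	"fmt"

	mdb "github.com/szferi/gomdb"
)

// PutAndGet writes one key in a write transaction, then reads it back
// in a read-only transaction. env is assumed to be an already-opened
// *mdb.Env (see env.go, not shown in this hunk).
func PutAndGet(env *mdb.Env) error {
	// A write transaction pins the goroutine to an OS thread until
	// Commit or Abort (see BeginTxn above).
	txn, err := env.BeginTxn(nil, 0)
	if err != nil {
		return err
	}
	name := "example" // hypothetical named database
	dbi, err := txn.DBIOpen(&name, mdb.CREATE)
	if err != nil {
		txn.Abort()
		return err
	}
	if err := txn.Put(dbi, []byte("k"), []byte("v"), 0); err != nil {
		txn.Abort()
		return err
	}
	if err := txn.Commit(); err != nil {
		return err
	}

	// Read the value back in a read-only transaction.
	rtxn, err := env.BeginTxn(nil, mdb.RDONLY)
	if err != nil {
		return err
	}
	defer rtxn.Abort()
	val, err := rtxn.Get(dbi, []byte("k"))
	if err != nil {
		return err
	}
	fmt.Printf("k = %s\n", val)
	return nil
}
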
+type Txn struct { + _txn *C.MDB_txn +} + +func (env *Env) BeginTxn(parent *Txn, flags uint) (*Txn, error) { + var _txn *C.MDB_txn + var ptxn *C.MDB_txn + if parent == nil { + ptxn = nil + } else { + ptxn = parent._txn + } + if flags&RDONLY == 0 { + runtime.LockOSThread() + } + ret := C.mdb_txn_begin(env._env, ptxn, C.uint(flags), &_txn) + if ret != SUCCESS { + runtime.UnlockOSThread() + return nil, errno(ret) + } + return &Txn{_txn}, nil +} + +func (txn *Txn) Commit() error { + ret := C.mdb_txn_commit(txn._txn) + runtime.UnlockOSThread() + // The transaction handle is freed if there was no error + if ret == C.MDB_SUCCESS { + txn._txn = nil + } + return errno(ret) +} + +func (txn *Txn) Abort() { + if txn._txn == nil { + return + } + C.mdb_txn_abort(txn._txn) + runtime.UnlockOSThread() + // The transaction handle is always freed. + txn._txn = nil +} + +func (txn *Txn) Reset() { + C.mdb_txn_reset(txn._txn) +} + +func (txn *Txn) Renew() error { + ret := C.mdb_txn_renew(txn._txn) + return errno(ret) +} + +func (txn *Txn) DBIOpen(name *string, flags uint) (DBI, error) { + var _dbi C.MDB_dbi + var cname *C.char + if name == nil { + cname = nil + } else { + cname = C.CString(*name) + defer C.free(unsafe.Pointer(cname)) + } + ret := C.mdb_dbi_open(txn._txn, cname, C.uint(flags), &_dbi) + if ret != SUCCESS { + return DBI(math.NaN()), errno(ret) + } + return DBI(_dbi), nil +} + +func (txn *Txn) Stat(dbi DBI) (*Stat, error) { + var _stat C.MDB_stat + ret := C.mdb_stat(txn._txn, C.MDB_dbi(dbi), &_stat) + if ret != SUCCESS { + return nil, errno(ret) + } + stat := Stat{PSize: uint(_stat.ms_psize), + Depth: uint(_stat.ms_depth), + BranchPages: uint64(_stat.ms_branch_pages), + LeafPages: uint64(_stat.ms_leaf_pages), + OverflowPages: uint64(_stat.ms_overflow_pages), + Entries: uint64(_stat.ms_entries)} + return &stat, nil +} + +func (txn *Txn) Drop(dbi DBI, del int) error { + ret := C.mdb_drop(txn._txn, C.MDB_dbi(dbi), C.int(del)) + return errno(ret) +} + +func (txn *Txn) Get(dbi DBI, key []byte) ([]byte, error) { + val, err := txn.GetVal(dbi, key) + if err != nil { + return nil, err + } + return val.Bytes(), nil +} + +func (txn *Txn) GetVal(dbi DBI, key []byte) (Val, error) { + ckey := Wrap(key) + var cval Val + ret := C.mdb_get(txn._txn, C.MDB_dbi(dbi), (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval)) + return cval, errno(ret) +} + +func (txn *Txn) Put(dbi DBI, key []byte, val []byte, flags uint) error { + ckey := Wrap(key) + cval := Wrap(val) + ret := C.mdb_put(txn._txn, C.MDB_dbi(dbi), (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval), C.uint(flags)) + return errno(ret) +} + +func (txn *Txn) Del(dbi DBI, key, val []byte) error { + ckey := Wrap(key) + if val == nil { + ret := C.mdb_del(txn._txn, C.MDB_dbi(dbi), (*C.MDB_val)(&ckey), nil) + return errno(ret) + } + cval := Wrap(val) + ret := C.mdb_del(txn._txn, C.MDB_dbi(dbi), (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval)) + return errno(ret) +} + +type Cursor struct { + _cursor *C.MDB_cursor +} + +func (txn *Txn) CursorOpen(dbi DBI) (*Cursor, error) { + var _cursor *C.MDB_cursor + ret := C.mdb_cursor_open(txn._txn, C.MDB_dbi(dbi), &_cursor) + if ret != SUCCESS { + return nil, errno(ret) + } + return &Cursor{_cursor}, nil +} + +func (txn *Txn) CursorRenew(cursor *Cursor) error { + ret := C.mdb_cursor_renew(txn._txn, cursor._cursor) + return errno(ret) +} + +/* +type CmpFunc func(a, b []byte) int + +func (txn *Txn) SetCompare(dbi DBI, cmp CmpFunc) error { + f := func(a, b *C.MDB_val) C.int { + ga := C.GoBytes(a.mv_data, C.int(a.mv_size)) + gb := C.GoBytes(a.mv_data, 
C.int(a.mv_size)) + return C.int(cmp(ga, gb)) + } + ret := C.mdb_set_compare(txn._txn, C.MDB_dbi(dbi), *unsafe.Pointer(&f)) + return errno(ret) +} +*/ +// func (txn *Txn) SetDupSort(dbi DBI, comp *C.MDB_comp_func) error +// func (txn *Txn) SetRelFunc(dbi DBI, rel *C.MDB_rel_func) error +// func (txn *Txn) SetRelCtx(dbi DBI, void *) error diff --git a/vendor/github.com/szferi/gomdb/val.go b/vendor/github.com/szferi/gomdb/val.go new file mode 100644 index 000000000000..d8bfb346c692 --- /dev/null +++ b/vendor/github.com/szferi/gomdb/val.go @@ -0,0 +1,52 @@ +package mdb + +/* +#cgo CFLAGS: -pthread -W -Wall -Wno-unused-parameter -Wbad-function-cast -O2 -g +#cgo CFLAGS: -I/usr/local + +#include +#include +#include "lmdb.h" +*/ +import "C" + +import ( + "reflect" + "unsafe" +) + +// MDB_val +type Val C.MDB_val + +// Create a Val that points to p's data. the Val's data must not be freed +// manually and C references must not survive the garbage collection of p (and +// the returned Val). +func Wrap(p []byte) Val { + if len(p) == 0 { + return Val(C.MDB_val{}) + } + return Val(C.MDB_val{ + mv_size: C.size_t(len(p)), + mv_data: unsafe.Pointer(&p[0]), + }) +} + +// If val is nil, a empty slice is retured. +func (val Val) Bytes() []byte { + return C.GoBytes(val.mv_data, C.int(val.mv_size)) +} + +// If val is nil, a empty slice is retured. +func (val Val) BytesNoCopy() []byte { + hdr := reflect.SliceHeader{ + Data: uintptr(unsafe.Pointer(val.mv_data)), + Len: int(val.mv_size), + Cap: int(val.mv_size), + } + return *(*[]byte)(unsafe.Pointer(&hdr)) +} + +// If val is nil, an empty string is returned. +func (val Val) String() string { + return C.GoStringN((*C.char)(val.mv_data), C.int(val.mv_size)) +} From 31e9346b43ee1c661d31a70cd1f7f6d84a3d27a1 Mon Sep 17 00:00:00 2001 From: Matti Ranta Date: Mon, 4 Feb 2019 10:24:28 -0500 Subject: [PATCH 2/3] update build --- Gopkg.lock | 47 +- modules/setting/setting.go | 3 +- .../go-macaron/session/ledis/ledis.go | 227 - vendor/github.com/siddontang/go/LICENSE | 20 - vendor/github.com/siddontang/go/bson/LICENSE | 25 - .../github.com/siddontang/go/filelock/LICENSE | 27 - .../go/filelock/file_lock_generic.go | 17 - .../go/filelock/file_lock_solaris.go | 43 - .../siddontang/go/filelock/file_lock_unix.go | 51 - .../go/filelock/file_lock_windows.go | 36 - vendor/github.com/siddontang/go/hack/hack.go | 27 - .../siddontang/go/ioutil2/ioutil.go | 39 - .../siddontang/go/ioutil2/sectionwriter.go | 69 - vendor/github.com/siddontang/go/log/doc.go | 21 - .../siddontang/go/log/filehandler.go | 221 - .../github.com/siddontang/go/log/handler.go | 48 - vendor/github.com/siddontang/go/log/log.go | 343 - .../siddontang/go/log/sockethandler.go | 65 - vendor/github.com/siddontang/go/num/bytes.go | 67 - vendor/github.com/siddontang/go/num/cmp.go | 161 - vendor/github.com/siddontang/go/num/str.go | 157 - .../github.com/siddontang/go/snappy/LICENSE | 27 - .../github.com/siddontang/go/snappy/decode.go | 124 - .../github.com/siddontang/go/snappy/encode.go | 174 - .../github.com/siddontang/go/snappy/snappy.go | 38 - .../github.com/siddontang/go/sync2/atomic.go | 146 - .../siddontang/go/sync2/semaphore.go | 65 - vendor/github.com/siddontang/ledisdb/LICENSE | 21 - .../siddontang/ledisdb/client/go/LICENSE | 21 - .../siddontang/ledisdb/config/config.go | 279 - .../siddontang/ledisdb/ledis/batch.go | 138 - .../siddontang/ledisdb/ledis/const.go | 100 - .../siddontang/ledisdb/ledis/doc.go | 58 - .../siddontang/ledisdb/ledis/dump.go | 220 - .../siddontang/ledisdb/ledis/event.go | 135 - 
.../siddontang/ledisdb/ledis/info.go | 26 - .../siddontang/ledisdb/ledis/ledis.go | 215 - .../siddontang/ledisdb/ledis/ledis_db.go | 147 - .../siddontang/ledisdb/ledis/multi.go | 75 - .../siddontang/ledisdb/ledis/replication.go | 246 - .../siddontang/ledisdb/ledis/scan.go | 136 - .../siddontang/ledisdb/ledis/t_bit.go | 926 -- .../siddontang/ledisdb/ledis/t_hash.go | 511 - .../siddontang/ledisdb/ledis/t_kv.go | 396 - .../siddontang/ledisdb/ledis/t_list.go | 665 -- .../siddontang/ledisdb/ledis/t_set.go | 605 -- .../siddontang/ledisdb/ledis/t_ttl.go | 210 - .../siddontang/ledisdb/ledis/t_zset.go | 1025 -- .../github.com/siddontang/ledisdb/ledis/tx.go | 112 - .../siddontang/ledisdb/ledis/util.go | 94 - .../siddontang/ledisdb/rpl/file_io.go | 362 - .../siddontang/ledisdb/rpl/file_store.go | 414 - .../siddontang/ledisdb/rpl/file_table.go | 570 - .../siddontang/ledisdb/rpl/goleveldb_store.go | 224 - .../github.com/siddontang/ledisdb/rpl/log.go | 167 - .../github.com/siddontang/ledisdb/rpl/rpl.go | 331 - .../siddontang/ledisdb/rpl/store.go | 36 - .../siddontang/ledisdb/store/boltdb/const.go | 3 - .../siddontang/ledisdb/store/boltdb/db.go | 172 - .../ledisdb/store/boltdb/iterator.go | 50 - .../ledisdb/store/boltdb/snapshot.go | 37 - .../siddontang/ledisdb/store/boltdb/tx.go | 61 - .../github.com/siddontang/ledisdb/store/db.go | 179 - .../siddontang/ledisdb/store/driver/batch.go | 70 - .../siddontang/ledisdb/store/driver/driver.go | 79 - .../siddontang/ledisdb/store/driver/slice.go | 21 - .../siddontang/ledisdb/store/driver/store.go | 45 - .../ledisdb/store/goleveldb/batch.go | 39 - .../ledisdb/store/goleveldb/const.go | 4 - .../siddontang/ledisdb/store/goleveldb/db.go | 208 - .../ledisdb/store/goleveldb/iterator.go | 49 - .../ledisdb/store/goleveldb/snapshot.go | 26 - .../siddontang/ledisdb/store/iterator.go | 333 - .../siddontang/ledisdb/store/leveldb/batch.go | 104 - .../siddontang/ledisdb/store/leveldb/cache.go | 20 - .../siddontang/ledisdb/store/leveldb/const.go | 3 - .../siddontang/ledisdb/store/leveldb/db.go | 315 - .../ledisdb/store/leveldb/filterpolicy.go | 21 - .../ledisdb/store/leveldb/iterator.go | 70 - .../ledisdb/store/leveldb/leveldb_ext.cc | 95 - .../ledisdb/store/leveldb/leveldb_ext.h | 41 - .../ledisdb/store/leveldb/options.go | 122 - .../siddontang/ledisdb/store/leveldb/slice.go | 40 - .../ledisdb/store/leveldb/snapshot.go | 39 - .../siddontang/ledisdb/store/leveldb/util.go | 45 - .../siddontang/ledisdb/store/mdb/const.go | 3 - .../siddontang/ledisdb/store/mdb/mdb.go | 316 - .../siddontang/ledisdb/store/mdb/snapshot.go | 43 - .../siddontang/ledisdb/store/mdb/tx.go | 90 - .../siddontang/ledisdb/store/rocksdb/batch.go | 83 - .../siddontang/ledisdb/store/rocksdb/cache.go | 20 - .../siddontang/ledisdb/store/rocksdb/const.go | 3 - .../siddontang/ledisdb/store/rocksdb/db.go | 346 - .../siddontang/ledisdb/store/rocksdb/env.go | 27 - .../ledisdb/store/rocksdb/filterpolicy.go | 21 - .../ledisdb/store/rocksdb/iterator.go | 70 - .../ledisdb/store/rocksdb/options.go | 233 - .../ledisdb/store/rocksdb/rocksdb_ext.cc | 44 - .../ledisdb/store/rocksdb/rocksdb_ext.h | 24 - .../siddontang/ledisdb/store/rocksdb/slice.go | 41 - .../ledisdb/store/rocksdb/snapshot.go | 39 - .../siddontang/ledisdb/store/rocksdb/util.go | 54 - .../siddontang/ledisdb/store/slice.go | 9 - .../siddontang/ledisdb/store/snapshot.go | 48 - .../siddontang/ledisdb/store/stat.go | 37 - .../siddontang/ledisdb/store/store.go | 63 - .../github.com/siddontang/ledisdb/store/tx.go | 86 - .../siddontang/ledisdb/store/writebatch.go | 150 
- vendor/github.com/szferi/gomdb/LICENSE | 10 - vendor/github.com/szferi/gomdb/cursor.go | 103 - vendor/github.com/szferi/gomdb/env.go | 227 - vendor/github.com/szferi/gomdb/lmdb.h | 1553 --- vendor/github.com/szferi/gomdb/mdb.c | 9364 ----------------- vendor/github.com/szferi/gomdb/mdb.go | 13 - vendor/github.com/szferi/gomdb/midl.c | 360 - vendor/github.com/szferi/gomdb/midl.h | 186 - vendor/github.com/szferi/gomdb/txn.go | 197 - vendor/github.com/szferi/gomdb/val.go | 52 - 118 files changed, 2 insertions(+), 26957 deletions(-) delete mode 100644 vendor/github.com/go-macaron/session/ledis/ledis.go delete mode 100644 vendor/github.com/siddontang/go/LICENSE delete mode 100644 vendor/github.com/siddontang/go/bson/LICENSE delete mode 100644 vendor/github.com/siddontang/go/filelock/LICENSE delete mode 100644 vendor/github.com/siddontang/go/filelock/file_lock_generic.go delete mode 100644 vendor/github.com/siddontang/go/filelock/file_lock_solaris.go delete mode 100644 vendor/github.com/siddontang/go/filelock/file_lock_unix.go delete mode 100644 vendor/github.com/siddontang/go/filelock/file_lock_windows.go delete mode 100644 vendor/github.com/siddontang/go/hack/hack.go delete mode 100644 vendor/github.com/siddontang/go/ioutil2/ioutil.go delete mode 100644 vendor/github.com/siddontang/go/ioutil2/sectionwriter.go delete mode 100644 vendor/github.com/siddontang/go/log/doc.go delete mode 100644 vendor/github.com/siddontang/go/log/filehandler.go delete mode 100644 vendor/github.com/siddontang/go/log/handler.go delete mode 100644 vendor/github.com/siddontang/go/log/log.go delete mode 100644 vendor/github.com/siddontang/go/log/sockethandler.go delete mode 100644 vendor/github.com/siddontang/go/num/bytes.go delete mode 100644 vendor/github.com/siddontang/go/num/cmp.go delete mode 100644 vendor/github.com/siddontang/go/num/str.go delete mode 100644 vendor/github.com/siddontang/go/snappy/LICENSE delete mode 100644 vendor/github.com/siddontang/go/snappy/decode.go delete mode 100644 vendor/github.com/siddontang/go/snappy/encode.go delete mode 100644 vendor/github.com/siddontang/go/snappy/snappy.go delete mode 100644 vendor/github.com/siddontang/go/sync2/atomic.go delete mode 100644 vendor/github.com/siddontang/go/sync2/semaphore.go delete mode 100644 vendor/github.com/siddontang/ledisdb/LICENSE delete mode 100644 vendor/github.com/siddontang/ledisdb/client/go/LICENSE delete mode 100644 vendor/github.com/siddontang/ledisdb/config/config.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/batch.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/const.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/doc.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/dump.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/event.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/info.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/ledis.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/ledis_db.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/multi.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/replication.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/scan.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_bit.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_hash.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_kv.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_list.go delete mode 
100644 vendor/github.com/siddontang/ledisdb/ledis/t_set.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_ttl.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/t_zset.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/tx.go delete mode 100644 vendor/github.com/siddontang/ledisdb/ledis/util.go delete mode 100644 vendor/github.com/siddontang/ledisdb/rpl/file_io.go delete mode 100644 vendor/github.com/siddontang/ledisdb/rpl/file_store.go delete mode 100644 vendor/github.com/siddontang/ledisdb/rpl/file_table.go delete mode 100644 vendor/github.com/siddontang/ledisdb/rpl/goleveldb_store.go delete mode 100644 vendor/github.com/siddontang/ledisdb/rpl/log.go delete mode 100644 vendor/github.com/siddontang/ledisdb/rpl/rpl.go delete mode 100644 vendor/github.com/siddontang/ledisdb/rpl/store.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/const.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/db.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/iterator.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/snapshot.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/boltdb/tx.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/db.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/driver/batch.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/driver/driver.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/driver/slice.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/driver/store.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/batch.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/const.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/db.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/iterator.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/goleveldb/snapshot.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/iterator.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/batch.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/cache.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/const.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/db.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/filterpolicy.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/iterator.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.cc delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.h delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/options.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/slice.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/snapshot.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/leveldb/util.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/mdb/const.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/mdb/mdb.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/mdb/snapshot.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/mdb/tx.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/batch.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/cache.go delete mode 100644 
vendor/github.com/siddontang/ledisdb/store/rocksdb/const.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/db.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/env.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/filterpolicy.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/iterator.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/options.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.cc delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.h delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/slice.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/snapshot.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/rocksdb/util.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/slice.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/snapshot.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/stat.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/store.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/tx.go delete mode 100644 vendor/github.com/siddontang/ledisdb/store/writebatch.go delete mode 100644 vendor/github.com/szferi/gomdb/LICENSE delete mode 100644 vendor/github.com/szferi/gomdb/cursor.go delete mode 100644 vendor/github.com/szferi/gomdb/env.go delete mode 100644 vendor/github.com/szferi/gomdb/lmdb.h delete mode 100644 vendor/github.com/szferi/gomdb/mdb.c delete mode 100644 vendor/github.com/szferi/gomdb/mdb.go delete mode 100644 vendor/github.com/szferi/gomdb/midl.c delete mode 100644 vendor/github.com/szferi/gomdb/midl.h delete mode 100644 vendor/github.com/szferi/gomdb/txn.go delete mode 100644 vendor/github.com/szferi/gomdb/val.go diff --git a/Gopkg.lock b/Gopkg.lock index 546d0bf60496..ba21eca95c2a 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -396,12 +396,11 @@ [[projects]] branch = "master" - digest = "1:8d322b03fdb82ca78e19c2f90f8eb4ad88b53e3b71cbd01d0b3a1d73beb1abb2" + digest = "1:a26b7b56aece087165b8db87afb05db8495252449553ca20d15f5a24202f36bc" name = "github.com/go-macaron/session" packages = [ ".", "couchbase", - "ledis", "memcache", "mysql", "nodb", @@ -849,22 +848,6 @@ pruneopts = "NUT" revision = "1dba4b3954bc059efc3991ec364f9f9a35f597d2" -[[projects]] - branch = "master" - digest = "1:81cd039986aace9719c68a9794fa8c9dd1007cffa1ff8995631e8ed35aacf6fe" - name = "github.com/siddontang/go" - packages = [ - "filelock", - "hack", - "ioutil2", - "log", - "num", - "snappy", - "sync2", - ] - pruneopts = "NUT" - revision = "bdc77568d726a8702315ec4eafda030b6abc4f43" - [[projects]] branch = "master" digest = "1:dbda803f21e60c38de7d9f884390f2ebbe234ce0c3d139b65bbb36b03a99d266" @@ -873,25 +856,6 @@ pruneopts = "NUT" revision = "d8f7bb82a96d89c1254e5a6c967134e1433c9ee2" -[[projects]] - digest = "1:b8d24bf8620181e3232d6fa1638201ba6d808b4f2dc9b620863554cf9b383748" - name = "github.com/siddontang/ledisdb" - packages = [ - "config", - "ledis", - "rpl", - "store", - "store/boltdb", - "store/driver", - "store/goleveldb", - "store/leveldb", - "store/mdb", - "store/rocksdb", - ] - pruneopts = "NUT" - revision = "af217791d176389b9d9f2d41c414ca2f1de005cf" - version = "v0.4" - [[projects]] digest = "1:89fd77d603a74a6540d60067debad9397865bf040955d907362c95d364baeba6" name = "github.com/src-d/gcfg" @@ -942,14 +906,6 @@ pruneopts = "NUT" revision = "2f17a3356c6616cbfc4ae4c38147dc062a68fb0e" 
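
Editor's note (not part of the lock file): with the ledis stanza and its transitive ledisdb/gomdb dependencies removed here, session backends continue to be selected at runtime by the [session] PROVIDER key and registered through blank imports, as the setting.go hunk further below shows. The following is a hedged sketch of wiring one of the remaining providers in a bare go-macaron application; the Options field names follow go-macaron/session, while the Redis address, route, and session key are illustrative only.

package main

import (
	"github.com/go-macaron/session"
	_ "github.com/go-macaron/session/redis" // blank import registers the "redis" provider
	"gopkg.in/macaron.v1"
)

func main() {
	m := macaron.Classic()
	// Sessioner installs the middleware; Provider must match a name
	// registered via session.Register in one of the blank-imported packages.
	m.Use(session.Sessioner(session.Options{
		Provider:       "redis",
		ProviderConfig: "addr=127.0.0.1:6379", // illustrative connection string
	}))
	m.Get("/", func(sess session.Store) string {
		if err := sess.Set("visited", true); err != nil {
			return err.Error()
		}
		return "session stored"
	})
	m.Run()
}
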
-[[projects]] - branch = "master" - digest = "1:e94c19da512584d4069db25d449150d8849d49a696e6281685b68932d3ba64a5" - name = "github.com/szferi/gomdb" - packages = ["."] - pruneopts = "NUT" - revision = "9f9ffa97e793fb4d06a56e65a5e6b72cb3b32f1c" - [[projects]] branch = "master" digest = "1:3cb6dfe7cdece5716b1c3c3c0b5faf7fce2e83e2758e2baad2e9986d101980b8" @@ -1297,7 +1253,6 @@ "github.com/go-macaron/inject", "github.com/go-macaron/session", "github.com/go-macaron/session/couchbase", - "github.com/go-macaron/session/ledis", "github.com/go-macaron/session/memcache", "github.com/go-macaron/session/mysql", "github.com/go-macaron/session/nodb", diff --git a/modules/setting/setting.go b/modules/setting/setting.go index d739266500a8..77f0725d0014 100644 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -32,7 +32,6 @@ import ( _ "github.com/go-macaron/cache/redis" "github.com/go-macaron/session" _ "github.com/go-macaron/session/couchbase" // couchbase plugin for session store - _ "github.com/go-macaron/session/ledis" // ledis plugin for session store _ "github.com/go-macaron/session/memcache" // memcache plugin for session store _ "github.com/go-macaron/session/mysql" // mysql plugin for session store _ "github.com/go-macaron/session/nodb" // nodb plugin for session store @@ -1512,7 +1511,7 @@ func newCacheService() { func newSessionService() { SessionConfig.Provider = Cfg.Section("session").Key("PROVIDER").In("memory", - []string{"memory", "file", "redis", "mysql", "postgres", "ledis", "couchbase", "memcache", "nodb"}) + []string{"memory", "file", "redis", "mysql", "postgres", "couchbase", "memcache", "nodb"}) SessionConfig.ProviderConfig = strings.Trim(Cfg.Section("session").Key("PROVIDER_CONFIG").MustString(path.Join(AppDataPath, "sessions")), "\" ") if SessionConfig.Provider == "file" && !filepath.IsAbs(SessionConfig.ProviderConfig) { SessionConfig.ProviderConfig = path.Join(AppWorkPath, SessionConfig.ProviderConfig) diff --git a/vendor/github.com/go-macaron/session/ledis/ledis.go b/vendor/github.com/go-macaron/session/ledis/ledis.go deleted file mode 100644 index bb5ce95244ab..000000000000 --- a/vendor/github.com/go-macaron/session/ledis/ledis.go +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2013 Beego Authors -// Copyright 2014 The Macaron Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"): you may -// not use this file except in compliance with the License. You may obtain -// a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations -// under the License. - -package session - -import ( - "fmt" - "strings" - "sync" - - "github.com/Unknwon/com" - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/ledis" - "gopkg.in/ini.v1" - - "github.com/go-macaron/session" -) - -// LedisStore represents a ledis session store implementation. -type LedisStore struct { - c *ledis.DB - sid string - expire int64 - lock sync.RWMutex - data map[interface{}]interface{} -} - -// NewLedisStore creates and returns a ledis session store. 
-func NewLedisStore(c *ledis.DB, sid string, expire int64, kv map[interface{}]interface{}) *LedisStore { - return &LedisStore{ - c: c, - expire: expire, - sid: sid, - data: kv, - } -} - -// Set sets value to given key in session. -func (s *LedisStore) Set(key, val interface{}) error { - s.lock.Lock() - defer s.lock.Unlock() - - s.data[key] = val - return nil -} - -// Get gets value by given key in session. -func (s *LedisStore) Get(key interface{}) interface{} { - s.lock.RLock() - defer s.lock.RUnlock() - - return s.data[key] -} - -// Delete delete a key from session. -func (s *LedisStore) Delete(key interface{}) error { - s.lock.Lock() - defer s.lock.Unlock() - - delete(s.data, key) - return nil -} - -// ID returns current session ID. -func (s *LedisStore) ID() string { - return s.sid -} - -// Release releases resource and save data to provider. -func (s *LedisStore) Release() error { - // Skip encoding if the data is empty - if len(s.data) == 0 { - return nil - } - - data, err := session.EncodeGob(s.data) - if err != nil { - return err - } - - if err = s.c.Set([]byte(s.sid), data); err != nil { - return err - } - _, err = s.c.Expire([]byte(s.sid), s.expire) - return err -} - -// Flush deletes all session data. -func (s *LedisStore) Flush() error { - s.lock.Lock() - defer s.lock.Unlock() - - s.data = make(map[interface{}]interface{}) - return nil -} - -// LedisProvider represents a ledis session provider implementation. -type LedisProvider struct { - c *ledis.DB - expire int64 -} - -// Init initializes ledis session provider. -// configs: data_dir=./app.db,db=0 -func (p *LedisProvider) Init(expire int64, configs string) error { - p.expire = expire - - cfg, err := ini.Load([]byte(strings.Replace(configs, ",", "\n", -1))) - if err != nil { - return err - } - - db := 0 - opt := new(config.Config) - for k, v := range cfg.Section("").KeysHash() { - switch k { - case "data_dir": - opt.DataDir = v - case "db": - db = com.StrTo(v).MustInt() - default: - return fmt.Errorf("session/ledis: unsupported option '%s'", k) - } - } - - l, err := ledis.Open(opt) - if err != nil { - return fmt.Errorf("session/ledis: error opening db: %v", err) - } - p.c, err = l.Select(db) - return err -} - -// Read returns raw session store by session ID. -func (p *LedisProvider) Read(sid string) (session.RawStore, error) { - if !p.Exist(sid) { - if err := p.c.Set([]byte(sid), []byte("")); err != nil { - return nil, err - } - } - - var kv map[interface{}]interface{} - kvs, err := p.c.Get([]byte(sid)) - if err != nil { - return nil, err - } - if len(kvs) == 0 { - kv = make(map[interface{}]interface{}) - } else { - kv, err = session.DecodeGob(kvs) - if err != nil { - return nil, err - } - } - - return NewLedisStore(p.c, sid, p.expire, kv), nil -} - -// Exist returns true if session with given ID exists. -func (p *LedisProvider) Exist(sid string) bool { - count, err := p.c.Exists([]byte(sid)) - return err == nil && count > 0 -} - -// Destory deletes a session by session ID. -func (p *LedisProvider) Destory(sid string) error { - _, err := p.c.Del([]byte(sid)) - return err -} - -// Regenerate regenerates a session store from old session ID to new one. 
-func (p *LedisProvider) Regenerate(oldsid, sid string) (_ session.RawStore, err error) { - if p.Exist(sid) { - return nil, fmt.Errorf("new sid '%s' already exists", sid) - } - - kvs := make([]byte, 0) - if p.Exist(oldsid) { - if kvs, err = p.c.Get([]byte(oldsid)); err != nil { - return nil, err - } else if _, err = p.c.Del([]byte(oldsid)); err != nil { - return nil, err - } - } - if err = p.c.SetEX([]byte(sid), p.expire, kvs); err != nil { - return nil, err - } - - var kv map[interface{}]interface{} - if len(kvs) == 0 { - kv = make(map[interface{}]interface{}) - } else { - kv, err = session.DecodeGob([]byte(kvs)) - if err != nil { - return nil, err - } - } - - return NewLedisStore(p.c, sid, p.expire, kv), nil -} - -// Count counts and returns number of sessions. -func (p *LedisProvider) Count() int { - // FIXME: how come this library does not have DbSize() method? - return -1 -} - -// GC calls GC to clean expired sessions. -func (p *LedisProvider) GC() { - // FIXME: wtf??? -} - -func init() { - session.Register("ledis", &LedisProvider{}) -} diff --git a/vendor/github.com/siddontang/go/LICENSE b/vendor/github.com/siddontang/go/LICENSE deleted file mode 100644 index 80511a0a784d..000000000000 --- a/vendor/github.com/siddontang/go/LICENSE +++ /dev/null @@ -1,20 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2014 siddontang - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/siddontang/go/bson/LICENSE b/vendor/github.com/siddontang/go/bson/LICENSE deleted file mode 100644 index 890326017b85..000000000000 --- a/vendor/github.com/siddontang/go/bson/LICENSE +++ /dev/null @@ -1,25 +0,0 @@ -BSON library for Go - -Copyright (c) 2010-2012 - Gustavo Niemeyer - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/siddontang/go/filelock/LICENSE b/vendor/github.com/siddontang/go/filelock/LICENSE deleted file mode 100644 index fec05ce12959..000000000000 --- a/vendor/github.com/siddontang/go/filelock/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2011 The LevelDB-Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/siddontang/go/filelock/file_lock_generic.go b/vendor/github.com/siddontang/go/filelock/file_lock_generic.go deleted file mode 100644 index 53c292acbdff..000000000000 --- a/vendor/github.com/siddontang/go/filelock/file_lock_generic.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2012 The LevelDB-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!windows - -package filelock - -import ( - "fmt" - "io" - "runtime" -) - -func Lock(name string) (io.Closer, error) { - return nil, fmt.Errorf("leveldb/db: file locking is not implemented on %s/%s", runtime.GOOS, runtime.GOARCH) -} diff --git a/vendor/github.com/siddontang/go/filelock/file_lock_solaris.go b/vendor/github.com/siddontang/go/filelock/file_lock_solaris.go deleted file mode 100644 index 56ff3e2ceef2..000000000000 --- a/vendor/github.com/siddontang/go/filelock/file_lock_solaris.go +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2014 The LevelDB-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -// +build solaris - -package filelock - -import ( - "io" - "os" - "syscall" -) - -// lockCloser hides all of an os.File's methods, except for Close. -type lockCloser struct { - f *os.File -} - -func (l lockCloser) Close() error { - return l.f.Close() -} - -func Lock(name string) (io.Closer, error) { - f, err := os.Create(name) - if err != nil { - return nil, err - } - - spec := syscall.Flock_t{ - Type: syscall.F_WRLCK, - Whence: int16(os.SEEK_SET), - Start: 0, - Len: 0, // 0 means to lock the entire file. - Pid: int32(os.Getpid()), - } - if err := syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &spec); err != nil { - f.Close() - return nil, err - } - - return lockCloser{f}, nil -} diff --git a/vendor/github.com/siddontang/go/filelock/file_lock_unix.go b/vendor/github.com/siddontang/go/filelock/file_lock_unix.go deleted file mode 100644 index f70ae6192c59..000000000000 --- a/vendor/github.com/siddontang/go/filelock/file_lock_unix.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2014 The LevelDB-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build darwin dragonfly freebsd linux netbsd openbsd - -package filelock - -import ( - "io" - "os" - "syscall" -) - -// lockCloser hides all of an os.File's methods, except for Close. -type lockCloser struct { - f *os.File -} - -func (l lockCloser) Close() error { - return l.f.Close() -} - -func Lock(name string) (io.Closer, error) { - f, err := os.Create(name) - if err != nil { - return nil, err - } - - /* - Some people tell me FcntlFlock does not exist, so use flock here - */ - if err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil { - f.Close() - return nil, err - } - - // spec := syscall.Flock_t{ - // Type: syscall.F_WRLCK, - // Whence: int16(os.SEEK_SET), - // Start: 0, - // Len: 0, // 0 means to lock the entire file. - // Pid: int32(os.Getpid()), - // } - // if err := syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &spec); err != nil { - // f.Close() - // return nil, err - // } - - return lockCloser{f}, nil -} diff --git a/vendor/github.com/siddontang/go/filelock/file_lock_windows.go b/vendor/github.com/siddontang/go/filelock/file_lock_windows.go deleted file mode 100644 index 5d3e4ba2029a..000000000000 --- a/vendor/github.com/siddontang/go/filelock/file_lock_windows.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2013 The LevelDB-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package filelock - -import ( - "io" - "syscall" -) - -// lockCloser hides all of an syscall.Handle's methods, except for Close. 
-type lockCloser struct { - fd syscall.Handle -} - -func (l lockCloser) Close() error { - return syscall.Close(l.fd) -} - -func Lock(name string) (io.Closer, error) { - p, err := syscall.UTF16PtrFromString(name) - if err != nil { - return nil, err - } - fd, err := syscall.CreateFile(p, - syscall.GENERIC_READ|syscall.GENERIC_WRITE, - 0, nil, syscall.CREATE_ALWAYS, - syscall.FILE_ATTRIBUTE_NORMAL, - 0, - ) - if err != nil { - return nil, err - } - return lockCloser{fd: fd}, nil -} diff --git a/vendor/github.com/siddontang/go/hack/hack.go b/vendor/github.com/siddontang/go/hack/hack.go deleted file mode 100644 index 74ee83cbf5d5..000000000000 --- a/vendor/github.com/siddontang/go/hack/hack.go +++ /dev/null @@ -1,27 +0,0 @@ -package hack - -import ( - "reflect" - "unsafe" -) - -// no copy to change slice to string -// use your own risk -func String(b []byte) (s string) { - pbytes := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - pstring := (*reflect.StringHeader)(unsafe.Pointer(&s)) - pstring.Data = pbytes.Data - pstring.Len = pbytes.Len - return -} - -// no copy to change string to slice -// use your own risk -func Slice(s string) (b []byte) { - pbytes := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - pstring := (*reflect.StringHeader)(unsafe.Pointer(&s)) - pbytes.Data = pstring.Data - pbytes.Len = pstring.Len - pbytes.Cap = pstring.Len - return -} diff --git a/vendor/github.com/siddontang/go/ioutil2/ioutil.go b/vendor/github.com/siddontang/go/ioutil2/ioutil.go deleted file mode 100644 index 35c0ad3cad51..000000000000 --- a/vendor/github.com/siddontang/go/ioutil2/ioutil.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2012, Google Inc. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package ioutil2 - -import ( - "io" - "io/ioutil" - "os" - "path" -) - -// Write file to temp and atomically move when everything else succeeds. 
-func WriteFileAtomic(filename string, data []byte, perm os.FileMode) error { - dir, name := path.Dir(filename), path.Base(filename) - f, err := ioutil.TempFile(dir, name) - if err != nil { - return err - } - n, err := f.Write(data) - f.Close() - if err == nil && n < len(data) { - err = io.ErrShortWrite - } else { - err = os.Chmod(f.Name(), perm) - } - if err != nil { - os.Remove(f.Name()) - return err - } - return os.Rename(f.Name(), filename) -} - -// Check file exists or not -func FileExists(name string) bool { - _, err := os.Stat(name) - return !os.IsNotExist(err) -} diff --git a/vendor/github.com/siddontang/go/ioutil2/sectionwriter.go b/vendor/github.com/siddontang/go/ioutil2/sectionwriter.go deleted file mode 100644 index c02ab0d5fd1f..000000000000 --- a/vendor/github.com/siddontang/go/ioutil2/sectionwriter.go +++ /dev/null @@ -1,69 +0,0 @@ -package ioutil2 - -import ( - "errors" - "io" -) - -var ErrExceedLimit = errors.New("write exceed limit") - -func NewSectionWriter(w io.WriterAt, off int64, n int64) *SectionWriter { - return &SectionWriter{w, off, off, off + n} -} - -type SectionWriter struct { - w io.WriterAt - base int64 - off int64 - limit int64 -} - -func (s *SectionWriter) Write(p []byte) (n int, err error) { - if s.off >= s.limit { - return 0, ErrExceedLimit - } - - if max := s.limit - s.off; int64(len(p)) > max { - return 0, ErrExceedLimit - } - - n, err = s.w.WriteAt(p, s.off) - s.off += int64(n) - return -} - -var errWhence = errors.New("Seek: invalid whence") -var errOffset = errors.New("Seek: invalid offset") - -func (s *SectionWriter) Seek(offset int64, whence int) (int64, error) { - switch whence { - default: - return 0, errWhence - case 0: - offset += s.base - case 1: - offset += s.off - case 2: - offset += s.limit - } - if offset < s.base { - return 0, errOffset - } - s.off = offset - return offset - s.base, nil -} - -func (s *SectionWriter) WriteAt(p []byte, off int64) (n int, err error) { - if off < 0 || off >= s.limit-s.base { - return 0, errOffset - } - off += s.base - if max := s.limit - off; int64(len(p)) > max { - return 0, ErrExceedLimit - } - - return s.w.WriteAt(p, off) -} - -// Size returns the size of the section in bytes. -func (s *SectionWriter) Size() int64 { return s.limit - s.base } diff --git a/vendor/github.com/siddontang/go/log/doc.go b/vendor/github.com/siddontang/go/log/doc.go deleted file mode 100644 index 81a60ee853bc..000000000000 --- a/vendor/github.com/siddontang/go/log/doc.go +++ /dev/null @@ -1,21 +0,0 @@ -// log package supplies more advanced features than go orign log package. -// -// It supports log different level: trace, debug, info, warn, error, fatal. -// -// It also supports different log handlers which you can log to stdout, file, socket, etc... -// -// Use -// -// import "github.com/siddontang/go/log" -// -// //log with different level -// log.Info("hello world") -// log.Error("hello world") -// -// //create a logger with specified handler -// h := NewStreamHandler(os.Stdout) -// l := log.NewDefault(h) -// l.Info("hello world") -// l.Infof("%s %d", "hello", 123) -// -package log diff --git a/vendor/github.com/siddontang/go/log/filehandler.go b/vendor/github.com/siddontang/go/log/filehandler.go deleted file mode 100644 index 2c158e2cf569..000000000000 --- a/vendor/github.com/siddontang/go/log/filehandler.go +++ /dev/null @@ -1,221 +0,0 @@ -package log - -import ( - "fmt" - "os" - "path" - "time" -) - -//FileHandler writes log to a file. 
-type FileHandler struct { - fd *os.File -} - -func NewFileHandler(fileName string, flag int) (*FileHandler, error) { - dir := path.Dir(fileName) - os.Mkdir(dir, 0777) - - f, err := os.OpenFile(fileName, flag, 0) - if err != nil { - return nil, err - } - - h := new(FileHandler) - - h.fd = f - - return h, nil -} - -func (h *FileHandler) Write(b []byte) (n int, err error) { - return h.fd.Write(b) -} - -func (h *FileHandler) Close() error { - return h.fd.Close() -} - -//RotatingFileHandler writes log a file, if file size exceeds maxBytes, -//it will backup current file and open a new one. -// -//max backup file number is set by backupCount, it will delete oldest if backups too many. -type RotatingFileHandler struct { - fd *os.File - - fileName string - maxBytes int - curBytes int - backupCount int -} - -func NewRotatingFileHandler(fileName string, maxBytes int, backupCount int) (*RotatingFileHandler, error) { - dir := path.Dir(fileName) - os.MkdirAll(dir, 0777) - - h := new(RotatingFileHandler) - - if maxBytes <= 0 { - return nil, fmt.Errorf("invalid max bytes") - } - - h.fileName = fileName - h.maxBytes = maxBytes - h.backupCount = backupCount - - var err error - h.fd, err = os.OpenFile(fileName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - if err != nil { - return nil, err - } - - f, err := h.fd.Stat() - if err != nil { - return nil, err - } - h.curBytes = int(f.Size()) - - return h, nil -} - -func (h *RotatingFileHandler) Write(p []byte) (n int, err error) { - h.doRollover() - n, err = h.fd.Write(p) - h.curBytes += n - return -} - -func (h *RotatingFileHandler) Close() error { - if h.fd != nil { - return h.fd.Close() - } - return nil -} - -func (h *RotatingFileHandler) doRollover() { - - if h.curBytes < h.maxBytes { - return - } - - f, err := h.fd.Stat() - if err != nil { - return - } - - if h.maxBytes <= 0 { - return - } else if f.Size() < int64(h.maxBytes) { - h.curBytes = int(f.Size()) - return - } - - if h.backupCount > 0 { - h.fd.Close() - - for i := h.backupCount - 1; i > 0; i-- { - sfn := fmt.Sprintf("%s.%d", h.fileName, i) - dfn := fmt.Sprintf("%s.%d", h.fileName, i+1) - - os.Rename(sfn, dfn) - } - - dfn := fmt.Sprintf("%s.1", h.fileName) - os.Rename(h.fileName, dfn) - - h.fd, _ = os.OpenFile(h.fileName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - h.curBytes = 0 - f, err := h.fd.Stat() - if err != nil { - return - } - h.curBytes = int(f.Size()) - } -} - -//TimeRotatingFileHandler writes log to a file, -//it will backup current and open a new one, with a period time you sepecified. -// -//refer: http://docs.python.org/2/library/logging.handlers.html. -//same like python TimedRotatingFileHandler. 
-type TimeRotatingFileHandler struct { - fd *os.File - - baseName string - interval int64 - suffix string - rolloverAt int64 -} - -const ( - WhenSecond = iota - WhenMinute - WhenHour - WhenDay -) - -func NewTimeRotatingFileHandler(baseName string, when int8, interval int) (*TimeRotatingFileHandler, error) { - dir := path.Dir(baseName) - os.Mkdir(dir, 0777) - - h := new(TimeRotatingFileHandler) - - h.baseName = baseName - - switch when { - case WhenSecond: - h.interval = 1 - h.suffix = "2006-01-02_15-04-05" - case WhenMinute: - h.interval = 60 - h.suffix = "2006-01-02_15-04" - case WhenHour: - h.interval = 3600 - h.suffix = "2006-01-02_15" - case WhenDay: - h.interval = 3600 * 24 - h.suffix = "2006-01-02" - default: - return nil, fmt.Errorf("invalid when_rotate: %d", when) - } - - h.interval = h.interval * int64(interval) - - var err error - h.fd, err = os.OpenFile(h.baseName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - if err != nil { - return nil, err - } - - fInfo, _ := h.fd.Stat() - h.rolloverAt = fInfo.ModTime().Unix() + h.interval - - return h, nil -} - -func (h *TimeRotatingFileHandler) doRollover() { - //refer http://hg.python.org/cpython/file/2.7/Lib/logging/handlers.py - now := time.Now() - - if h.rolloverAt <= now.Unix() { - fName := h.baseName + now.Format(h.suffix) - h.fd.Close() - e := os.Rename(h.baseName, fName) - if e != nil { - panic(e) - } - - h.fd, _ = os.OpenFile(h.baseName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - - h.rolloverAt = time.Now().Unix() + h.interval - } -} - -func (h *TimeRotatingFileHandler) Write(b []byte) (n int, err error) { - h.doRollover() - return h.fd.Write(b) -} - -func (h *TimeRotatingFileHandler) Close() error { - return h.fd.Close() -} diff --git a/vendor/github.com/siddontang/go/log/handler.go b/vendor/github.com/siddontang/go/log/handler.go deleted file mode 100644 index 4dc086f45c0a..000000000000 --- a/vendor/github.com/siddontang/go/log/handler.go +++ /dev/null @@ -1,48 +0,0 @@ -package log - -import ( - "io" -) - -//Handler writes logs to somewhere -type Handler interface { - Write(p []byte) (n int, err error) - Close() error -} - -//StreamHandler writes logs to a specified io Writer, maybe stdout, stderr, etc... -type StreamHandler struct { - w io.Writer -} - -func NewStreamHandler(w io.Writer) (*StreamHandler, error) { - h := new(StreamHandler) - - h.w = w - - return h, nil -} - -func (h *StreamHandler) Write(b []byte) (n int, err error) { - return h.w.Write(b) -} - -func (h *StreamHandler) Close() error { - return nil -} - -//NullHandler does nothing, it discards anything. -type NullHandler struct { -} - -func NewNullHandler() (*NullHandler, error) { - return new(NullHandler), nil -} - -func (h *NullHandler) Write(b []byte) (n int, err error) { - return len(b), nil -} - -func (h *NullHandler) Close() { - -} diff --git a/vendor/github.com/siddontang/go/log/log.go b/vendor/github.com/siddontang/go/log/log.go deleted file mode 100644 index 371f6016871e..000000000000 --- a/vendor/github.com/siddontang/go/log/log.go +++ /dev/null @@ -1,343 +0,0 @@ -package log - -import ( - "fmt" - "os" - "runtime" - "strconv" - "strings" - "sync" - "sync/atomic" - "time" -) - -//log level, from low to high, more high means more serious -const ( - LevelTrace = iota - LevelDebug - LevelInfo - LevelWarn - LevelError - LevelFatal -) - -const ( - Ltime = 1 << iota //time format "2006/01/02 15:04:05" - Lfile //file.go:123 - Llevel //[Trace|Debug|Info...] 
-) - -var LevelName [6]string = [6]string{"Trace", "Debug", "Info", "Warn", "Error", "Fatal"} - -const TimeFormat = "2006/01/02 15:04:05" - -const maxBufPoolSize = 16 - -type atomicInt32 int32 - -func (i *atomicInt32) Set(n int) { - atomic.StoreInt32((*int32)(i), int32(n)) -} - -func (i *atomicInt32) Get() int { - return int(atomic.LoadInt32((*int32)(i))) -} - -type Logger struct { - level atomicInt32 - flag int - - hMutex sync.Mutex - handler Handler - - bufMutex sync.Mutex - bufs [][]byte - - closed atomicInt32 -} - -//new a logger with specified handler and flag -func New(handler Handler, flag int) *Logger { - var l = new(Logger) - - l.level.Set(LevelInfo) - l.handler = handler - - l.flag = flag - - l.closed.Set(0) - - l.bufs = make([][]byte, 0, 16) - - return l -} - -//new a default logger with specified handler and flag: Ltime|Lfile|Llevel -func NewDefault(handler Handler) *Logger { - return New(handler, Ltime|Lfile|Llevel) -} - -func newStdHandler() *StreamHandler { - h, _ := NewStreamHandler(os.Stdout) - return h -} - -var std = NewDefault(newStdHandler()) - -func (l *Logger) popBuf() []byte { - l.bufMutex.Lock() - var buf []byte - if len(l.bufs) == 0 { - buf = make([]byte, 0, 1024) - } else { - buf = l.bufs[len(l.bufs)-1] - l.bufs = l.bufs[0 : len(l.bufs)-1] - } - l.bufMutex.Unlock() - - return buf -} - -func (l *Logger) putBuf(buf []byte) { - l.bufMutex.Lock() - if len(l.bufs) < maxBufPoolSize { - buf = buf[0:0] - l.bufs = append(l.bufs, buf) - } - l.bufMutex.Unlock() -} - -func (l *Logger) Close() { - if l.closed.Get() == 1 { - return - } - l.closed.Set(1) - - l.handler.Close() -} - -//set log level, any log level less than it will not log -func (l *Logger) SetLevel(level int) { - l.level.Set(level) -} - -// name can be in ["trace", "debug", "info", "warn", "error", "fatal"] -func (l *Logger) SetLevelByName(name string) { - name = strings.ToLower(name) - switch name { - case "trace": - l.SetLevel(LevelTrace) - case "debug": - l.SetLevel(LevelDebug) - case "info": - l.SetLevel(LevelInfo) - case "warn": - l.SetLevel(LevelWarn) - case "error": - l.SetLevel(LevelError) - case "fatal": - l.SetLevel(LevelFatal) - } -} - -func (l *Logger) SetHandler(h Handler) { - if l.closed.Get() == 1 { - return - } - - l.hMutex.Lock() - if l.handler != nil { - l.handler.Close() - } - l.handler = h - l.hMutex.Unlock() -} - -func (l *Logger) Output(callDepth int, level int, format string, v ...interface{}) { - if l.closed.Get() == 1 { - // closed - return - } - - if l.level.Get() > level { - // higher level can be logged - return - } - - var s string - if format == "" { - s = fmt.Sprint(v...) - } else { - s = fmt.Sprintf(format, v...) - } - - buf := l.popBuf() - - if l.flag&Ltime > 0 { - now := time.Now().Format(TimeFormat) - buf = append(buf, '[') - buf = append(buf, now...) - buf = append(buf, "] "...) - } - - if l.flag&Lfile > 0 { - _, file, line, ok := runtime.Caller(callDepth) - if !ok { - file = "???" - line = 0 - } else { - for i := len(file) - 1; i > 0; i-- { - if file[i] == '/' { - file = file[i+1:] - break - } - } - } - - buf = append(buf, file...) - buf = append(buf, ':') - - buf = strconv.AppendInt(buf, int64(line), 10) - buf = append(buf, ' ') - } - - if l.flag&Llevel > 0 { - buf = append(buf, '[') - buf = append(buf, LevelName[level]...) - buf = append(buf, "] "...) - } - - buf = append(buf, s...) 
- - if s[len(s)-1] != '\n' { - buf = append(buf, '\n') - } - - // l.msg <- buf - - l.hMutex.Lock() - l.handler.Write(buf) - l.hMutex.Unlock() - l.putBuf(buf) -} - -//log with Trace level -func (l *Logger) Trace(v ...interface{}) { - l.Output(2, LevelTrace, "", v...) -} - -//log with Debug level -func (l *Logger) Debug(v ...interface{}) { - l.Output(2, LevelDebug, "", v...) -} - -//log with info level -func (l *Logger) Info(v ...interface{}) { - l.Output(2, LevelInfo, "", v...) -} - -//log with warn level -func (l *Logger) Warn(v ...interface{}) { - l.Output(2, LevelWarn, "", v...) -} - -//log with error level -func (l *Logger) Error(v ...interface{}) { - l.Output(2, LevelError, "", v...) -} - -//log with fatal level -func (l *Logger) Fatal(v ...interface{}) { - l.Output(2, LevelFatal, "", v...) -} - -//log with Trace level -func (l *Logger) Tracef(format string, v ...interface{}) { - l.Output(2, LevelTrace, format, v...) -} - -//log with Debug level -func (l *Logger) Debugf(format string, v ...interface{}) { - l.Output(2, LevelDebug, format, v...) -} - -//log with info level -func (l *Logger) Infof(format string, v ...interface{}) { - l.Output(2, LevelInfo, format, v...) -} - -//log with warn level -func (l *Logger) Warnf(format string, v ...interface{}) { - l.Output(2, LevelWarn, format, v...) -} - -//log with error level -func (l *Logger) Errorf(format string, v ...interface{}) { - l.Output(2, LevelError, format, v...) -} - -//log with fatal level -func (l *Logger) Fatalf(format string, v ...interface{}) { - l.Output(2, LevelFatal, format, v...) -} - -func SetLevel(level int) { - std.SetLevel(level) -} - -// name can be in ["trace", "debug", "info", "warn", "error", "fatal"] -func SetLevelByName(name string) { - std.SetLevelByName(name) -} - -func SetHandler(h Handler) { - std.SetHandler(h) -} - -func Trace(v ...interface{}) { - std.Output(2, LevelTrace, "", v...) -} - -func Debug(v ...interface{}) { - std.Output(2, LevelDebug, "", v...) -} - -func Info(v ...interface{}) { - std.Output(2, LevelInfo, "", v...) -} - -func Warn(v ...interface{}) { - std.Output(2, LevelWarn, "", v...) -} - -func Error(v ...interface{}) { - std.Output(2, LevelError, "", v...) -} - -func Fatal(v ...interface{}) { - std.Output(2, LevelFatal, "", v...) -} - -func Tracef(format string, v ...interface{}) { - std.Output(2, LevelTrace, format, v...) -} - -func Debugf(format string, v ...interface{}) { - std.Output(2, LevelDebug, format, v...) -} - -func Infof(format string, v ...interface{}) { - std.Output(2, LevelInfo, format, v...) -} - -func Warnf(format string, v ...interface{}) { - std.Output(2, LevelWarn, format, v...) -} - -func Errorf(format string, v ...interface{}) { - std.Output(2, LevelError, format, v...) -} - -func Fatalf(format string, v ...interface{}) { - std.Output(2, LevelFatal, format, v...) -} diff --git a/vendor/github.com/siddontang/go/log/sockethandler.go b/vendor/github.com/siddontang/go/log/sockethandler.go deleted file mode 100644 index 3e7494d9501b..000000000000 --- a/vendor/github.com/siddontang/go/log/sockethandler.go +++ /dev/null @@ -1,65 +0,0 @@ -package log - -import ( - "encoding/binary" - "net" - "time" -) - -//SocketHandler writes log to a connectionl. -//Network protocol is simple: log length + log | log length + log. log length is uint32, bigendian. -//you must implement your own log server, maybe you can use logd instead simply. 
-type SocketHandler struct { - c net.Conn - protocol string - addr string -} - -func NewSocketHandler(protocol string, addr string) (*SocketHandler, error) { - s := new(SocketHandler) - - s.protocol = protocol - s.addr = addr - - return s, nil -} - -func (h *SocketHandler) Write(p []byte) (n int, err error) { - if err = h.connect(); err != nil { - return - } - - buf := make([]byte, len(p)+4) - - binary.BigEndian.PutUint32(buf, uint32(len(p))) - - copy(buf[4:], p) - - n, err = h.c.Write(buf) - if err != nil { - h.c.Close() - h.c = nil - } - return -} - -func (h *SocketHandler) Close() error { - if h.c != nil { - h.c.Close() - } - return nil -} - -func (h *SocketHandler) connect() error { - if h.c != nil { - return nil - } - - var err error - h.c, err = net.DialTimeout(h.protocol, h.addr, 20*time.Second) - if err != nil { - return err - } - - return nil -} diff --git a/vendor/github.com/siddontang/go/num/bytes.go b/vendor/github.com/siddontang/go/num/bytes.go deleted file mode 100644 index 1f3def74ac73..000000000000 --- a/vendor/github.com/siddontang/go/num/bytes.go +++ /dev/null @@ -1,67 +0,0 @@ -package num - -import ( - "encoding/binary" -) - -//all are bigendian format - -func BytesToUint16(b []byte) uint16 { - return binary.BigEndian.Uint16(b) -} - -func Uint16ToBytes(u uint16) []byte { - buf := make([]byte, 2) - binary.BigEndian.PutUint16(buf, u) - return buf -} - -func BytesToUint32(b []byte) uint32 { - return binary.BigEndian.Uint32(b) -} - -func Uint32ToBytes(u uint32) []byte { - buf := make([]byte, 4) - binary.BigEndian.PutUint32(buf, u) - return buf -} - -func BytesToUint64(b []byte) uint64 { - return binary.BigEndian.Uint64(b) -} - -func Uint64ToBytes(u uint64) []byte { - buf := make([]byte, 8) - binary.BigEndian.PutUint64(buf, u) - return buf -} - -func BytesToInt16(b []byte) int16 { - return int16(binary.BigEndian.Uint16(b)) -} - -func Int16ToBytes(u int16) []byte { - buf := make([]byte, 2) - binary.BigEndian.PutUint16(buf, uint16(u)) - return buf -} - -func BytesToInt32(b []byte) int32 { - return int32(binary.BigEndian.Uint32(b)) -} - -func Int32ToBytes(u int32) []byte { - buf := make([]byte, 4) - binary.BigEndian.PutUint32(buf, uint32(u)) - return buf -} - -func BytesToInt64(b []byte) int64 { - return int64(binary.BigEndian.Uint64(b)) -} - -func Int64ToBytes(u int64) []byte { - buf := make([]byte, 8) - binary.BigEndian.PutUint64(buf, uint64(u)) - return buf -} diff --git a/vendor/github.com/siddontang/go/num/cmp.go b/vendor/github.com/siddontang/go/num/cmp.go deleted file mode 100644 index 78f8d4f14af3..000000000000 --- a/vendor/github.com/siddontang/go/num/cmp.go +++ /dev/null @@ -1,161 +0,0 @@ -package num - -func MinUint(a uint, b uint) uint { - if a > b { - return b - } else { - return a - } -} - -func MaxUint(a uint, b uint) uint { - if a > b { - return a - } else { - return b - } -} - -func MinInt(a int, b int) int { - if a > b { - return b - } else { - return a - } -} - -func MaxInt(a int, b int) int { - if a > b { - return a - } else { - return b - } -} - -func MinUint8(a uint8, b uint8) uint8 { - if a > b { - return b - } else { - return a - } -} - -func MaxUint8(a uint8, b uint8) uint8 { - if a > b { - return a - } else { - return b - } -} - -func MinInt8(a int8, b int8) int8 { - if a > b { - return b - } else { - return a - } -} - -func MaxInt8(a int8, b int8) int8 { - if a > b { - return a - } else { - return b - } -} - -func MinUint16(a uint16, b uint16) uint16 { - if a > b { - return b - } else { - return a - } -} - -func MaxUint16(a uint16, b uint16) uint16 { - 
if a > b { - return a - } else { - return b - } -} - -func MinInt16(a int16, b int16) int16 { - if a > b { - return b - } else { - return a - } -} - -func MaxInt16(a int16, b int16) int16 { - if a > b { - return a - } else { - return b - } -} - -func MinUint32(a uint32, b uint32) uint32 { - if a > b { - return b - } else { - return a - } -} - -func MaxUint32(a uint32, b uint32) uint32 { - if a > b { - return a - } else { - return b - } -} - -func MinInt32(a int32, b int32) int32 { - if a > b { - return b - } else { - return a - } -} - -func MaxInt32(a int32, b int32) int32 { - if a > b { - return a - } else { - return b - } -} - -func MinUint64(a uint64, b uint64) uint64 { - if a > b { - return b - } else { - return a - } -} - -func MaxUint64(a uint64, b uint64) uint64 { - if a > b { - return a - } else { - return b - } -} - -func MinInt64(a int64, b int64) int64 { - if a > b { - return b - } else { - return a - } -} - -func MaxInt64(a int64, b int64) int64 { - if a > b { - return a - } else { - return b - } -} diff --git a/vendor/github.com/siddontang/go/num/str.go b/vendor/github.com/siddontang/go/num/str.go deleted file mode 100644 index 4b304817b86d..000000000000 --- a/vendor/github.com/siddontang/go/num/str.go +++ /dev/null @@ -1,157 +0,0 @@ -package num - -import ( - "strconv" -) - -func ParseUint(s string) (uint, error) { - if v, err := strconv.ParseUint(s, 10, 0); err != nil { - return 0, err - } else { - return uint(v), nil - } -} - -func ParseUint8(s string) (uint8, error) { - if v, err := strconv.ParseUint(s, 10, 8); err != nil { - return 0, err - } else { - return uint8(v), nil - } -} - -func ParseUint16(s string) (uint16, error) { - if v, err := strconv.ParseUint(s, 10, 16); err != nil { - return 0, err - } else { - return uint16(v), nil - } -} - -func ParseUint32(s string) (uint32, error) { - if v, err := strconv.ParseUint(s, 10, 32); err != nil { - return 0, err - } else { - return uint32(v), nil - } -} - -func ParseUint64(s string) (uint64, error) { - return strconv.ParseUint(s, 10, 64) -} - -func ParseInt(s string) (int, error) { - if v, err := strconv.ParseInt(s, 10, 0); err != nil { - return 0, err - } else { - return int(v), nil - } -} - -func ParseInt8(s string) (int8, error) { - if v, err := strconv.ParseInt(s, 10, 8); err != nil { - return 0, err - } else { - return int8(v), nil - } -} - -func ParseInt16(s string) (int16, error) { - if v, err := strconv.ParseInt(s, 10, 16); err != nil { - return 0, err - } else { - return int16(v), nil - } -} - -func ParseInt32(s string) (int32, error) { - if v, err := strconv.ParseInt(s, 10, 32); err != nil { - return 0, err - } else { - return int32(v), nil - } -} - -func ParseInt64(s string) (int64, error) { - return strconv.ParseInt(s, 10, 64) -} - -func FormatInt(v int) string { - return strconv.FormatInt(int64(v), 10) -} - -func FormatInt8(v int8) string { - return strconv.FormatInt(int64(v), 10) -} - -func FormatInt16(v int16) string { - return strconv.FormatInt(int64(v), 10) -} - -func FormatInt32(v int32) string { - return strconv.FormatInt(int64(v), 10) -} - -func FormatInt64(v int64) string { - return strconv.FormatInt(int64(v), 10) -} - -func FormatUint(v uint) string { - return strconv.FormatUint(uint64(v), 10) -} - -func FormatUint8(v uint8) string { - return strconv.FormatUint(uint64(v), 10) -} - -func FormatUint16(v uint16) string { - return strconv.FormatUint(uint64(v), 10) -} - -func FormatUint32(v uint32) string { - return strconv.FormatUint(uint64(v), 10) -} - -func FormatUint64(v uint64) string { - return 
strconv.FormatUint(uint64(v), 10) -} - -func FormatIntToSlice(v int) []byte { - return strconv.AppendInt(nil, int64(v), 10) -} - -func FormatInt8ToSlice(v int8) []byte { - return strconv.AppendInt(nil, int64(v), 10) -} - -func FormatInt16ToSlice(v int16) []byte { - return strconv.AppendInt(nil, int64(v), 10) -} - -func FormatInt32ToSlice(v int32) []byte { - return strconv.AppendInt(nil, int64(v), 10) -} - -func FormatInt64ToSlice(v int64) []byte { - return strconv.AppendInt(nil, int64(v), 10) -} - -func FormatUintToSlice(v uint) []byte { - return strconv.AppendUint(nil, uint64(v), 10) -} - -func FormatUint8ToSlice(v uint8) []byte { - return strconv.AppendUint(nil, uint64(v), 10) -} - -func FormatUint16ToSlice(v uint16) []byte { - return strconv.AppendUint(nil, uint64(v), 10) -} - -func FormatUint32ToSlice(v uint32) []byte { - return strconv.AppendUint(nil, uint64(v), 10) -} - -func FormatUint64ToSlice(v uint64) []byte { - return strconv.AppendUint(nil, uint64(v), 10) -} diff --git a/vendor/github.com/siddontang/go/snappy/LICENSE b/vendor/github.com/siddontang/go/snappy/LICENSE deleted file mode 100644 index 6050c10f4c8b..000000000000 --- a/vendor/github.com/siddontang/go/snappy/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/siddontang/go/snappy/decode.go b/vendor/github.com/siddontang/go/snappy/decode.go deleted file mode 100644 index d93c1b9dbfd7..000000000000 --- a/vendor/github.com/siddontang/go/snappy/decode.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package snappy - -import ( - "encoding/binary" - "errors" -) - -// ErrCorrupt reports that the input is invalid. -var ErrCorrupt = errors.New("snappy: corrupt input") - -// DecodedLen returns the length of the decoded block. 
-func DecodedLen(src []byte) (int, error) { - v, _, err := decodedLen(src) - return v, err -} - -// decodedLen returns the length of the decoded block and the number of bytes -// that the length header occupied. -func decodedLen(src []byte) (blockLen, headerLen int, err error) { - v, n := binary.Uvarint(src) - if n == 0 { - return 0, 0, ErrCorrupt - } - if uint64(int(v)) != v { - return 0, 0, errors.New("snappy: decoded block is too large") - } - return int(v), n, nil -} - -// Decode returns the decoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire decoded block. -// Otherwise, a newly allocated slice will be returned. -// It is valid to pass a nil dst. -func Decode(dst, src []byte) ([]byte, error) { - dLen, s, err := decodedLen(src) - if err != nil { - return nil, err - } - if len(dst) < dLen { - dst = make([]byte, dLen) - } - - var d, offset, length int - for s < len(src) { - switch src[s] & 0x03 { - case tagLiteral: - x := uint(src[s] >> 2) - switch { - case x < 60: - s += 1 - case x == 60: - s += 2 - if s > len(src) { - return nil, ErrCorrupt - } - x = uint(src[s-1]) - case x == 61: - s += 3 - if s > len(src) { - return nil, ErrCorrupt - } - x = uint(src[s-2]) | uint(src[s-1])<<8 - case x == 62: - s += 4 - if s > len(src) { - return nil, ErrCorrupt - } - x = uint(src[s-3]) | uint(src[s-2])<<8 | uint(src[s-1])<<16 - case x == 63: - s += 5 - if s > len(src) { - return nil, ErrCorrupt - } - x = uint(src[s-4]) | uint(src[s-3])<<8 | uint(src[s-2])<<16 | uint(src[s-1])<<24 - } - length = int(x + 1) - if length <= 0 { - return nil, errors.New("snappy: unsupported literal length") - } - if length > len(dst)-d || length > len(src)-s { - return nil, ErrCorrupt - } - copy(dst[d:], src[s:s+length]) - d += length - s += length - continue - - case tagCopy1: - s += 2 - if s > len(src) { - return nil, ErrCorrupt - } - length = 4 + int(src[s-2])>>2&0x7 - offset = int(src[s-2])&0xe0<<3 | int(src[s-1]) - - case tagCopy2: - s += 3 - if s > len(src) { - return nil, ErrCorrupt - } - length = 1 + int(src[s-3])>>2 - offset = int(src[s-2]) | int(src[s-1])<<8 - - case tagCopy4: - return nil, errors.New("snappy: unsupported COPY_4 tag") - } - - end := d + length - if offset > d || end > len(dst) { - return nil, ErrCorrupt - } - for ; d < end; d++ { - dst[d] = dst[d-offset] - } - } - if d != dLen { - return nil, ErrCorrupt - } - return dst[:d], nil -} diff --git a/vendor/github.com/siddontang/go/snappy/encode.go b/vendor/github.com/siddontang/go/snappy/encode.go deleted file mode 100644 index b2371db11c8f..000000000000 --- a/vendor/github.com/siddontang/go/snappy/encode.go +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package snappy - -import ( - "encoding/binary" -) - -// We limit how far copy back-references can go, the same as the C++ code. -const maxOffset = 1 << 15 - -// emitLiteral writes a literal chunk and returns the number of bytes written. 
-func emitLiteral(dst, lit []byte) int { - i, n := 0, uint(len(lit)-1) - switch { - case n < 60: - dst[0] = uint8(n)<<2 | tagLiteral - i = 1 - case n < 1<<8: - dst[0] = 60<<2 | tagLiteral - dst[1] = uint8(n) - i = 2 - case n < 1<<16: - dst[0] = 61<<2 | tagLiteral - dst[1] = uint8(n) - dst[2] = uint8(n >> 8) - i = 3 - case n < 1<<24: - dst[0] = 62<<2 | tagLiteral - dst[1] = uint8(n) - dst[2] = uint8(n >> 8) - dst[3] = uint8(n >> 16) - i = 4 - case int64(n) < 1<<32: - dst[0] = 63<<2 | tagLiteral - dst[1] = uint8(n) - dst[2] = uint8(n >> 8) - dst[3] = uint8(n >> 16) - dst[4] = uint8(n >> 24) - i = 5 - default: - panic("snappy: source buffer is too long") - } - if copy(dst[i:], lit) != len(lit) { - panic("snappy: destination buffer is too short") - } - return i + len(lit) -} - -// emitCopy writes a copy chunk and returns the number of bytes written. -func emitCopy(dst []byte, offset, length int) int { - i := 0 - for length > 0 { - x := length - 4 - if 0 <= x && x < 1<<3 && offset < 1<<11 { - dst[i+0] = uint8(offset>>8)&0x07<<5 | uint8(x)<<2 | tagCopy1 - dst[i+1] = uint8(offset) - i += 2 - break - } - - x = length - if x > 1<<6 { - x = 1 << 6 - } - dst[i+0] = uint8(x-1)<<2 | tagCopy2 - dst[i+1] = uint8(offset) - dst[i+2] = uint8(offset >> 8) - i += 3 - length -= x - } - return i -} - -// Encode returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// It is valid to pass a nil dst. -func Encode(dst, src []byte) ([]byte, error) { - if n := MaxEncodedLen(len(src)); len(dst) < n { - dst = make([]byte, n) - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - // Return early if src is short. - if len(src) <= 4 { - if len(src) != 0 { - d += emitLiteral(dst[d:], src) - } - return dst[:d], nil - } - - // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. - const maxTableSize = 1 << 14 - shift, tableSize := uint(32-8), 1<<8 - for tableSize < maxTableSize && tableSize < len(src) { - shift-- - tableSize *= 2 - } - var table [maxTableSize]int - - // Iterate over the source bytes. - var ( - s int // The iterator position. - t int // The last position with the same hash as s. - lit int // The start position of any pending literal bytes. - ) - for s+3 < len(src) { - // Update the hash table. - b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3] - h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24 - p := &table[(h*0x1e35a7bd)>>shift] - // We need to to store values in [-1, inf) in table. To save - // some initialization time, (re)use the table's zero value - // and shift the values against this zero: add 1 on writes, - // subtract 1 on reads. - t, *p = *p-1, s+1 - // If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte. - if t < 0 || s-t >= maxOffset || b0 != src[t] || b1 != src[t+1] || b2 != src[t+2] || b3 != src[t+3] { - s++ - continue - } - // Otherwise, we have a match. First, emit any pending literal bytes. - if lit != s { - d += emitLiteral(dst[d:], src[lit:s]) - } - // Extend the match to be as long as possible. - s0 := s - s, t = s+4, t+4 - for s < len(src) && src[s] == src[t] { - s++ - t++ - } - // Emit the copied bytes. - d += emitCopy(dst[d:], s-t, s-s0) - lit = s - } - - // Emit any final pending literal bytes and return. 
- if lit != len(src) { - d += emitLiteral(dst[d:], src[lit:]) - } - return dst[:d], nil -} - -// MaxEncodedLen returns the maximum length of a snappy block, given its -// uncompressed length. -func MaxEncodedLen(srcLen int) int { - // Compressed data can be defined as: - // compressed := item* literal* - // item := literal* copy - // - // The trailing literal sequence has a space blowup of at most 62/60 - // since a literal of length 60 needs one tag byte + one extra byte - // for length information. - // - // Item blowup is trickier to measure. Suppose the "copy" op copies - // 4 bytes of data. Because of a special check in the encoding code, - // we produce a 4-byte copy only if the offset is < 65536. Therefore - // the copy op takes 3 bytes to encode, and this type of item leads - // to at most the 62/60 blowup for representing literals. - // - // Suppose the "copy" op copies 5 bytes of data. If the offset is big - // enough, it will take 5 bytes to encode the copy op. Therefore the - // worst case here is a one-byte literal followed by a five-byte copy. - // That is, 6 bytes of input turn into 7 bytes of "compressed" data. - // - // This last factor dominates the blowup, so the final estimate is: - return 32 + srcLen + srcLen/6 -} diff --git a/vendor/github.com/siddontang/go/snappy/snappy.go b/vendor/github.com/siddontang/go/snappy/snappy.go deleted file mode 100644 index 2f1b790d0b71..000000000000 --- a/vendor/github.com/siddontang/go/snappy/snappy.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package snappy implements the snappy block-based compression format. -// It aims for very high speeds and reasonable compression. -// -// The C++ snappy implementation is at http://code.google.com/p/snappy/ -package snappy - -/* -Each encoded block begins with the varint-encoded length of the decoded data, -followed by a sequence of chunks. Chunks begin and end on byte boundaries. The -first byte of each chunk is broken into its 2 least and 6 most significant bits -called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. -Zero means a literal tag. All other values mean a copy tag. - -For literal tags: - - If m < 60, the next 1 + m bytes are literal bytes. - - Otherwise, let n be the little-endian unsigned integer denoted by the next - m - 59 bytes. The next 1 + n bytes after that are literal bytes. - -For copy tags, length bytes are copied from offset bytes ago, in the style of -Lempel-Ziv compression algorithms. In particular: - - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). - The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 - of the offset. The next byte is bits 0-7 of the offset. - - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). - The length is 1 + m. The offset is the little-endian unsigned integer - denoted by the next 2 bytes. - - For l == 3, this tag is a legacy format that is no longer supported. -*/ -const ( - tagLiteral = 0x00 - tagCopy1 = 0x01 - tagCopy2 = 0x02 - tagCopy4 = 0x03 -) diff --git a/vendor/github.com/siddontang/go/sync2/atomic.go b/vendor/github.com/siddontang/go/sync2/atomic.go deleted file mode 100644 index 382fc20dfec7..000000000000 --- a/vendor/github.com/siddontang/go/sync2/atomic.go +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright 2013, Google Inc. All rights reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package sync2 - -import ( - "sync" - "sync/atomic" - "time" -) - -type AtomicInt32 int32 - -func (i *AtomicInt32) Add(n int32) int32 { - return atomic.AddInt32((*int32)(i), n) -} - -func (i *AtomicInt32) Set(n int32) { - atomic.StoreInt32((*int32)(i), n) -} - -func (i *AtomicInt32) Get() int32 { - return atomic.LoadInt32((*int32)(i)) -} - -func (i *AtomicInt32) CompareAndSwap(oldval, newval int32) (swapped bool) { - return atomic.CompareAndSwapInt32((*int32)(i), oldval, newval) -} - -type AtomicUint32 uint32 - -func (i *AtomicUint32) Add(n uint32) uint32 { - return atomic.AddUint32((*uint32)(i), n) -} - -func (i *AtomicUint32) Set(n uint32) { - atomic.StoreUint32((*uint32)(i), n) -} - -func (i *AtomicUint32) Get() uint32 { - return atomic.LoadUint32((*uint32)(i)) -} - -func (i *AtomicUint32) CompareAndSwap(oldval, newval uint32) (swapped bool) { - return atomic.CompareAndSwapUint32((*uint32)(i), oldval, newval) -} - -type AtomicInt64 int64 - -func (i *AtomicInt64) Add(n int64) int64 { - return atomic.AddInt64((*int64)(i), n) -} - -func (i *AtomicInt64) Set(n int64) { - atomic.StoreInt64((*int64)(i), n) -} - -func (i *AtomicInt64) Get() int64 { - return atomic.LoadInt64((*int64)(i)) -} - -func (i *AtomicInt64) CompareAndSwap(oldval, newval int64) (swapped bool) { - return atomic.CompareAndSwapInt64((*int64)(i), oldval, newval) -} - -type AtomicUint64 uint64 - -func (i *AtomicUint64) Add(n uint64) uint64 { - return atomic.AddUint64((*uint64)(i), n) -} - -func (i *AtomicUint64) Set(n uint64) { - atomic.StoreUint64((*uint64)(i), n) -} - -func (i *AtomicUint64) Get() uint64 { - return atomic.LoadUint64((*uint64)(i)) -} - -func (i *AtomicUint64) CompareAndSwap(oldval, newval uint64) (swapped bool) { - return atomic.CompareAndSwapUint64((*uint64)(i), oldval, newval) -} - -type AtomicDuration int64 - -func (d *AtomicDuration) Add(duration time.Duration) time.Duration { - return time.Duration(atomic.AddInt64((*int64)(d), int64(duration))) -} - -func (d *AtomicDuration) Set(duration time.Duration) { - atomic.StoreInt64((*int64)(d), int64(duration)) -} - -func (d *AtomicDuration) Get() time.Duration { - return time.Duration(atomic.LoadInt64((*int64)(d))) -} - -func (d *AtomicDuration) CompareAndSwap(oldval, newval time.Duration) (swapped bool) { - return atomic.CompareAndSwapInt64((*int64)(d), int64(oldval), int64(newval)) -} - -// AtomicString gives you atomic-style APIs for string, but -// it's only a convenience wrapper that uses a mutex. So, it's -// not as efficient as the rest of the atomic types. 
-type AtomicString struct { - mu sync.Mutex - str string -} - -func (s *AtomicString) Set(str string) { - s.mu.Lock() - s.str = str - s.mu.Unlock() -} - -func (s *AtomicString) Get() string { - s.mu.Lock() - str := s.str - s.mu.Unlock() - return str -} - -func (s *AtomicString) CompareAndSwap(oldval, newval string) (swapped bool) { - s.mu.Lock() - defer s.mu.Unlock() - if s.str == oldval { - s.str = newval - return true - } - return false -} - -type AtomicBool int32 - -func (b *AtomicBool) Set(v bool) { - if v { - atomic.StoreInt32((*int32)(b), 1) - } else { - atomic.StoreInt32((*int32)(b), 0) - } -} - -func (b *AtomicBool) Get() bool { - return atomic.LoadInt32((*int32)(b)) == 1 -} diff --git a/vendor/github.com/siddontang/go/sync2/semaphore.go b/vendor/github.com/siddontang/go/sync2/semaphore.go deleted file mode 100644 index d310da7294c7..000000000000 --- a/vendor/github.com/siddontang/go/sync2/semaphore.go +++ /dev/null @@ -1,65 +0,0 @@ -package sync2 - -import ( - "sync" - "sync/atomic" - "time" -) - -func NewSemaphore(initialCount int) *Semaphore { - res := &Semaphore{ - counter: int64(initialCount), - } - res.cond.L = &res.lock - return res -} - -type Semaphore struct { - lock sync.Mutex - cond sync.Cond - counter int64 -} - -func (s *Semaphore) Release() { - s.lock.Lock() - s.counter += 1 - if s.counter >= 0 { - s.cond.Signal() - } - s.lock.Unlock() -} - -func (s *Semaphore) Acquire() { - s.lock.Lock() - for s.counter < 1 { - s.cond.Wait() - } - s.counter -= 1 - s.lock.Unlock() -} - -func (s *Semaphore) AcquireTimeout(timeout time.Duration) bool { - done := make(chan bool, 1) - // Gate used to communicate between the threads and decide what the result - // is. If the main thread decides, we have timed out, otherwise we succeed. - decided := new(int32) - go func() { - s.Acquire() - if atomic.SwapInt32(decided, 1) == 0 { - done <- true - } else { - // If we already decided the result, and this thread did not win - s.Release() - } - }() - select { - case <-done: - return true - case <-time.NewTimer(timeout).C: - if atomic.SwapInt32(decided, 1) == 1 { - // The other thread already decided the result - return true - } - return false - } -} diff --git a/vendor/github.com/siddontang/ledisdb/LICENSE b/vendor/github.com/siddontang/ledisdb/LICENSE deleted file mode 100644 index 7ece9fdf5a64..000000000000 --- a/vendor/github.com/siddontang/ledisdb/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2014 siddontang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
\ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/client/go/LICENSE b/vendor/github.com/siddontang/ledisdb/client/go/LICENSE deleted file mode 100644 index 7ece9fdf5a64..000000000000 --- a/vendor/github.com/siddontang/ledisdb/client/go/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2014 siddontang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/config/config.go b/vendor/github.com/siddontang/ledisdb/config/config.go deleted file mode 100644 index 92ea190957ea..000000000000 --- a/vendor/github.com/siddontang/ledisdb/config/config.go +++ /dev/null @@ -1,279 +0,0 @@ -package config - -import ( - "bytes" - "errors" - "github.com/BurntSushi/toml" - "github.com/siddontang/go/ioutil2" - "io" - "io/ioutil" - "sync" -) - -var ( - ErrNoConfigFile = errors.New("Running without a config file") -) - -const ( - DefaultAddr string = "127.0.0.1:6380" - - DefaultDBName string = "goleveldb" - - DefaultDataDir string = "./var" - - KB int = 1024 - MB int = KB * 1024 - GB int = MB * 1024 -) - -type LevelDBConfig struct { - Compression bool `toml:"compression"` - BlockSize int `toml:"block_size"` - WriteBufferSize int `toml:"write_buffer_size"` - CacheSize int `toml:"cache_size"` - MaxOpenFiles int `toml:"max_open_files"` -} - -type RocksDBConfig struct { - Compression int `toml:"compression"` - BlockSize int `toml:"block_size"` - WriteBufferSize int `toml:"write_buffer_size"` - CacheSize int `toml:"cache_size"` - MaxOpenFiles int `toml:"max_open_files"` - MaxWriteBufferNum int `toml:"max_write_buffer_num"` - MinWriteBufferNumberToMerge int `toml:"min_write_buffer_number_to_merge"` - NumLevels int `toml:"num_levels"` - Level0FileNumCompactionTrigger int `toml:"level0_file_num_compaction_trigger"` - Level0SlowdownWritesTrigger int `toml:"level0_slowdown_writes_trigger"` - Level0StopWritesTrigger int `toml:"level0_stop_writes_trigger"` - TargetFileSizeBase int `toml:"target_file_size_base"` - TargetFileSizeMultiplier int `toml:"target_file_size_multiplier"` - MaxBytesForLevelBase int `toml:"max_bytes_for_level_base"` - MaxBytesForLevelMultiplier int `toml:"max_bytes_for_level_multiplier"` - DisableAutoCompactions bool `toml:"disable_auto_compactions"` - DisableDataSync bool `toml:"disable_data_sync"` - UseFsync bool `toml:"use_fsync"` - MaxBackgroundCompactions int `toml:"max_background_compactions"` - MaxBackgroundFlushes int `toml:"max_background_flushes"` - AllowOsBuffer bool `toml:"allow_os_buffer"` 
- EnableStatistics bool `toml:"enable_statistics"` - StatsDumpPeriodSec int `toml:"stats_dump_period_sec"` - BackgroundThreads int `toml:"background_theads"` - HighPriorityBackgroundThreads int `toml:"high_priority_background_threads"` - DisableWAL bool `toml:"disable_wal"` -} - -type LMDBConfig struct { - MapSize int `toml:"map_size"` - NoSync bool `toml:"nosync"` -} - -type ReplicationConfig struct { - Path string `toml:"path"` - Sync bool `toml:"sync"` - WaitSyncTime int `toml:"wait_sync_time"` - WaitMaxSlaveAcks int `toml:"wait_max_slave_acks"` - ExpiredLogDays int `toml:"expired_log_days"` - StoreName string `toml:"store_name"` - MaxLogFileSize int64 `toml:"max_log_file_size"` - MaxLogFileNum int `toml:"max_log_file_num"` - SyncLog int `toml:"sync_log"` - Compression bool `toml:"compression"` - UseMmap bool `toml:"use_mmap"` -} - -type SnapshotConfig struct { - Path string `toml:"path"` - MaxNum int `toml:"max_num"` -} - -type Config struct { - m sync.RWMutex `toml:"-"` - - FileName string `toml:"-"` - - Addr string `toml:"addr"` - - HttpAddr string `toml:"http_addr"` - - SlaveOf string `toml:"slaveof"` - - Readonly bool `toml:readonly` - - DataDir string `toml:"data_dir"` - - DBName string `toml:"db_name"` - DBPath string `toml:"db_path"` - DBSyncCommit int `toml:"db_sync_commit"` - - LevelDB LevelDBConfig `toml:"leveldb"` - RocksDB RocksDBConfig `toml:"rocksdb"` - - LMDB LMDBConfig `toml:"lmdb"` - - AccessLog string `toml:"access_log"` - - UseReplication bool `toml:"use_replication"` - Replication ReplicationConfig `toml:"replication"` - - Snapshot SnapshotConfig `toml:"snapshot"` - - ConnReadBufferSize int `toml:"conn_read_buffer_size"` - ConnWriteBufferSize int `toml:"conn_write_buffer_size"` - ConnKeepaliveInterval int `toml:"conn_keepalive_interval"` - - TTLCheckInterval int `toml:"ttl_check_interval"` -} - -func NewConfigWithFile(fileName string) (*Config, error) { - data, err := ioutil.ReadFile(fileName) - if err != nil { - return nil, err - } - - if cfg, err := NewConfigWithData(data); err != nil { - return nil, err - } else { - cfg.FileName = fileName - return cfg, nil - } -} - -func NewConfigWithData(data []byte) (*Config, error) { - cfg := NewConfigDefault() - - _, err := toml.Decode(string(data), cfg) - if err != nil { - return nil, err - } - - cfg.adjust() - - return cfg, nil -} - -func NewConfigDefault() *Config { - cfg := new(Config) - - cfg.Addr = DefaultAddr - cfg.HttpAddr = "" - - cfg.DataDir = DefaultDataDir - - cfg.DBName = DefaultDBName - - cfg.SlaveOf = "" - cfg.Readonly = false - - // disable access log - cfg.AccessLog = "" - - cfg.LMDB.MapSize = 20 * MB - cfg.LMDB.NoSync = true - - cfg.UseReplication = false - cfg.Replication.WaitSyncTime = 500 - cfg.Replication.Compression = true - cfg.Replication.WaitMaxSlaveAcks = 2 - cfg.Replication.SyncLog = 0 - cfg.Replication.UseMmap = true - cfg.Snapshot.MaxNum = 1 - - cfg.RocksDB.AllowOsBuffer = true - cfg.RocksDB.EnableStatistics = false - cfg.RocksDB.UseFsync = false - cfg.RocksDB.DisableAutoCompactions = false - cfg.RocksDB.AllowOsBuffer = true - cfg.RocksDB.DisableWAL = false - - cfg.adjust() - - return cfg -} - -func getDefault(d int, s int) int { - if s <= 0 { - return d - } else { - return s - } -} - -func (cfg *Config) adjust() { - cfg.LevelDB.adjust() - - cfg.RocksDB.adjust() - - cfg.Replication.ExpiredLogDays = getDefault(7, cfg.Replication.ExpiredLogDays) - cfg.Replication.MaxLogFileNum = getDefault(50, cfg.Replication.MaxLogFileNum) - cfg.ConnReadBufferSize = getDefault(4*KB, cfg.ConnReadBufferSize) - 
cfg.ConnWriteBufferSize = getDefault(4*KB, cfg.ConnWriteBufferSize) - cfg.TTLCheckInterval = getDefault(1, cfg.TTLCheckInterval) - -} - -func (cfg *LevelDBConfig) adjust() { - cfg.CacheSize = getDefault(4*MB, cfg.CacheSize) - cfg.BlockSize = getDefault(4*KB, cfg.BlockSize) - cfg.WriteBufferSize = getDefault(4*MB, cfg.WriteBufferSize) - cfg.MaxOpenFiles = getDefault(1024, cfg.MaxOpenFiles) -} - -func (cfg *RocksDBConfig) adjust() { - cfg.CacheSize = getDefault(4*MB, cfg.CacheSize) - cfg.BlockSize = getDefault(4*KB, cfg.BlockSize) - cfg.WriteBufferSize = getDefault(4*MB, cfg.WriteBufferSize) - cfg.MaxOpenFiles = getDefault(1024, cfg.MaxOpenFiles) - cfg.MaxWriteBufferNum = getDefault(2, cfg.MaxWriteBufferNum) - cfg.MinWriteBufferNumberToMerge = getDefault(1, cfg.MinWriteBufferNumberToMerge) - cfg.NumLevels = getDefault(7, cfg.NumLevels) - cfg.Level0FileNumCompactionTrigger = getDefault(4, cfg.Level0FileNumCompactionTrigger) - cfg.Level0SlowdownWritesTrigger = getDefault(16, cfg.Level0SlowdownWritesTrigger) - cfg.Level0StopWritesTrigger = getDefault(64, cfg.Level0StopWritesTrigger) - cfg.TargetFileSizeBase = getDefault(32*MB, cfg.TargetFileSizeBase) - cfg.TargetFileSizeMultiplier = getDefault(1, cfg.TargetFileSizeMultiplier) - cfg.MaxBytesForLevelBase = getDefault(32*MB, cfg.MaxBytesForLevelBase) - cfg.MaxBytesForLevelMultiplier = getDefault(1, cfg.MaxBytesForLevelMultiplier) - cfg.MaxBackgroundCompactions = getDefault(1, cfg.MaxBackgroundCompactions) - cfg.MaxBackgroundFlushes = getDefault(1, cfg.MaxBackgroundFlushes) - cfg.StatsDumpPeriodSec = getDefault(3600, cfg.StatsDumpPeriodSec) - cfg.BackgroundThreads = getDefault(2, cfg.BackgroundThreads) - cfg.HighPriorityBackgroundThreads = getDefault(1, cfg.HighPriorityBackgroundThreads) -} - -func (cfg *Config) Dump(w io.Writer) error { - e := toml.NewEncoder(w) - e.Indent = "" - return e.Encode(cfg) -} - -func (cfg *Config) DumpFile(fileName string) error { - var b bytes.Buffer - - if err := cfg.Dump(&b); err != nil { - return err - } - - return ioutil2.WriteFileAtomic(fileName, b.Bytes(), 0644) -} - -func (cfg *Config) Rewrite() error { - if len(cfg.FileName) == 0 { - return ErrNoConfigFile - } - - return cfg.DumpFile(cfg.FileName) -} - -func (cfg *Config) GetReadonly() bool { - cfg.m.RLock() - b := cfg.Readonly - cfg.m.RUnlock() - return b -} - -func (cfg *Config) SetReadonly(b bool) { - cfg.m.Lock() - cfg.Readonly = b - cfg.m.Unlock() -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/batch.go b/vendor/github.com/siddontang/ledisdb/ledis/batch.go deleted file mode 100644 index c9064df4927a..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/batch.go +++ /dev/null @@ -1,138 +0,0 @@ -package ledis - -import ( - "github.com/siddontang/go/log" - "github.com/siddontang/ledisdb/rpl" - "github.com/siddontang/ledisdb/store" - "sync" -) - -type batch struct { - l *Ledis - - *store.WriteBatch - - sync.Locker - - tx *Tx -} - -func (b *batch) Commit() error { - if b.l.cfg.GetReadonly() { - return ErrWriteInROnly - } - - if b.tx == nil { - return b.l.handleCommit(b.WriteBatch, b.WriteBatch) - } else { - if b.l.r != nil { - if err := b.tx.data.Append(b.WriteBatch.BatchData()); err != nil { - return err - } - } - return b.WriteBatch.Commit() - } -} - -func (b *batch) Lock() { - b.Locker.Lock() -} - -func (b *batch) Unlock() { - b.WriteBatch.Rollback() - b.Locker.Unlock() -} - -func (b *batch) Put(key []byte, value []byte) { - b.WriteBatch.Put(key, value) -} - -func (b *batch) Delete(key []byte) { - b.WriteBatch.Delete(key) -} - -type 
dbBatchLocker struct { - l *sync.Mutex - wrLock *sync.RWMutex -} - -func (l *dbBatchLocker) Lock() { - l.wrLock.RLock() - l.l.Lock() -} - -func (l *dbBatchLocker) Unlock() { - l.l.Unlock() - l.wrLock.RUnlock() -} - -type txBatchLocker struct { -} - -func (l *txBatchLocker) Lock() {} -func (l *txBatchLocker) Unlock() {} - -type multiBatchLocker struct { -} - -func (l *multiBatchLocker) Lock() {} -func (l *multiBatchLocker) Unlock() {} - -func (l *Ledis) newBatch(wb *store.WriteBatch, locker sync.Locker, tx *Tx) *batch { - b := new(batch) - b.l = l - b.WriteBatch = wb - - b.Locker = locker - - b.tx = tx - - return b -} - -type commiter interface { - Commit() error -} - -type commitDataGetter interface { - Data() []byte -} - -func (l *Ledis) handleCommit(g commitDataGetter, c commiter) error { - l.commitLock.Lock() - - var err error - if l.r != nil { - var rl *rpl.Log - if rl, err = l.r.Log(g.Data()); err != nil { - l.commitLock.Unlock() - - log.Fatal("write wal error %s", err.Error()) - return err - } - - l.propagate(rl) - - if err = c.Commit(); err != nil { - l.commitLock.Unlock() - - log.Fatal("commit error %s", err.Error()) - l.noticeReplication() - return err - } - - if err = l.r.UpdateCommitID(rl.ID); err != nil { - l.commitLock.Unlock() - - log.Fatal("update commit id error %s", err.Error()) - l.noticeReplication() - return err - } - } else { - err = c.Commit() - } - - l.commitLock.Unlock() - - return err -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/const.go b/vendor/github.com/siddontang/ledisdb/ledis/const.go deleted file mode 100644 index a61cb901bb27..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/const.go +++ /dev/null @@ -1,100 +0,0 @@ -package ledis - -import ( - "errors" -) - -const Version = "0.4" - -const ( - NoneType byte = 0 - KVType byte = 1 - HashType byte = 2 - HSizeType byte = 3 - ListType byte = 4 - LMetaType byte = 5 - ZSetType byte = 6 - ZSizeType byte = 7 - ZScoreType byte = 8 - BitType byte = 9 - BitMetaType byte = 10 - SetType byte = 11 - SSizeType byte = 12 - - maxDataType byte = 100 - - /* - I make a big mistake about TTL time key format and have to use a new one (change 101 to 103). - You must run the ledis-upgrade-ttl to upgrade db. 
- */ - ObsoleteExpTimeType byte = 101 - ExpMetaType byte = 102 - ExpTimeType byte = 103 - - MetaType byte = 201 -) - -var ( - TypeName = map[byte]string{ - KVType: "kv", - HashType: "hash", - HSizeType: "hsize", - ListType: "list", - LMetaType: "lmeta", - ZSetType: "zset", - ZSizeType: "zsize", - ZScoreType: "zscore", - BitType: "bit", - BitMetaType: "bitmeta", - SetType: "set", - SSizeType: "ssize", - ExpTimeType: "exptime", - ExpMetaType: "expmeta", - } -) - -const ( - defaultScanCount int = 10 -) - -var ( - errKeySize = errors.New("invalid key size") - errValueSize = errors.New("invalid value size") - errHashFieldSize = errors.New("invalid hash field size") - errSetMemberSize = errors.New("invalid set member size") - errZSetMemberSize = errors.New("invalid zset member size") - errExpireValue = errors.New("invalid expire value") -) - -const ( - //we don't support too many databases - MaxDBNumber uint8 = 16 - - //max key size - MaxKeySize int = 1024 - - //max hash field size - MaxHashFieldSize int = 1024 - - //max zset member size - MaxZSetMemberSize int = 1024 - - //max set member size - MaxSetMemberSize int = 1024 - - //max value size - MaxValueSize int = 1024 * 1024 * 1024 -) - -var ( - ErrScoreMiss = errors.New("zset score miss") - ErrWriteInROnly = errors.New("write not support in readonly mode") - ErrRplInRDWR = errors.New("replication not support in read write mode") - ErrRplNotSupport = errors.New("replication not support") -) - -const ( - DBAutoCommit uint8 = 0x0 - DBInTransaction uint8 = 0x1 - DBInMulti uint8 = 0x2 -) diff --git a/vendor/github.com/siddontang/ledisdb/ledis/doc.go b/vendor/github.com/siddontang/ledisdb/ledis/doc.go deleted file mode 100644 index c6bfe7807bef..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/doc.go +++ /dev/null @@ -1,58 +0,0 @@ -// Package ledis is a high performance embedded NoSQL. -// -// Ledis supports various data structure like kv, list, hash and zset like redis. -// -// Other features include replication, data with a limited time-to-live. -// -// Usage -// -// First create a ledis instance before use: -// -// l := ledis.Open(cfg) -// -// cfg is a Config instance which contains configuration for ledis use, -// like DataDir (root directory for ledis working to store data). -// -// After you create a ledis instance, you can select a DB to store you data: -// -// db, _ := l.Select(0) -// -// DB must be selected by a index, ledis supports only 16 databases, so the index range is [0-15]. -// -// KV -// -// KV is the most basic ledis type like any other key-value database. -// -// err := db.Set(key, value) -// value, err := db.Get(key) -// -// List -// -// List is simply lists of values, sorted by insertion order. -// You can push or pop value on the list head (left) or tail (right). -// -// err := db.LPush(key, value1) -// err := db.RPush(key, value2) -// value1, err := db.LPop(key) -// value2, err := db.RPop(key) -// -// Hash -// -// Hash is a map between fields and values. -// -// n, err := db.HSet(key, field1, value1) -// n, err := db.HSet(key, field2, value2) -// value1, err := db.HGet(key, field1) -// value2, err := db.HGet(key, field2) -// -// ZSet -// -// ZSet is a sorted collections of values. -// Every member of zset is associated with score, a int64 value which used to sort, from smallest to greatest score. -// Members are unique, but score may be same. 
-// -// n, err := db.ZAdd(key, ScorePair{score1, member1}, ScorePair{score2, member2}) -// ay, err := db.ZRangeByScore(key, minScore, maxScore, 0, -1) -// -// -package ledis diff --git a/vendor/github.com/siddontang/ledisdb/ledis/dump.go b/vendor/github.com/siddontang/ledisdb/ledis/dump.go deleted file mode 100644 index 863255c1220d..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/dump.go +++ /dev/null @@ -1,220 +0,0 @@ -package ledis - -import ( - "bufio" - "bytes" - "encoding/binary" - "github.com/siddontang/go/snappy" - "github.com/siddontang/ledisdb/store" - "io" - "os" -) - -type DumpHead struct { - CommitID uint64 -} - -func (h *DumpHead) Read(r io.Reader) error { - if err := binary.Read(r, binary.BigEndian, &h.CommitID); err != nil { - return err - } - - return nil -} - -func (h *DumpHead) Write(w io.Writer) error { - if err := binary.Write(w, binary.BigEndian, h.CommitID); err != nil { - return err - } - - return nil -} - -func (l *Ledis) DumpFile(path string) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - return l.Dump(f) -} - -func (l *Ledis) Dump(w io.Writer) error { - var err error - - var commitID uint64 - var snap *store.Snapshot - - l.wLock.Lock() - - if l.r != nil { - if commitID, err = l.r.LastCommitID(); err != nil { - l.wLock.Unlock() - return err - } - } - - if snap, err = l.ldb.NewSnapshot(); err != nil { - l.wLock.Unlock() - return err - } - - l.wLock.Unlock() - - wb := bufio.NewWriterSize(w, 4096) - - h := &DumpHead{commitID} - - if err = h.Write(wb); err != nil { - return err - } - - it := snap.NewIterator() - it.SeekToFirst() - - compressBuf := make([]byte, 4096) - - var key []byte - var value []byte - for ; it.Valid(); it.Next() { - key = it.RawKey() - value = it.RawValue() - - if key, err = snappy.Encode(compressBuf, key); err != nil { - return err - } - - if err = binary.Write(wb, binary.BigEndian, uint16(len(key))); err != nil { - return err - } - - if _, err = wb.Write(key); err != nil { - return err - } - - if value, err = snappy.Encode(compressBuf, value); err != nil { - return err - } - - if err = binary.Write(wb, binary.BigEndian, uint32(len(value))); err != nil { - return err - } - - if _, err = wb.Write(value); err != nil { - return err - } - } - - if err = wb.Flush(); err != nil { - return err - } - - compressBuf = nil - - return nil -} - -// clear all data and load dump file to db -func (l *Ledis) LoadDumpFile(path string) (*DumpHead, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - defer f.Close() - - return l.LoadDump(f) -} - -// clear all data and load dump file to db -func (l *Ledis) LoadDump(r io.Reader) (*DumpHead, error) { - l.wLock.Lock() - defer l.wLock.Unlock() - - var err error - if err = l.flushAll(); err != nil { - return nil, err - } - - rb := bufio.NewReaderSize(r, 4096) - - h := new(DumpHead) - - if err = h.Read(rb); err != nil { - return nil, err - } - - var keyLen uint16 - var valueLen uint32 - - var keyBuf bytes.Buffer - var valueBuf bytes.Buffer - - deKeyBuf := make([]byte, 4096) - deValueBuf := make([]byte, 4096) - - var key, value []byte - - wb := l.ldb.NewWriteBatch() - defer wb.Close() - - n := 0 - - for { - if err = binary.Read(rb, binary.BigEndian, &keyLen); err != nil && err != io.EOF { - return nil, err - } else if err == io.EOF { - break - } - - if _, err = io.CopyN(&keyBuf, rb, int64(keyLen)); err != nil { - return nil, err - } - - if key, err = snappy.Decode(deKeyBuf, keyBuf.Bytes()); err != nil { - return nil, err - } - - if err = 
binary.Read(rb, binary.BigEndian, &valueLen); err != nil { - return nil, err - } - - if _, err = io.CopyN(&valueBuf, rb, int64(valueLen)); err != nil { - return nil, err - } - - if value, err = snappy.Decode(deValueBuf, valueBuf.Bytes()); err != nil { - return nil, err - } - - wb.Put(key, value) - n++ - if n%1024 == 0 { - if err = wb.Commit(); err != nil { - return nil, err - } - } - - // if err = l.ldb.Put(key, value); err != nil { - // return nil, err - // } - - keyBuf.Reset() - valueBuf.Reset() - } - - if err = wb.Commit(); err != nil { - return nil, err - } - - deKeyBuf = nil - deValueBuf = nil - - if l.r != nil { - if err := l.r.UpdateCommitID(h.CommitID); err != nil { - return nil, err - } - } - - return h, nil -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/event.go b/vendor/github.com/siddontang/ledisdb/ledis/event.go deleted file mode 100644 index 2a3b54a07e2a..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/event.go +++ /dev/null @@ -1,135 +0,0 @@ -package ledis - -import ( - "errors" - "fmt" - "github.com/siddontang/go/hack" - "strconv" -) - -var errInvalidEvent = errors.New("invalid event") - -func formatEventKey(buf []byte, k []byte) ([]byte, error) { - if len(k) < 2 { - return nil, errInvalidEvent - } - - buf = append(buf, fmt.Sprintf("DB:%2d ", k[0])...) - buf = append(buf, fmt.Sprintf("%s ", TypeName[k[1]])...) - - db := new(DB) - db.index = k[0] - - //to do format at respective place - - switch k[1] { - case KVType: - if key, err := db.decodeKVKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - } - case HashType: - if key, field, err := db.hDecodeHashKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - buf = append(buf, ' ') - buf = strconv.AppendQuote(buf, hack.String(field)) - } - case HSizeType: - if key, err := db.hDecodeSizeKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - } - case ListType: - if key, seq, err := db.lDecodeListKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - buf = append(buf, ' ') - buf = strconv.AppendInt(buf, int64(seq), 10) - } - case LMetaType: - if key, err := db.lDecodeMetaKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - } - case ZSetType: - if key, m, err := db.zDecodeSetKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - buf = append(buf, ' ') - buf = strconv.AppendQuote(buf, hack.String(m)) - } - case ZSizeType: - if key, err := db.zDecodeSizeKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - } - case ZScoreType: - if key, m, score, err := db.zDecodeScoreKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - buf = append(buf, ' ') - buf = strconv.AppendQuote(buf, hack.String(m)) - buf = append(buf, ' ') - buf = strconv.AppendInt(buf, score, 10) - } - case BitType: - if key, seq, err := db.bDecodeBinKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - buf = append(buf, ' ') - buf = strconv.AppendUint(buf, uint64(seq), 10) - } - case BitMetaType: - if key, err := db.bDecodeMetaKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - } - case SetType: - if key, member, err := db.sDecodeSetKey(k); err != nil { - return 
nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - buf = append(buf, ' ') - buf = strconv.AppendQuote(buf, hack.String(member)) - } - case SSizeType: - if key, err := db.sDecodeSizeKey(k); err != nil { - return nil, err - } else { - buf = strconv.AppendQuote(buf, hack.String(key)) - } - case ExpTimeType: - if tp, key, t, err := db.expDecodeTimeKey(k); err != nil { - return nil, err - } else { - buf = append(buf, TypeName[tp]...) - buf = append(buf, ' ') - buf = strconv.AppendQuote(buf, hack.String(key)) - buf = append(buf, ' ') - buf = strconv.AppendInt(buf, t, 10) - } - case ExpMetaType: - if tp, key, err := db.expDecodeMetaKey(k); err != nil { - return nil, err - } else { - buf = append(buf, TypeName[tp]...) - buf = append(buf, ' ') - buf = strconv.AppendQuote(buf, hack.String(key)) - } - default: - return nil, errInvalidEvent - } - - return buf, nil -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/info.go b/vendor/github.com/siddontang/ledisdb/ledis/info.go deleted file mode 100644 index df3ee72ce280..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/info.go +++ /dev/null @@ -1,26 +0,0 @@ -package ledis - -import () - -// todo, add info - -// type Keyspace struct { -// Kvs int `json:"kvs"` -// KvExpires int `json:"kv_expires"` - -// Lists int `json:"lists"` -// ListExpires int `json:"list_expires"` - -// Bitmaps int `json:"bitmaps"` -// BitmapExpires int `json:"bitmap_expires"` - -// ZSets int `json:"zsets"` -// ZSetExpires int `json:"zset_expires"` - -// Hashes int `json:"hashes"` -// HashExpires int `json:"hahsh_expires"` -// } - -// type Info struct { -// KeySpaces [MaxDBNumber]Keyspace -// } diff --git a/vendor/github.com/siddontang/ledisdb/ledis/ledis.go b/vendor/github.com/siddontang/ledisdb/ledis/ledis.go deleted file mode 100644 index 6e390c5404ea..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/ledis.go +++ /dev/null @@ -1,215 +0,0 @@ -package ledis - -import ( - "fmt" - "github.com/siddontang/go/filelock" - "github.com/siddontang/go/log" - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/rpl" - "github.com/siddontang/ledisdb/store" - "io" - "os" - "path" - "sync" - "time" -) - -type Ledis struct { - cfg *config.Config - - ldb *store.DB - dbs [MaxDBNumber]*DB - - quit chan struct{} - wg sync.WaitGroup - - //for replication - r *rpl.Replication - rc chan struct{} - rbatch *store.WriteBatch - rwg sync.WaitGroup - rhs []NewLogEventHandler - - wLock sync.RWMutex //allow one write at same time - commitLock sync.Mutex //allow one write commit at same time - - lock io.Closer - - tcs [MaxDBNumber]*ttlChecker -} - -func Open(cfg *config.Config) (*Ledis, error) { - if len(cfg.DataDir) == 0 { - cfg.DataDir = config.DefaultDataDir - } - - os.MkdirAll(cfg.DataDir, 0755) - - var err error - - l := new(Ledis) - l.cfg = cfg - - if l.lock, err = filelock.Lock(path.Join(cfg.DataDir, "LOCK")); err != nil { - return nil, err - } - - l.quit = make(chan struct{}) - - if l.ldb, err = store.Open(cfg); err != nil { - return nil, err - } - - if cfg.UseReplication { - if l.r, err = rpl.NewReplication(cfg); err != nil { - return nil, err - } - - l.rc = make(chan struct{}, 1) - l.rbatch = l.ldb.NewWriteBatch() - - l.wg.Add(1) - go l.onReplication() - - //first we must try wait all replication ok - //maybe some logs are not committed - l.WaitReplication() - } else { - l.r = nil - } - - for i := uint8(0); i < MaxDBNumber; i++ { - l.dbs[i] = l.newDB(i) - } - - l.checkTTL() - - return l, nil -} - -func (l *Ledis) Close() { - 
close(l.quit) - l.wg.Wait() - - l.ldb.Close() - - if l.r != nil { - l.r.Close() - //l.r = nil - } - - if l.lock != nil { - l.lock.Close() - //l.lock = nil - } -} - -func (l *Ledis) Select(index int) (*DB, error) { - if index < 0 || index >= int(MaxDBNumber) { - return nil, fmt.Errorf("invalid db index %d", index) - } - - return l.dbs[index], nil -} - -// Flush All will clear all data and replication logs -func (l *Ledis) FlushAll() error { - l.wLock.Lock() - defer l.wLock.Unlock() - - return l.flushAll() -} - -func (l *Ledis) flushAll() error { - it := l.ldb.NewIterator() - defer it.Close() - - w := l.ldb.NewWriteBatch() - defer w.Rollback() - - n := 0 - for ; it.Valid(); it.Next() { - n++ - if n == 10000 { - if err := w.Commit(); err != nil { - log.Fatal("flush all commit error: %s", err.Error()) - return err - } - n = 0 - } - w.Delete(it.RawKey()) - } - - if err := w.Commit(); err != nil { - log.Fatal("flush all commit error: %s", err.Error()) - return err - } - - if l.r != nil { - if err := l.r.Clear(); err != nil { - log.Fatal("flush all replication clear error: %s", err.Error()) - return err - } - } - - return nil -} - -func (l *Ledis) IsReadOnly() bool { - if l.cfg.GetReadonly() { - return true - } else if l.r != nil { - if b, _ := l.r.CommitIDBehind(); b { - return true - } - } - return false -} - -func (l *Ledis) checkTTL() { - for i, db := range l.dbs { - c := newTTLChecker(db) - - c.register(KVType, db.kvBatch, db.delete) - c.register(ListType, db.listBatch, db.lDelete) - c.register(HashType, db.hashBatch, db.hDelete) - c.register(ZSetType, db.zsetBatch, db.zDelete) - c.register(BitType, db.binBatch, db.bDelete) - c.register(SetType, db.setBatch, db.sDelete) - - l.tcs[i] = c - } - - if l.cfg.TTLCheckInterval == 0 { - l.cfg.TTLCheckInterval = 1 - } - - l.wg.Add(1) - go func() { - defer l.wg.Done() - - tick := time.NewTicker(time.Duration(l.cfg.TTLCheckInterval) * time.Second) - defer tick.Stop() - - for { - select { - case <-tick.C: - if l.IsReadOnly() { - break - } - - for _, c := range l.tcs { - c.check() - } - case <-l.quit: - return - } - } - - }() - -} - -func (l *Ledis) StoreStat() *store.Stat { - return l.ldb.Stat() -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/ledis_db.go b/vendor/github.com/siddontang/ledisdb/ledis/ledis_db.go deleted file mode 100644 index ebde98e87bd9..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/ledis_db.go +++ /dev/null @@ -1,147 +0,0 @@ -package ledis - -import ( - "fmt" - "github.com/siddontang/ledisdb/store" - "sync" -) - -type ibucket interface { - Get(key []byte) ([]byte, error) - GetSlice(key []byte) (store.Slice, error) - - Put(key []byte, value []byte) error - Delete(key []byte) error - - NewIterator() *store.Iterator - - NewWriteBatch() *store.WriteBatch - - RangeIterator(min []byte, max []byte, rangeType uint8) *store.RangeLimitIterator - RevRangeIterator(min []byte, max []byte, rangeType uint8) *store.RangeLimitIterator - RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *store.RangeLimitIterator - RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *store.RangeLimitIterator -} - -type DB struct { - l *Ledis - - sdb *store.DB - - bucket ibucket - - index uint8 - - kvBatch *batch - listBatch *batch - hashBatch *batch - zsetBatch *batch - binBatch *batch - setBatch *batch - - status uint8 - - lbkeys *lBlockKeys -} - -func (l *Ledis) newDB(index uint8) *DB { - d := new(DB) - - d.l = l - - d.sdb = l.ldb - - d.bucket = d.sdb - - d.status = 
DBAutoCommit - d.index = index - - d.kvBatch = d.newBatch() - d.listBatch = d.newBatch() - d.hashBatch = d.newBatch() - d.zsetBatch = d.newBatch() - d.binBatch = d.newBatch() - d.setBatch = d.newBatch() - - d.lbkeys = newLBlockKeys() - - return d -} - -func (db *DB) newBatch() *batch { - return db.l.newBatch(db.bucket.NewWriteBatch(), &dbBatchLocker{l: &sync.Mutex{}, wrLock: &db.l.wLock}, nil) -} - -func (db *DB) Index() int { - return int(db.index) -} - -func (db *DB) IsAutoCommit() bool { - return db.status == DBAutoCommit -} - -func (db *DB) FlushAll() (drop int64, err error) { - all := [...](func() (int64, error)){ - db.flush, - db.lFlush, - db.hFlush, - db.zFlush, - db.bFlush, - db.sFlush} - - for _, flush := range all { - if n, e := flush(); e != nil { - err = e - return - } else { - drop += n - } - } - - return -} - -func (db *DB) flushType(t *batch, dataType byte) (drop int64, err error) { - var deleteFunc func(t *batch, key []byte) int64 - var metaDataType byte - switch dataType { - case KVType: - deleteFunc = db.delete - metaDataType = KVType - case ListType: - deleteFunc = db.lDelete - metaDataType = LMetaType - case HashType: - deleteFunc = db.hDelete - metaDataType = HSizeType - case ZSetType: - deleteFunc = db.zDelete - metaDataType = ZSizeType - case BitType: - deleteFunc = db.bDelete - metaDataType = BitMetaType - case SetType: - deleteFunc = db.sDelete - metaDataType = SSizeType - default: - return 0, fmt.Errorf("invalid data type: %s", TypeName[dataType]) - } - - var keys [][]byte - keys, err = db.scan(metaDataType, nil, 1024, false, "") - for len(keys) != 0 || err != nil { - for _, key := range keys { - deleteFunc(t, key) - db.rmExpire(t, dataType, key) - - } - - if err = t.Commit(); err != nil { - return - } else { - drop += int64(len(keys)) - } - keys, err = db.scan(metaDataType, nil, 1024, false, "") - } - return -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/multi.go b/vendor/github.com/siddontang/ledisdb/ledis/multi.go deleted file mode 100644 index 29abe34aa3c6..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/multi.go +++ /dev/null @@ -1,75 +0,0 @@ -package ledis - -import ( - "errors" - "fmt" -) - -var ( - ErrNestMulti = errors.New("nest multi not supported") - ErrMultiDone = errors.New("multi has been closed") -) - -type Multi struct { - *DB -} - -func (db *DB) IsInMulti() bool { - return db.status == DBInMulti -} - -// begin a mutli to execute commands, -// it will block any other write operations before you close the multi, unlike transaction, mutli can not rollback -func (db *DB) Multi() (*Multi, error) { - if db.IsInMulti() { - return nil, ErrNestMulti - } - - m := new(Multi) - - m.DB = new(DB) - m.DB.status = DBInMulti - - m.DB.l = db.l - - m.l.wLock.Lock() - - m.DB.sdb = db.sdb - - m.DB.bucket = db.sdb - - m.DB.index = db.index - - m.DB.kvBatch = m.newBatch() - m.DB.listBatch = m.newBatch() - m.DB.hashBatch = m.newBatch() - m.DB.zsetBatch = m.newBatch() - m.DB.binBatch = m.newBatch() - m.DB.setBatch = m.newBatch() - - m.DB.lbkeys = db.lbkeys - - return m, nil -} - -func (m *Multi) newBatch() *batch { - return m.l.newBatch(m.bucket.NewWriteBatch(), &multiBatchLocker{}, nil) -} - -func (m *Multi) Close() error { - if m.bucket == nil { - return ErrMultiDone - } - m.l.wLock.Unlock() - m.bucket = nil - return nil -} - -func (m *Multi) Select(index int) error { - if index < 0 || index >= int(MaxDBNumber) { - return fmt.Errorf("invalid db index %d", index) - } - - m.DB.index = uint8(index) - return nil -} diff --git 
a/vendor/github.com/siddontang/ledisdb/ledis/replication.go b/vendor/github.com/siddontang/ledisdb/ledis/replication.go deleted file mode 100644 index 4f02259898fd..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/replication.go +++ /dev/null @@ -1,246 +0,0 @@ -package ledis - -import ( - "bytes" - "errors" - "github.com/siddontang/go/log" - "github.com/siddontang/go/snappy" - "github.com/siddontang/ledisdb/rpl" - "github.com/siddontang/ledisdb/store" - "io" - "time" -) - -const ( - maxReplLogSize = 1 * 1024 * 1024 -) - -var ( - ErrLogMissed = errors.New("log is pured in server") -) - -func (l *Ledis) ReplicationUsed() bool { - return l.r != nil -} - -func (l *Ledis) handleReplication() error { - l.wLock.Lock() - defer l.wLock.Unlock() - - l.rwg.Add(1) - rl := &rpl.Log{} - var err error - for { - if err = l.r.NextNeedCommitLog(rl); err != nil { - if err != rpl.ErrNoBehindLog { - log.Error("get next commit log err, %s", err.Error) - return err - } else { - l.rwg.Done() - return nil - } - } else { - l.rbatch.Rollback() - - if rl.Compression == 1 { - //todo optimize - if rl.Data, err = snappy.Decode(nil, rl.Data); err != nil { - log.Error("decode log error %s", err.Error()) - return err - } - } - - if bd, err := store.NewBatchData(rl.Data); err != nil { - log.Error("decode batch log error %s", err.Error()) - return err - } else if err = bd.Replay(l.rbatch); err != nil { - log.Error("replay batch log error %s", err.Error()) - } - - l.commitLock.Lock() - if err = l.rbatch.Commit(); err != nil { - log.Error("commit log error %s", err.Error()) - } else if err = l.r.UpdateCommitID(rl.ID); err != nil { - log.Error("update commit id error %s", err.Error()) - } - - l.commitLock.Unlock() - if err != nil { - return err - } - } - - } -} - -func (l *Ledis) onReplication() { - defer l.wg.Done() - - l.noticeReplication() - - for { - select { - case <-l.rc: - l.handleReplication() - case <-l.quit: - return - } - } -} - -func (l *Ledis) WaitReplication() error { - if !l.ReplicationUsed() { - return ErrRplNotSupport - - } - - l.noticeReplication() - l.rwg.Wait() - - for i := 0; i < 100; i++ { - b, err := l.r.CommitIDBehind() - if err != nil { - return err - } else if b { - l.noticeReplication() - l.rwg.Wait() - time.Sleep(100 * time.Millisecond) - } else { - return nil - } - } - - return errors.New("wait replication too many times") -} - -func (l *Ledis) StoreLogsFromReader(rb io.Reader) error { - if !l.ReplicationUsed() { - return ErrRplNotSupport - } else if !l.cfg.Readonly { - return ErrRplInRDWR - } - - log := &rpl.Log{} - - for { - if err := log.Decode(rb); err != nil { - if err == io.EOF { - break - } else { - return err - } - } - - if err := l.r.StoreLog(log); err != nil { - return err - } - - } - - l.noticeReplication() - - return nil -} - -func (l *Ledis) noticeReplication() { - AsyncNotify(l.rc) -} - -func (l *Ledis) StoreLogsFromData(data []byte) error { - rb := bytes.NewReader(data) - - return l.StoreLogsFromReader(rb) -} - -func (l *Ledis) ReadLogsTo(startLogID uint64, w io.Writer) (n int, nextLogID uint64, err error) { - if !l.ReplicationUsed() { - // no replication log - nextLogID = 0 - err = ErrRplNotSupport - return - } - - var firtID, lastID uint64 - - firtID, err = l.r.FirstLogID() - if err != nil { - return - } - - if startLogID < firtID { - err = ErrLogMissed - return - } - - lastID, err = l.r.LastLogID() - if err != nil { - return - } - - nextLogID = startLogID - - log := &rpl.Log{} - for i := startLogID; i <= lastID; i++ { - if err = l.r.GetLog(i, log); err != nil { - 
return - } - - if err = log.Encode(w); err != nil { - return - } - - nextLogID = i + 1 - - n += log.Size() - - if n > maxReplLogSize { - break - } - } - - return -} - -// try to read events, if no events read, try to wait the new event singal until timeout seconds -func (l *Ledis) ReadLogsToTimeout(startLogID uint64, w io.Writer, timeout int, quitCh chan struct{}) (n int, nextLogID uint64, err error) { - n, nextLogID, err = l.ReadLogsTo(startLogID, w) - if err != nil { - return - } else if n != 0 { - return - } - //no events read - select { - case <-l.r.WaitLog(): - case <-time.After(time.Duration(timeout) * time.Second): - case <-quitCh: - return - } - return l.ReadLogsTo(startLogID, w) -} - -func (l *Ledis) propagate(rl *rpl.Log) { - for _, h := range l.rhs { - h(rl) - } -} - -type NewLogEventHandler func(rl *rpl.Log) - -func (l *Ledis) AddNewLogEventHandler(h NewLogEventHandler) error { - if !l.ReplicationUsed() { - return ErrRplNotSupport - } - - l.rhs = append(l.rhs, h) - - return nil -} - -func (l *Ledis) ReplicationStat() (*rpl.Stat, error) { - if !l.ReplicationUsed() { - return nil, ErrRplNotSupport - } - - return l.r.Stat() -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/scan.go b/vendor/github.com/siddontang/ledisdb/ledis/scan.go deleted file mode 100644 index 9e8e235a16a5..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/scan.go +++ /dev/null @@ -1,136 +0,0 @@ -package ledis - -import ( - "errors" - "github.com/siddontang/ledisdb/store" - "regexp" -) - -var errDataType = errors.New("error data type") -var errMetaKey = errors.New("error meta key") - -func (db *DB) scan(dataType byte, key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.scanGeneric(dataType, key, count, inclusive, match, false) -} - -func (db *DB) revscan(dataType byte, key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.scanGeneric(dataType, key, count, inclusive, match, true) -} - -func (db *DB) scanGeneric(dataType byte, key []byte, count int, - inclusive bool, match string, reverse bool) ([][]byte, error) { - var minKey, maxKey []byte - var err error - var r *regexp.Regexp - - if len(match) > 0 { - if r, err = regexp.Compile(match); err != nil { - return nil, err - } - } - - tp := store.RangeOpen - - if !reverse { - if minKey, err = db.encodeScanMinKey(dataType, key); err != nil { - return nil, err - } - if maxKey, err = db.encodeScanMaxKey(dataType, nil); err != nil { - return nil, err - } - - if inclusive { - tp = store.RangeROpen - } - } else { - if minKey, err = db.encodeScanMinKey(dataType, nil); err != nil { - return nil, err - } - if maxKey, err = db.encodeScanMaxKey(dataType, key); err != nil { - return nil, err - } - - if inclusive { - tp = store.RangeLOpen - } - } - - if count <= 0 { - count = defaultScanCount - } - - var it *store.RangeLimitIterator - if !reverse { - it = db.bucket.RangeIterator(minKey, maxKey, tp) - } else { - it = db.bucket.RevRangeIterator(minKey, maxKey, tp) - } - - v := make([][]byte, 0, count) - - for i := 0; it.Valid() && i < count; it.Next() { - if k, err := db.decodeScanKey(dataType, it.Key()); err != nil { - continue - } else if r != nil && !r.Match(k) { - continue - } else { - v = append(v, k) - i++ - } - } - it.Close() - return v, nil -} - -func (db *DB) encodeScanMinKey(dataType byte, key []byte) ([]byte, error) { - if len(key) == 0 { - return db.encodeScanKey(dataType, nil) - } else { - if err := checkKeySize(key); err != nil { - return nil, err - } - return 
db.encodeScanKey(dataType, key) - } -} - -func (db *DB) encodeScanMaxKey(dataType byte, key []byte) ([]byte, error) { - if len(key) > 0 { - if err := checkKeySize(key); err != nil { - return nil, err - } - - return db.encodeScanKey(dataType, key) - } - - k, err := db.encodeScanKey(dataType, nil) - if err != nil { - return nil, err - } - k[len(k)-1] = dataType + 1 - return k, nil -} - -func (db *DB) encodeScanKey(dataType byte, key []byte) ([]byte, error) { - switch dataType { - case KVType: - return db.encodeKVKey(key), nil - case LMetaType: - return db.lEncodeMetaKey(key), nil - case HSizeType: - return db.hEncodeSizeKey(key), nil - case ZSizeType: - return db.zEncodeSizeKey(key), nil - case BitMetaType: - return db.bEncodeMetaKey(key), nil - case SSizeType: - return db.sEncodeSizeKey(key), nil - default: - return nil, errDataType - } -} -func (db *DB) decodeScanKey(dataType byte, ek []byte) ([]byte, error) { - if len(ek) < 2 || ek[0] != db.index || ek[1] != dataType { - return nil, errMetaKey - } - return ek[2:], nil -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_bit.go b/vendor/github.com/siddontang/ledisdb/ledis/t_bit.go deleted file mode 100644 index 771ef9016e71..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/t_bit.go +++ /dev/null @@ -1,926 +0,0 @@ -package ledis - -import ( - "encoding/binary" - "errors" - "github.com/siddontang/go/num" - "github.com/siddontang/ledisdb/store" - "sort" - "time" -) - -const ( - OPand uint8 = iota + 1 - OPor - OPxor - OPnot -) - -type BitPair struct { - Pos int32 - Val uint8 -} - -type segBitInfo struct { - Seq uint32 - Off uint32 - Val uint8 -} - -type segBitInfoArray []segBitInfo - -const ( - // byte - segByteWidth uint32 = 9 - segByteSize uint32 = 1 << segByteWidth - - // bit - segBitWidth uint32 = segByteWidth + 3 - segBitSize uint32 = segByteSize << 3 - - maxByteSize uint32 = 8 << 20 - maxSegCount uint32 = maxByteSize / segByteSize - - minSeq uint32 = 0 - maxSeq uint32 = uint32((maxByteSize << 3) - 1) -) - -var bitsInByte = [256]int32{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, - 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, - 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, - 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, - 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, - 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, - 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, - 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, - 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, - 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, - 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8} - -var fillBits = [...]uint8{1, 3, 7, 15, 31, 63, 127, 255} - -var emptySegment []byte = make([]byte, segByteSize, segByteSize) - -var fillSegment []byte = func() []byte { - data := make([]byte, segByteSize, segByteSize) - for i := uint32(0); i < segByteSize; i++ { - data[i] = 0xff - } - return data -}() - -var errBinKey = errors.New("invalid bin key") -var errOffset = errors.New("invalid offset") -var errDuplicatePos = errors.New("duplicate bit pos") - -func getBit(sz []byte, offset uint32) uint8 { - index := offset >> 3 - if index >= uint32(len(sz)) { - return 0 // error("overflow") - } - - offset -= index << 3 - return sz[index] >> offset & 1 -} - -func setBit(sz []byte, offset uint32, val uint8) bool { - if 
val != 1 && val != 0 { - return false // error("invalid val") - } - - index := offset >> 3 - if index >= uint32(len(sz)) { - return false // error("overflow") - } - - offset -= index << 3 - if sz[index]>>offset&1 != val { - sz[index] ^= (1 << offset) - } - return true -} - -func (datas segBitInfoArray) Len() int { - return len(datas) -} - -func (datas segBitInfoArray) Less(i, j int) bool { - res := (datas)[i].Seq < (datas)[j].Seq - if !res && (datas)[i].Seq == (datas)[j].Seq { - res = (datas)[i].Off < (datas)[j].Off - } - return res -} - -func (datas segBitInfoArray) Swap(i, j int) { - datas[i], datas[j] = datas[j], datas[i] -} - -func (db *DB) bEncodeMetaKey(key []byte) []byte { - mk := make([]byte, len(key)+2) - mk[0] = db.index - mk[1] = BitMetaType - - copy(mk[2:], key) - return mk -} - -func (db *DB) bDecodeMetaKey(bkey []byte) ([]byte, error) { - if len(bkey) < 2 || bkey[0] != db.index || bkey[1] != BitMetaType { - return nil, errBinKey - } - - return bkey[2:], nil -} - -func (db *DB) bEncodeBinKey(key []byte, seq uint32) []byte { - bk := make([]byte, len(key)+8) - - pos := 0 - bk[pos] = db.index - pos++ - bk[pos] = BitType - pos++ - - binary.BigEndian.PutUint16(bk[pos:], uint16(len(key))) - pos += 2 - - copy(bk[pos:], key) - pos += len(key) - - binary.BigEndian.PutUint32(bk[pos:], seq) - - return bk -} - -func (db *DB) bDecodeBinKey(bkey []byte) (key []byte, seq uint32, err error) { - if len(bkey) < 8 || bkey[0] != db.index { - err = errBinKey - return - } - - keyLen := binary.BigEndian.Uint16(bkey[2:4]) - if int(keyLen+8) != len(bkey) { - err = errBinKey - return - } - - key = bkey[4 : 4+keyLen] - seq = uint32(binary.BigEndian.Uint32(bkey[4+keyLen:])) - return -} - -func (db *DB) bCapByteSize(seq uint32, off uint32) uint32 { - var offByteSize uint32 = (off >> 3) + 1 - if offByteSize > segByteSize { - offByteSize = segByteSize - } - - return seq<= 0 { - offset += int32((uint32(tailSeq)<> segBitWidth - off &= (segBitSize - 1) - return -} - -func (db *DB) bGetMeta(key []byte) (tailSeq int32, tailOff int32, err error) { - var v []byte - - mk := db.bEncodeMetaKey(key) - v, err = db.bucket.Get(mk) - if err != nil { - return - } - - if v != nil { - tailSeq = int32(binary.LittleEndian.Uint32(v[0:4])) - tailOff = int32(binary.LittleEndian.Uint32(v[4:8])) - } else { - tailSeq = -1 - tailOff = -1 - } - return -} - -func (db *DB) bSetMeta(t *batch, key []byte, tailSeq uint32, tailOff uint32) { - ek := db.bEncodeMetaKey(key) - - buf := make([]byte, 8) - binary.LittleEndian.PutUint32(buf[0:4], tailSeq) - binary.LittleEndian.PutUint32(buf[4:8], tailOff) - - t.Put(ek, buf) - return -} - -func (db *DB) bUpdateMeta(t *batch, key []byte, seq uint32, off uint32) (tailSeq uint32, tailOff uint32, err error) { - var tseq, toff int32 - var update bool = false - - if tseq, toff, err = db.bGetMeta(key); err != nil { - return - } else if tseq < 0 { - update = true - } else { - tailSeq = uint32(num.MaxInt32(tseq, 0)) - tailOff = uint32(num.MaxInt32(toff, 0)) - update = (seq > tailSeq || (seq == tailSeq && off > tailOff)) - } - - if update { - db.bSetMeta(t, key, seq, off) - tailSeq = seq - tailOff = off - } - return -} - -func (db *DB) bDelete(t *batch, key []byte) (drop int64) { - mk := db.bEncodeMetaKey(key) - t.Delete(mk) - - minKey := db.bEncodeBinKey(key, minSeq) - maxKey := db.bEncodeBinKey(key, maxSeq) - it := db.bucket.RangeIterator(minKey, maxKey, store.RangeClose) - for ; it.Valid(); it.Next() { - t.Delete(it.RawKey()) - drop++ - } - it.Close() - - return drop -} - -func (db *DB) bGetSegment(key 
[]byte, seq uint32) ([]byte, []byte, error) { - bk := db.bEncodeBinKey(key, seq) - segment, err := db.bucket.Get(bk) - if err != nil { - return bk, nil, err - } - return bk, segment, nil -} - -func (db *DB) bAllocateSegment(key []byte, seq uint32) ([]byte, []byte, error) { - bk, segment, err := db.bGetSegment(key, seq) - if err == nil && segment == nil { - segment = make([]byte, segByteSize, segByteSize) - } - return bk, segment, err -} - -func (db *DB) bIterator(key []byte) *store.RangeLimitIterator { - sk := db.bEncodeBinKey(key, minSeq) - ek := db.bEncodeBinKey(key, maxSeq) - return db.bucket.RangeIterator(sk, ek, store.RangeClose) -} - -func (db *DB) bSegAnd(a []byte, b []byte, res *[]byte) { - if a == nil || b == nil { - *res = nil - return - } - - data := *res - if data == nil { - data = make([]byte, segByteSize, segByteSize) - *res = data - } - - for i := uint32(0); i < segByteSize; i++ { - data[i] = a[i] & b[i] - } - return -} - -func (db *DB) bSegOr(a []byte, b []byte, res *[]byte) { - if a == nil || b == nil { - if a == nil && b == nil { - *res = nil - } else if a == nil { - *res = b - } else { - *res = a - } - return - } - - data := *res - if data == nil { - data = make([]byte, segByteSize, segByteSize) - *res = data - } - - for i := uint32(0); i < segByteSize; i++ { - data[i] = a[i] | b[i] - } - return -} - -func (db *DB) bSegXor(a []byte, b []byte, res *[]byte) { - if a == nil && b == nil { - *res = fillSegment - return - } - - if a == nil { - a = emptySegment - } - - if b == nil { - b = emptySegment - } - - data := *res - if data == nil { - data = make([]byte, segByteSize, segByteSize) - *res = data - } - - for i := uint32(0); i < segByteSize; i++ { - data[i] = a[i] ^ b[i] - } - - return -} - -func (db *DB) bExpireAt(key []byte, when int64) (int64, error) { - t := db.binBatch - t.Lock() - defer t.Unlock() - - if seq, _, err := db.bGetMeta(key); err != nil || seq < 0 { - return 0, err - } else { - db.expireAt(t, BitType, key, when) - if err := t.Commit(); err != nil { - return 0, err - } - } - return 1, nil -} - -func (db *DB) bCountByte(val byte, soff uint32, eoff uint32) int32 { - if soff > eoff { - soff, eoff = eoff, soff - } - - mask := uint8(0) - if soff > 0 { - mask |= fillBits[soff-1] - } - if eoff < 7 { - mask |= (fillBits[7] ^ fillBits[eoff]) - } - mask = fillBits[7] ^ mask - - return bitsInByte[val&mask] -} - -func (db *DB) bCountSeg(key []byte, seq uint32, soff uint32, eoff uint32) (cnt int32, err error) { - if soff >= segBitSize || soff < 0 || - eoff >= segBitSize || eoff < 0 { - return - } - - var segment []byte - if _, segment, err = db.bGetSegment(key, seq); err != nil { - return - } - - if segment == nil { - return - } - - if soff > eoff { - soff, eoff = eoff, soff - } - - headIdx := int(soff >> 3) - endIdx := int(eoff >> 3) - sByteOff := soff - ((soff >> 3) << 3) - eByteOff := eoff - ((eoff >> 3) << 3) - - if headIdx == endIdx { - cnt = db.bCountByte(segment[headIdx], sByteOff, eByteOff) - } else { - cnt = db.bCountByte(segment[headIdx], sByteOff, 7) + - db.bCountByte(segment[endIdx], 0, eByteOff) - } - - // sum up following bytes - for idx, end := headIdx+1, endIdx-1; idx <= end; idx += 1 { - cnt += bitsInByte[segment[idx]] - if idx == end { - break - } - } - - return -} - -func (db *DB) BGet(key []byte) (data []byte, err error) { - if err = checkKeySize(key); err != nil { - return - } - - var ts, to int32 - if ts, to, err = db.bGetMeta(key); err != nil || ts < 0 { - return - } - - var tailSeq, tailOff = uint32(ts), uint32(to) - var capByteSize uint32 = 
db.bCapByteSize(tailSeq, tailOff) - data = make([]byte, capByteSize, capByteSize) - - minKey := db.bEncodeBinKey(key, minSeq) - maxKey := db.bEncodeBinKey(key, tailSeq) - it := db.bucket.RangeIterator(minKey, maxKey, store.RangeClose) - - var seq, s, e uint32 - for ; it.Valid(); it.Next() { - if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { - data = nil - break - } - - s = seq << segByteWidth - e = num.MinUint32(s+segByteSize, capByteSize) - copy(data[s:e], it.RawValue()) - } - it.Close() - - return -} - -func (db *DB) BDelete(key []byte) (drop int64, err error) { - if err = checkKeySize(key); err != nil { - return - } - - t := db.binBatch - t.Lock() - defer t.Unlock() - - drop = db.bDelete(t, key) - db.rmExpire(t, BitType, key) - - err = t.Commit() - return -} - -func (db *DB) BSetBit(key []byte, offset int32, val uint8) (ori uint8, err error) { - if err = checkKeySize(key); err != nil { - return - } - - // todo : check offset - var seq, off uint32 - if seq, off, err = db.bParseOffset(key, offset); err != nil { - return 0, err - } - - var bk, segment []byte - if bk, segment, err = db.bAllocateSegment(key, seq); err != nil { - return 0, err - } - - if segment != nil { - ori = getBit(segment, off) - if setBit(segment, off, val) { - t := db.binBatch - t.Lock() - defer t.Unlock() - - t.Put(bk, segment) - if _, _, e := db.bUpdateMeta(t, key, seq, off); e != nil { - err = e - return - } - - err = t.Commit() - } - } - - return -} - -func (db *DB) BMSetBit(key []byte, args ...BitPair) (place int64, err error) { - if err = checkKeySize(key); err != nil { - return - } - - // (ps : so as to aviod wasting memory copy while calling db.Get() and batch.Put(), - // here we sequence the params by pos, so that we can merge the execution of - // diff pos setting which targets on the same segment respectively. 
) - - // #1 : sequence request data - var argCnt = len(args) - var bitInfos segBitInfoArray = make(segBitInfoArray, argCnt) - var seq, off uint32 - - for i, info := range args { - if seq, off, err = db.bParseOffset(key, info.Pos); err != nil { - return - } - - bitInfos[i].Seq = seq - bitInfos[i].Off = off - bitInfos[i].Val = info.Val - } - - sort.Sort(bitInfos) - - for i := 1; i < argCnt; i++ { - if bitInfos[i].Seq == bitInfos[i-1].Seq && bitInfos[i].Off == bitInfos[i-1].Off { - return 0, errDuplicatePos - } - } - - // #2 : execute bit set in order - t := db.binBatch - t.Lock() - defer t.Unlock() - - var curBinKey, curSeg []byte - var curSeq, maxSeq, maxOff uint32 - - for _, info := range bitInfos { - if curSeg != nil && info.Seq != curSeq { - t.Put(curBinKey, curSeg) - curSeg = nil - } - - if curSeg == nil { - curSeq = info.Seq - if curBinKey, curSeg, err = db.bAllocateSegment(key, info.Seq); err != nil { - return - } - - if curSeg == nil { - continue - } - } - - if setBit(curSeg, info.Off, info.Val) { - maxSeq = info.Seq - maxOff = info.Off - place++ - } - } - - if curSeg != nil { - t.Put(curBinKey, curSeg) - } - - // finally, update meta - if place > 0 { - if _, _, err = db.bUpdateMeta(t, key, maxSeq, maxOff); err != nil { - return - } - - err = t.Commit() - } - - return -} - -func (db *DB) BGetBit(key []byte, offset int32) (uint8, error) { - if seq, off, err := db.bParseOffset(key, offset); err != nil { - return 0, err - } else { - _, segment, err := db.bGetSegment(key, seq) - if err != nil { - return 0, err - } - - if segment == nil { - return 0, nil - } else { - return getBit(segment, off), nil - } - } -} - -// func (db *DB) BGetRange(key []byte, start int32, end int32) ([]byte, error) { -// section := make([]byte) - -// return -// } - -func (db *DB) BCount(key []byte, start int32, end int32) (cnt int32, err error) { - var sseq, soff uint32 - if sseq, soff, err = db.bParseOffset(key, start); err != nil { - return - } - - var eseq, eoff uint32 - if eseq, eoff, err = db.bParseOffset(key, end); err != nil { - return - } - - if sseq > eseq || (sseq == eseq && soff > eoff) { - sseq, eseq = eseq, sseq - soff, eoff = eoff, soff - } - - var segCnt int32 - if eseq == sseq { - if segCnt, err = db.bCountSeg(key, sseq, soff, eoff); err != nil { - return 0, err - } - - cnt = segCnt - - } else { - if segCnt, err = db.bCountSeg(key, sseq, soff, segBitSize-1); err != nil { - return 0, err - } else { - cnt += segCnt - } - - if segCnt, err = db.bCountSeg(key, eseq, 0, eoff); err != nil { - return 0, err - } else { - cnt += segCnt - } - } - - // middle segs - var segment []byte - skey := db.bEncodeBinKey(key, sseq) - ekey := db.bEncodeBinKey(key, eseq) - - it := db.bucket.RangeIterator(skey, ekey, store.RangeOpen) - for ; it.Valid(); it.Next() { - segment = it.RawValue() - for _, bt := range segment { - cnt += bitsInByte[bt] - } - } - it.Close() - - return -} - -func (db *DB) BTail(key []byte) (int32, error) { - // effective length of data, the highest bit-pos set in history - tailSeq, tailOff, err := db.bGetMeta(key) - if err != nil { - return 0, err - } - - tail := int32(-1) - if tailSeq >= 0 { - tail = int32(uint32(tailSeq)< maxDstSeq || (seq == maxDstSeq && off > maxDstOff) { - maxDstSeq = seq - maxDstOff = off - } - } - - if (op == OPnot && validKeyNum != 1) || - (op != OPnot && validKeyNum < 2) { - return // with not enough existing source key - } - - var srcIdx int - for srcIdx = 0; srcIdx < keyNum; srcIdx++ { - if srckeys[srcIdx] != nil { - break - } - } - - // init - data - var segments = 
make([][]byte, maxDstSeq+1) - - if op == OPnot { - // ps : - // ( ~num == num ^ 0x11111111 ) - // we init the result segments with all bit set, - // then we can calculate through the way of 'xor'. - - // ahead segments bin format : 1111 ... 1111 - for i := uint32(0); i < maxDstSeq; i++ { - segments[i] = fillSegment - } - - // last segment bin format : 1111..1100..0000 - var tailSeg = make([]byte, segByteSize, segByteSize) - var fillByte = fillBits[7] - var tailSegLen = db.bCapByteSize(uint32(0), maxDstOff) - for i := uint32(0); i < tailSegLen-1; i++ { - tailSeg[i] = fillByte - } - tailSeg[tailSegLen-1] = fillBits[maxDstOff-(tailSegLen-1)<<3] - segments[maxDstSeq] = tailSeg - - } else { - // ps : init segments by data corresponding to the 1st valid source key - it := db.bIterator(srckeys[srcIdx]) - for ; it.Valid(); it.Next() { - if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { - // to do ... - it.Close() - return - } - segments[seq] = it.Value() - } - it.Close() - srcIdx++ - } - - // operation with following keys - var res []byte - for i := srcIdx; i < keyNum; i++ { - if srckeys[i] == nil { - continue - } - - it := db.bIterator(srckeys[i]) - for idx, end := uint32(0), false; !end; it.Next() { - end = !it.Valid() - if !end { - if _, seq, err = db.bDecodeBinKey(it.RawKey()); err != nil { - // to do ... - it.Close() - return - } - } else { - seq = maxDstSeq + 1 - } - - // todo : - // operation 'and' can be optimize here : - // if seq > max_segments_idx, this loop can be break, - // which can avoid cost from Key() and bDecodeBinKey() - - for ; idx < seq; idx++ { - res = nil - exeOp(segments[idx], nil, &res) - segments[idx] = res - } - - if !end { - res = it.Value() - exeOp(segments[seq], res, &res) - segments[seq] = res - idx++ - } - } - it.Close() - } - - // clear the old data in case - db.bDelete(t, dstkey) - db.rmExpire(t, BitType, dstkey) - - // set data - db.bSetMeta(t, dstkey, maxDstSeq, maxDstOff) - - var bk []byte - for seq, segt := range segments { - if segt != nil { - bk = db.bEncodeBinKey(dstkey, uint32(seq)) - t.Put(bk, segt) - } - } - - err = t.Commit() - if err == nil { - // blen = int32(db.bCapByteSize(maxDstOff, maxDstOff)) - blen = int32(maxDstSeq< MaxKeySize || len(key) == 0 { - return errKeySize - } else if len(field) > MaxHashFieldSize || len(field) == 0 { - return errHashFieldSize - } - return nil -} - -func (db *DB) hEncodeSizeKey(key []byte) []byte { - buf := make([]byte, len(key)+2) - - buf[0] = db.index - buf[1] = HSizeType - - copy(buf[2:], key) - return buf -} - -func (db *DB) hDecodeSizeKey(ek []byte) ([]byte, error) { - if len(ek) < 2 || ek[0] != db.index || ek[1] != HSizeType { - return nil, errHSizeKey - } - - return ek[2:], nil -} - -func (db *DB) hEncodeHashKey(key []byte, field []byte) []byte { - buf := make([]byte, len(key)+len(field)+1+1+2+1) - - pos := 0 - buf[pos] = db.index - pos++ - buf[pos] = HashType - pos++ - - binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) - pos += 2 - - copy(buf[pos:], key) - pos += len(key) - - buf[pos] = hashStartSep - pos++ - copy(buf[pos:], field) - - return buf -} - -func (db *DB) hDecodeHashKey(ek []byte) ([]byte, []byte, error) { - if len(ek) < 5 || ek[0] != db.index || ek[1] != HashType { - return nil, nil, errHashKey - } - - pos := 2 - keyLen := int(binary.BigEndian.Uint16(ek[pos:])) - pos += 2 - - if keyLen+5 > len(ek) { - return nil, nil, errHashKey - } - - key := ek[pos : pos+keyLen] - pos += keyLen - - if ek[pos] != hashStartSep { - return nil, nil, errHashKey - } - - pos++ - field := ek[pos:] - 
return key, field, nil -} - -func (db *DB) hEncodeStartKey(key []byte) []byte { - return db.hEncodeHashKey(key, nil) -} - -func (db *DB) hEncodeStopKey(key []byte) []byte { - k := db.hEncodeHashKey(key, nil) - - k[len(k)-1] = hashStopSep - - return k -} - -func (db *DB) hSetItem(key []byte, field []byte, value []byte) (int64, error) { - t := db.hashBatch - - ek := db.hEncodeHashKey(key, field) - - var n int64 = 1 - if v, _ := db.bucket.Get(ek); v != nil { - n = 0 - } else { - if _, err := db.hIncrSize(key, 1); err != nil { - return 0, err - } - } - - t.Put(ek, value) - return n, nil -} - -// ps : here just focus on deleting the hash data, -// any other likes expire is ignore. -func (db *DB) hDelete(t *batch, key []byte) int64 { - sk := db.hEncodeSizeKey(key) - start := db.hEncodeStartKey(key) - stop := db.hEncodeStopKey(key) - - var num int64 = 0 - it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) - for ; it.Valid(); it.Next() { - t.Delete(it.Key()) - num++ - } - it.Close() - - t.Delete(sk) - return num -} - -func (db *DB) hExpireAt(key []byte, when int64) (int64, error) { - t := db.hashBatch - t.Lock() - defer t.Unlock() - - if hlen, err := db.HLen(key); err != nil || hlen == 0 { - return 0, err - } else { - db.expireAt(t, HashType, key, when) - if err := t.Commit(); err != nil { - return 0, err - } - } - return 1, nil -} - -func (db *DB) HLen(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - return Int64(db.bucket.Get(db.hEncodeSizeKey(key))) -} - -func (db *DB) HSet(key []byte, field []byte, value []byte) (int64, error) { - if err := checkHashKFSize(key, field); err != nil { - return 0, err - } else if err := checkValueSize(value); err != nil { - return 0, err - } - - t := db.hashBatch - t.Lock() - defer t.Unlock() - - n, err := db.hSetItem(key, field, value) - if err != nil { - return 0, err - } - - err = t.Commit() - return n, err -} - -func (db *DB) HGet(key []byte, field []byte) ([]byte, error) { - if err := checkHashKFSize(key, field); err != nil { - return nil, err - } - - return db.bucket.Get(db.hEncodeHashKey(key, field)) -} - -func (db *DB) HMset(key []byte, args ...FVPair) error { - t := db.hashBatch - t.Lock() - defer t.Unlock() - - var err error - var ek []byte - var num int64 = 0 - for i := 0; i < len(args); i++ { - if err := checkHashKFSize(key, args[i].Field); err != nil { - return err - } else if err := checkValueSize(args[i].Value); err != nil { - return err - } - - ek = db.hEncodeHashKey(key, args[i].Field) - - if v, err := db.bucket.Get(ek); err != nil { - return err - } else if v == nil { - num++ - } - - t.Put(ek, args[i].Value) - } - - if _, err = db.hIncrSize(key, num); err != nil { - return err - } - - //todo add binglog - err = t.Commit() - return err -} - -func (db *DB) HMget(key []byte, args ...[]byte) ([][]byte, error) { - var ek []byte - - it := db.bucket.NewIterator() - defer it.Close() - - r := make([][]byte, len(args)) - for i := 0; i < len(args); i++ { - if err := checkHashKFSize(key, args[i]); err != nil { - return nil, err - } - - ek = db.hEncodeHashKey(key, args[i]) - - r[i] = it.Find(ek) - } - - return r, nil -} - -func (db *DB) HDel(key []byte, args ...[]byte) (int64, error) { - t := db.hashBatch - - var ek []byte - var v []byte - var err error - - t.Lock() - defer t.Unlock() - - it := db.bucket.NewIterator() - defer it.Close() - - var num int64 = 0 - for i := 0; i < len(args); i++ { - if err := checkHashKFSize(key, args[i]); err != nil { - return 0, err - } - - ek = 
db.hEncodeHashKey(key, args[i]) - - v = it.RawFind(ek) - if v == nil { - continue - } else { - num++ - t.Delete(ek) - } - } - - if _, err = db.hIncrSize(key, -num); err != nil { - return 0, err - } - - err = t.Commit() - - return num, err -} - -func (db *DB) hIncrSize(key []byte, delta int64) (int64, error) { - t := db.hashBatch - sk := db.hEncodeSizeKey(key) - - var err error - var size int64 = 0 - if size, err = Int64(db.bucket.Get(sk)); err != nil { - return 0, err - } else { - size += delta - if size <= 0 { - size = 0 - t.Delete(sk) - db.rmExpire(t, HashType, key) - } else { - t.Put(sk, PutInt64(size)) - } - } - - return size, nil -} - -func (db *DB) HIncrBy(key []byte, field []byte, delta int64) (int64, error) { - if err := checkHashKFSize(key, field); err != nil { - return 0, err - } - - t := db.hashBatch - var ek []byte - var err error - - t.Lock() - defer t.Unlock() - - ek = db.hEncodeHashKey(key, field) - - var n int64 = 0 - if n, err = StrInt64(db.bucket.Get(ek)); err != nil { - return 0, err - } - - n += delta - - _, err = db.hSetItem(key, field, num.FormatInt64ToSlice(n)) - if err != nil { - return 0, err - } - - err = t.Commit() - - return n, err -} - -func (db *DB) HGetAll(key []byte) ([]FVPair, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - start := db.hEncodeStartKey(key) - stop := db.hEncodeStopKey(key) - - v := make([]FVPair, 0, 16) - - it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) - for ; it.Valid(); it.Next() { - _, f, err := db.hDecodeHashKey(it.Key()) - if err != nil { - return nil, err - } - - v = append(v, FVPair{Field: f, Value: it.Value()}) - } - - it.Close() - - return v, nil -} - -func (db *DB) HKeys(key []byte) ([][]byte, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - start := db.hEncodeStartKey(key) - stop := db.hEncodeStopKey(key) - - v := make([][]byte, 0, 16) - - it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) - for ; it.Valid(); it.Next() { - _, f, err := db.hDecodeHashKey(it.Key()) - if err != nil { - return nil, err - } - v = append(v, f) - } - - it.Close() - - return v, nil -} - -func (db *DB) HValues(key []byte) ([][]byte, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - start := db.hEncodeStartKey(key) - stop := db.hEncodeStopKey(key) - - v := make([][]byte, 0, 16) - - it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) - for ; it.Valid(); it.Next() { - _, _, err := db.hDecodeHashKey(it.Key()) - if err != nil { - return nil, err - } - - v = append(v, it.Value()) - } - - it.Close() - - return v, nil -} - -func (db *DB) HClear(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - t := db.hashBatch - t.Lock() - defer t.Unlock() - - num := db.hDelete(t, key) - db.rmExpire(t, HashType, key) - - err := t.Commit() - return num, err -} - -func (db *DB) HMclear(keys ...[]byte) (int64, error) { - t := db.hashBatch - t.Lock() - defer t.Unlock() - - for _, key := range keys { - if err := checkKeySize(key); err != nil { - return 0, err - } - - db.hDelete(t, key) - db.rmExpire(t, HashType, key) - } - - err := t.Commit() - return int64(len(keys)), err -} - -func (db *DB) hFlush() (drop int64, err error) { - t := db.hashBatch - - t.Lock() - defer t.Unlock() - - return db.flushType(t, HashType) -} - -func (db *DB) HScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.scan(HSizeType, key, count, inclusive, match) -} - -func 
(db *DB) HRevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.revscan(HSizeType, key, count, inclusive, match) -} - -func (db *DB) HExpire(key []byte, duration int64) (int64, error) { - if duration <= 0 { - return 0, errExpireValue - } - - return db.hExpireAt(key, time.Now().Unix()+duration) -} - -func (db *DB) HExpireAt(key []byte, when int64) (int64, error) { - if when <= time.Now().Unix() { - return 0, errExpireValue - } - - return db.hExpireAt(key, when) -} - -func (db *DB) HTTL(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return -1, err - } - - return db.ttl(HashType, key) -} - -func (db *DB) HPersist(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - t := db.hashBatch - t.Lock() - defer t.Unlock() - - n, err := db.rmExpire(t, HashType, key) - if err != nil { - return 0, err - } - - err = t.Commit() - return n, err -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_kv.go b/vendor/github.com/siddontang/ledisdb/ledis/t_kv.go deleted file mode 100644 index 37315c18e452..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/t_kv.go +++ /dev/null @@ -1,396 +0,0 @@ -package ledis - -import ( - "errors" - "github.com/siddontang/go/num" - "github.com/siddontang/ledisdb/store" - "time" -) - -type KVPair struct { - Key []byte - Value []byte -} - -var errKVKey = errors.New("invalid encode kv key") - -func checkKeySize(key []byte) error { - if len(key) > MaxKeySize || len(key) == 0 { - return errKeySize - } - return nil -} - -func checkValueSize(value []byte) error { - if len(value) > MaxValueSize { - return errValueSize - } - - return nil -} - -func (db *DB) encodeKVKey(key []byte) []byte { - ek := make([]byte, len(key)+2) - ek[0] = db.index - ek[1] = KVType - copy(ek[2:], key) - return ek -} - -func (db *DB) decodeKVKey(ek []byte) ([]byte, error) { - if len(ek) < 2 || ek[0] != db.index || ek[1] != KVType { - return nil, errKVKey - } - - return ek[2:], nil -} - -func (db *DB) encodeKVMinKey() []byte { - ek := db.encodeKVKey(nil) - return ek -} - -func (db *DB) encodeKVMaxKey() []byte { - ek := db.encodeKVKey(nil) - ek[len(ek)-1] = KVType + 1 - return ek -} - -func (db *DB) incr(key []byte, delta int64) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - var err error - key = db.encodeKVKey(key) - - t := db.kvBatch - - t.Lock() - defer t.Unlock() - - var n int64 - n, err = StrInt64(db.bucket.Get(key)) - if err != nil { - return 0, err - } - - n += delta - - t.Put(key, num.FormatInt64ToSlice(n)) - - err = t.Commit() - return n, err -} - -// ps : here just focus on deleting the key-value data, -// any other likes expire is ignore. 
-func (db *DB) delete(t *batch, key []byte) int64 { - key = db.encodeKVKey(key) - t.Delete(key) - return 1 -} - -func (db *DB) setExpireAt(key []byte, when int64) (int64, error) { - t := db.kvBatch - t.Lock() - defer t.Unlock() - - if exist, err := db.Exists(key); err != nil || exist == 0 { - return 0, err - } else { - db.expireAt(t, KVType, key, when) - if err := t.Commit(); err != nil { - return 0, err - } - } - return 1, nil -} - -func (db *DB) Decr(key []byte) (int64, error) { - return db.incr(key, -1) -} - -func (db *DB) DecrBy(key []byte, decrement int64) (int64, error) { - return db.incr(key, -decrement) -} - -func (db *DB) Del(keys ...[]byte) (int64, error) { - if len(keys) == 0 { - return 0, nil - } - - codedKeys := make([][]byte, len(keys)) - for i, k := range keys { - codedKeys[i] = db.encodeKVKey(k) - } - - t := db.kvBatch - t.Lock() - defer t.Unlock() - - for i, k := range keys { - t.Delete(codedKeys[i]) - db.rmExpire(t, KVType, k) - } - - err := t.Commit() - return int64(len(keys)), err -} - -func (db *DB) Exists(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - var err error - key = db.encodeKVKey(key) - - var v []byte - v, err = db.bucket.Get(key) - if v != nil && err == nil { - return 1, nil - } - - return 0, err -} - -func (db *DB) Get(key []byte) ([]byte, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - key = db.encodeKVKey(key) - - return db.bucket.Get(key) -} - -func (db *DB) GetSlice(key []byte) (store.Slice, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - key = db.encodeKVKey(key) - - return db.bucket.GetSlice(key) -} - -func (db *DB) GetSet(key []byte, value []byte) ([]byte, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } else if err := checkValueSize(value); err != nil { - return nil, err - } - - key = db.encodeKVKey(key) - - t := db.kvBatch - - t.Lock() - defer t.Unlock() - - oldValue, err := db.bucket.Get(key) - if err != nil { - return nil, err - } - - t.Put(key, value) - - err = t.Commit() - - return oldValue, err -} - -func (db *DB) Incr(key []byte) (int64, error) { - return db.incr(key, 1) -} - -func (db *DB) IncrBy(key []byte, increment int64) (int64, error) { - return db.incr(key, increment) -} - -func (db *DB) MGet(keys ...[]byte) ([][]byte, error) { - values := make([][]byte, len(keys)) - - it := db.bucket.NewIterator() - defer it.Close() - - for i := range keys { - if err := checkKeySize(keys[i]); err != nil { - return nil, err - } - - values[i] = it.Find(db.encodeKVKey(keys[i])) - } - - return values, nil -} - -func (db *DB) MSet(args ...KVPair) error { - if len(args) == 0 { - return nil - } - - t := db.kvBatch - - var err error - var key []byte - var value []byte - - t.Lock() - defer t.Unlock() - - for i := 0; i < len(args); i++ { - if err := checkKeySize(args[i].Key); err != nil { - return err - } else if err := checkValueSize(args[i].Value); err != nil { - return err - } - - key = db.encodeKVKey(args[i].Key) - - value = args[i].Value - - t.Put(key, value) - - } - - err = t.Commit() - return err -} - -func (db *DB) Set(key []byte, value []byte) error { - if err := checkKeySize(key); err != nil { - return err - } else if err := checkValueSize(value); err != nil { - return err - } - - var err error - key = db.encodeKVKey(key) - - t := db.kvBatch - - t.Lock() - defer t.Unlock() - - t.Put(key, value) - - err = t.Commit() - - return err -} - -func (db *DB) SetNX(key []byte, value []byte) (int64, error) { - if err := 
checkKeySize(key); err != nil { - return 0, err - } else if err := checkValueSize(value); err != nil { - return 0, err - } - - var err error - key = db.encodeKVKey(key) - - var n int64 = 1 - - t := db.kvBatch - - t.Lock() - defer t.Unlock() - - if v, err := db.bucket.Get(key); err != nil { - return 0, err - } else if v != nil { - n = 0 - } else { - t.Put(key, value) - - err = t.Commit() - } - - return n, err -} - -func (db *DB) SetEX(key []byte, duration int64, value []byte) error { - if err := checkKeySize(key); err != nil { - return err - } else if err := checkValueSize(value); err != nil { - return err - } else if duration <= 0 { - return errExpireValue - } - - ek := db.encodeKVKey(key) - - t := db.kvBatch - - t.Lock() - defer t.Unlock() - - t.Put(ek, value) - db.expireAt(t, KVType, key, time.Now().Unix()+duration) - - if err := t.Commit(); err != nil { - return err - } - - return nil -} - -func (db *DB) flush() (drop int64, err error) { - t := db.kvBatch - t.Lock() - defer t.Unlock() - return db.flushType(t, KVType) -} - -//if inclusive is true, scan range [key, inf) else (key, inf) -func (db *DB) Scan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.scan(KVType, key, count, inclusive, match) -} - -//if inclusive is true, revscan range (-inf, key] else (inf, key) -func (db *DB) RevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.revscan(KVType, key, count, inclusive, match) -} - -func (db *DB) Expire(key []byte, duration int64) (int64, error) { - if duration <= 0 { - return 0, errExpireValue - } - - return db.setExpireAt(key, time.Now().Unix()+duration) -} - -func (db *DB) ExpireAt(key []byte, when int64) (int64, error) { - if when <= time.Now().Unix() { - return 0, errExpireValue - } - - return db.setExpireAt(key, when) -} - -func (db *DB) TTL(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return -1, err - } - - return db.ttl(KVType, key) -} - -func (db *DB) Persist(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - t := db.kvBatch - t.Lock() - defer t.Unlock() - n, err := db.rmExpire(t, KVType, key) - if err != nil { - return 0, err - } - - err = t.Commit() - return n, err -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_list.go b/vendor/github.com/siddontang/ledisdb/ledis/t_list.go deleted file mode 100644 index 7f66bed75664..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/t_list.go +++ /dev/null @@ -1,665 +0,0 @@ -package ledis - -import ( - "container/list" - "encoding/binary" - "errors" - "github.com/siddontang/go/hack" - "github.com/siddontang/ledisdb/store" - "sync" - "time" -) - -const ( - listHeadSeq int32 = 1 - listTailSeq int32 = 2 - - listMinSeq int32 = 1000 - listMaxSeq int32 = 1<<31 - 1000 - listInitialSeq int32 = listMinSeq + (listMaxSeq-listMinSeq)/2 -) - -var errLMetaKey = errors.New("invalid lmeta key") -var errListKey = errors.New("invalid list key") -var errListSeq = errors.New("invalid list sequence, overflow") - -func (db *DB) lEncodeMetaKey(key []byte) []byte { - buf := make([]byte, len(key)+2) - buf[0] = db.index - buf[1] = LMetaType - - copy(buf[2:], key) - return buf -} - -func (db *DB) lDecodeMetaKey(ek []byte) ([]byte, error) { - if len(ek) < 2 || ek[0] != db.index || ek[1] != LMetaType { - return nil, errLMetaKey - } - - return ek[2:], nil -} - -func (db *DB) lEncodeListKey(key []byte, seq int32) []byte { - buf := make([]byte, len(key)+8) - - pos := 0 - buf[pos] = db.index - pos++ - 
buf[pos] = ListType - pos++ - - binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) - pos += 2 - - copy(buf[pos:], key) - pos += len(key) - - binary.BigEndian.PutUint32(buf[pos:], uint32(seq)) - - return buf -} - -func (db *DB) lDecodeListKey(ek []byte) (key []byte, seq int32, err error) { - if len(ek) < 8 || ek[0] != db.index || ek[1] != ListType { - err = errListKey - return - } - - keyLen := int(binary.BigEndian.Uint16(ek[2:])) - if keyLen+8 != len(ek) { - err = errListKey - return - } - - key = ek[4 : 4+keyLen] - seq = int32(binary.BigEndian.Uint32(ek[4+keyLen:])) - return -} - -func (db *DB) lpush(key []byte, whereSeq int32, args ...[]byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - var headSeq int32 - var tailSeq int32 - var size int32 - var err error - - t := db.listBatch - t.Lock() - defer t.Unlock() - - metaKey := db.lEncodeMetaKey(key) - headSeq, tailSeq, size, err = db.lGetMeta(nil, metaKey) - if err != nil { - return 0, err - } - - var pushCnt int = len(args) - if pushCnt == 0 { - return int64(size), nil - } - - var seq int32 = headSeq - var delta int32 = -1 - if whereSeq == listTailSeq { - seq = tailSeq - delta = 1 - } - - // append elements - if size > 0 { - seq += delta - } - - for i := 0; i < pushCnt; i++ { - ek := db.lEncodeListKey(key, seq+int32(i)*delta) - t.Put(ek, args[i]) - } - - seq += int32(pushCnt-1) * delta - if seq <= listMinSeq || seq >= listMaxSeq { - return 0, errListSeq - } - - // set meta info - if whereSeq == listHeadSeq { - headSeq = seq - } else { - tailSeq = seq - } - - db.lSetMeta(metaKey, headSeq, tailSeq) - - err = t.Commit() - - if err == nil { - db.lSignalAsReady(key, pushCnt) - } - - return int64(size) + int64(pushCnt), err -} - -func (db *DB) lpop(key []byte, whereSeq int32) ([]byte, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - t := db.listBatch - t.Lock() - defer t.Unlock() - - var headSeq int32 - var tailSeq int32 - var err error - - metaKey := db.lEncodeMetaKey(key) - headSeq, tailSeq, _, err = db.lGetMeta(nil, metaKey) - if err != nil { - return nil, err - } - - var value []byte - - var seq int32 = headSeq - if whereSeq == listTailSeq { - seq = tailSeq - } - - itemKey := db.lEncodeListKey(key, seq) - value, err = db.bucket.Get(itemKey) - if err != nil { - return nil, err - } - - if whereSeq == listHeadSeq { - headSeq += 1 - } else { - tailSeq -= 1 - } - - t.Delete(itemKey) - size := db.lSetMeta(metaKey, headSeq, tailSeq) - if size == 0 { - db.rmExpire(t, HashType, key) - } - - err = t.Commit() - return value, err -} - -// ps : here just focus on deleting the list data, -// any other likes expire is ignore. 
-func (db *DB) lDelete(t *batch, key []byte) int64 { - mk := db.lEncodeMetaKey(key) - - var headSeq int32 - var tailSeq int32 - var err error - - it := db.bucket.NewIterator() - defer it.Close() - - headSeq, tailSeq, _, err = db.lGetMeta(it, mk) - if err != nil { - return 0 - } - - var num int64 = 0 - startKey := db.lEncodeListKey(key, headSeq) - stopKey := db.lEncodeListKey(key, tailSeq) - - rit := store.NewRangeIterator(it, &store.Range{startKey, stopKey, store.RangeClose}) - for ; rit.Valid(); rit.Next() { - t.Delete(rit.RawKey()) - num++ - } - - t.Delete(mk) - - return num -} - -func (db *DB) lGetMeta(it *store.Iterator, ek []byte) (headSeq int32, tailSeq int32, size int32, err error) { - var v []byte - if it != nil { - v = it.Find(ek) - } else { - v, err = db.bucket.Get(ek) - } - if err != nil { - return - } else if v == nil { - headSeq = listInitialSeq - tailSeq = listInitialSeq - size = 0 - return - } else { - headSeq = int32(binary.LittleEndian.Uint32(v[0:4])) - tailSeq = int32(binary.LittleEndian.Uint32(v[4:8])) - size = tailSeq - headSeq + 1 - } - return -} - -func (db *DB) lSetMeta(ek []byte, headSeq int32, tailSeq int32) int32 { - t := db.listBatch - - var size int32 = tailSeq - headSeq + 1 - if size < 0 { - // todo : log error + panic - } else if size == 0 { - t.Delete(ek) - } else { - buf := make([]byte, 8) - - binary.LittleEndian.PutUint32(buf[0:4], uint32(headSeq)) - binary.LittleEndian.PutUint32(buf[4:8], uint32(tailSeq)) - - t.Put(ek, buf) - } - - return size -} - -func (db *DB) lExpireAt(key []byte, when int64) (int64, error) { - t := db.listBatch - t.Lock() - defer t.Unlock() - - if llen, err := db.LLen(key); err != nil || llen == 0 { - return 0, err - } else { - db.expireAt(t, ListType, key, when) - if err := t.Commit(); err != nil { - return 0, err - } - } - return 1, nil -} - -func (db *DB) LIndex(key []byte, index int32) ([]byte, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - var seq int32 - var headSeq int32 - var tailSeq int32 - var err error - - metaKey := db.lEncodeMetaKey(key) - - it := db.bucket.NewIterator() - defer it.Close() - - headSeq, tailSeq, _, err = db.lGetMeta(it, metaKey) - if err != nil { - return nil, err - } - - if index >= 0 { - seq = headSeq + index - } else { - seq = tailSeq + index + 1 - } - - sk := db.lEncodeListKey(key, seq) - v := it.Find(sk) - - return v, nil -} - -func (db *DB) LLen(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - ek := db.lEncodeMetaKey(key) - _, _, size, err := db.lGetMeta(nil, ek) - return int64(size), err -} - -func (db *DB) LPop(key []byte) ([]byte, error) { - return db.lpop(key, listHeadSeq) -} - -func (db *DB) LPush(key []byte, args ...[]byte) (int64, error) { - return db.lpush(key, listHeadSeq, args...) 
-} - -func (db *DB) LRange(key []byte, start int32, stop int32) ([][]byte, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - var headSeq int32 - var llen int32 - var err error - - metaKey := db.lEncodeMetaKey(key) - - it := db.bucket.NewIterator() - defer it.Close() - - if headSeq, _, llen, err = db.lGetMeta(it, metaKey); err != nil { - return nil, err - } - - if start < 0 { - start = llen + start - } - if stop < 0 { - stop = llen + stop - } - if start < 0 { - start = 0 - } - - if start > stop || start >= llen { - return [][]byte{}, nil - } - - if stop >= llen { - stop = llen - 1 - } - - limit := (stop - start) + 1 - headSeq += start - - v := make([][]byte, 0, limit) - - startKey := db.lEncodeListKey(key, headSeq) - rit := store.NewRangeLimitIterator(it, - &store.Range{ - Min: startKey, - Max: nil, - Type: store.RangeClose}, - &store.Limit{ - Offset: 0, - Count: int(limit)}) - - for ; rit.Valid(); rit.Next() { - v = append(v, rit.Value()) - } - - return v, nil -} - -func (db *DB) RPop(key []byte) ([]byte, error) { - return db.lpop(key, listTailSeq) -} - -func (db *DB) RPush(key []byte, args ...[]byte) (int64, error) { - return db.lpush(key, listTailSeq, args...) -} - -func (db *DB) LClear(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - t := db.listBatch - t.Lock() - defer t.Unlock() - - num := db.lDelete(t, key) - db.rmExpire(t, ListType, key) - - err := t.Commit() - return num, err -} - -func (db *DB) LMclear(keys ...[]byte) (int64, error) { - t := db.listBatch - t.Lock() - defer t.Unlock() - - for _, key := range keys { - if err := checkKeySize(key); err != nil { - return 0, err - } - - db.lDelete(t, key) - db.rmExpire(t, ListType, key) - - } - - err := t.Commit() - return int64(len(keys)), err -} - -func (db *DB) lFlush() (drop int64, err error) { - t := db.listBatch - t.Lock() - defer t.Unlock() - return db.flushType(t, ListType) -} - -func (db *DB) LExpire(key []byte, duration int64) (int64, error) { - if duration <= 0 { - return 0, errExpireValue - } - - return db.lExpireAt(key, time.Now().Unix()+duration) -} - -func (db *DB) LExpireAt(key []byte, when int64) (int64, error) { - if when <= time.Now().Unix() { - return 0, errExpireValue - } - - return db.lExpireAt(key, when) -} - -func (db *DB) LTTL(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return -1, err - } - - return db.ttl(ListType, key) -} - -func (db *DB) LPersist(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - t := db.listBatch - t.Lock() - defer t.Unlock() - - n, err := db.rmExpire(t, ListType, key) - if err != nil { - return 0, err - } - - err = t.Commit() - return n, err -} - -func (db *DB) LScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.scan(LMetaType, key, count, inclusive, match) -} - -func (db *DB) LRevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.revscan(LMetaType, key, count, inclusive, match) -} - -func (db *DB) lEncodeMinKey() []byte { - return db.lEncodeMetaKey(nil) -} - -func (db *DB) lEncodeMaxKey() []byte { - ek := db.lEncodeMetaKey(nil) - ek[len(ek)-1] = LMetaType + 1 - return ek -} - -func (db *DB) BLPop(keys [][]byte, timeout time.Duration) ([]interface{}, error) { - return db.lblockPop(keys, listHeadSeq, timeout) -} - -func (db *DB) BRPop(keys [][]byte, timeout time.Duration) ([]interface{}, error) { - return db.lblockPop(keys, listTailSeq, timeout) -} - -func (db 
*DB) lblockPop(keys [][]byte, whereSeq int32, timeout time.Duration) ([]interface{}, error) { - ch := make(chan []byte) - - bkeys := [][]byte{} - for _, key := range keys { - v, err := db.lpop(key, whereSeq) - if err != nil { - return nil, err - } else if v != nil { - return []interface{}{key, v}, nil - } else { - if db.IsAutoCommit() { - //block wait can not be supported in transaction and multi - db.lbkeys.wait(key, ch) - bkeys = append(bkeys, key) - } - } - } - if len(bkeys) == 0 { - return nil, nil - } - - defer func() { - for _, key := range bkeys { - db.lbkeys.unwait(key, ch) - } - }() - - deadT := time.Now().Add(timeout) - - for { - if timeout == 0 { - key := <-ch - if v, err := db.lpop(key, whereSeq); err != nil { - return nil, err - } else if v == nil { - continue - } else { - return []interface{}{key, v}, nil - } - } else { - d := deadT.Sub(time.Now()) - if d < 0 { - return nil, nil - } - - select { - case key := <-ch: - if v, err := db.lpop(key, whereSeq); err != nil { - return nil, err - } else if v == nil { - db.lbkeys.wait(key, ch) - continue - } else { - return []interface{}{key, v}, nil - } - case <-time.After(d): - return nil, nil - } - } - - } -} - -func (db *DB) lSignalAsReady(key []byte, num int) { - if db.status == DBInTransaction { - //for transaction, only data can be pushed after tx commit and it is hard to signal - //so we don't handle it now - return - } - - db.lbkeys.signal(key, num) -} - -type lbKeyCh chan<- []byte - -type lBlockKeys struct { - sync.Mutex - - keys map[string]*list.List -} - -func newLBlockKeys() *lBlockKeys { - l := new(lBlockKeys) - - l.keys = make(map[string]*list.List) - return l -} - -func (l *lBlockKeys) signal(key []byte, num int) { - l.Lock() - defer l.Unlock() - - s := hack.String(key) - chs, ok := l.keys[s] - if !ok { - return - } - - var n *list.Element - - i := 0 - for e := chs.Front(); e != nil && i < num; e = n { - ch := e.Value.(lbKeyCh) - n = e.Next() - select { - case ch <- key: - chs.Remove(e) - i++ - default: - //waiter unwait - chs.Remove(e) - } - } - - if chs.Len() == 0 { - delete(l.keys, s) - } -} - -func (l *lBlockKeys) wait(key []byte, ch lbKeyCh) { - l.Lock() - defer l.Unlock() - - s := hack.String(key) - chs, ok := l.keys[s] - if !ok { - chs = list.New() - l.keys[s] = chs - } - - chs.PushBack(ch) -} - -func (l *lBlockKeys) unwait(key []byte, ch lbKeyCh) { - l.Lock() - defer l.Unlock() - - s := hack.String(key) - chs, ok := l.keys[s] - if !ok { - return - } else { - var n *list.Element - for e := chs.Front(); e != nil; e = n { - c := e.Value.(lbKeyCh) - n = e.Next() - if c == ch { - chs.Remove(e) - } - } - - if chs.Len() == 0 { - delete(l.keys, s) - } - } -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_set.go b/vendor/github.com/siddontang/ledisdb/ledis/t_set.go deleted file mode 100644 index d1647524c1bf..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/t_set.go +++ /dev/null @@ -1,605 +0,0 @@ -package ledis - -import ( - "encoding/binary" - "errors" - "github.com/siddontang/go/hack" - "github.com/siddontang/ledisdb/store" - "time" -) - -var errSetKey = errors.New("invalid set key") -var errSSizeKey = errors.New("invalid ssize key") - -const ( - setStartSep byte = ':' - setStopSep byte = setStartSep + 1 - UnionType byte = 51 - DiffType byte = 52 - InterType byte = 53 -) - -func checkSetKMSize(key []byte, member []byte) error { - if len(key) > MaxKeySize || len(key) == 0 { - return errKeySize - } else if len(member) > MaxSetMemberSize || len(member) == 0 { - return errSetMemberSize - } - return 
nil -} - -func (db *DB) sEncodeSizeKey(key []byte) []byte { - buf := make([]byte, len(key)+2) - - buf[0] = db.index - buf[1] = SSizeType - - copy(buf[2:], key) - return buf -} - -func (db *DB) sDecodeSizeKey(ek []byte) ([]byte, error) { - if len(ek) < 2 || ek[0] != db.index || ek[1] != SSizeType { - return nil, errSSizeKey - } - - return ek[2:], nil -} - -func (db *DB) sEncodeSetKey(key []byte, member []byte) []byte { - buf := make([]byte, len(key)+len(member)+1+1+2+1) - - pos := 0 - buf[pos] = db.index - pos++ - buf[pos] = SetType - pos++ - - binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) - pos += 2 - - copy(buf[pos:], key) - pos += len(key) - - buf[pos] = setStartSep - pos++ - copy(buf[pos:], member) - - return buf -} - -func (db *DB) sDecodeSetKey(ek []byte) ([]byte, []byte, error) { - if len(ek) < 5 || ek[0] != db.index || ek[1] != SetType { - return nil, nil, errSetKey - } - - pos := 2 - keyLen := int(binary.BigEndian.Uint16(ek[pos:])) - pos += 2 - - if keyLen+5 > len(ek) { - return nil, nil, errSetKey - } - - key := ek[pos : pos+keyLen] - pos += keyLen - - if ek[pos] != hashStartSep { - return nil, nil, errSetKey - } - - pos++ - member := ek[pos:] - return key, member, nil -} - -func (db *DB) sEncodeStartKey(key []byte) []byte { - return db.sEncodeSetKey(key, nil) -} - -func (db *DB) sEncodeStopKey(key []byte) []byte { - k := db.sEncodeSetKey(key, nil) - - k[len(k)-1] = setStopSep - - return k -} - -func (db *DB) sFlush() (drop int64, err error) { - - t := db.setBatch - t.Lock() - defer t.Unlock() - - return db.flushType(t, SetType) -} - -func (db *DB) sDelete(t *batch, key []byte) int64 { - sk := db.sEncodeSizeKey(key) - start := db.sEncodeStartKey(key) - stop := db.sEncodeStopKey(key) - - var num int64 = 0 - it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) - for ; it.Valid(); it.Next() { - t.Delete(it.RawKey()) - num++ - } - - it.Close() - - t.Delete(sk) - return num -} - -func (db *DB) sIncrSize(key []byte, delta int64) (int64, error) { - t := db.setBatch - sk := db.sEncodeSizeKey(key) - - var err error - var size int64 = 0 - if size, err = Int64(db.bucket.Get(sk)); err != nil { - return 0, err - } else { - size += delta - if size <= 0 { - size = 0 - t.Delete(sk) - db.rmExpire(t, SetType, key) - } else { - t.Put(sk, PutInt64(size)) - } - } - - return size, nil -} - -func (db *DB) sExpireAt(key []byte, when int64) (int64, error) { - t := db.setBatch - t.Lock() - defer t.Unlock() - - if scnt, err := db.SCard(key); err != nil || scnt == 0 { - return 0, err - } else { - db.expireAt(t, SetType, key, when) - if err := t.Commit(); err != nil { - return 0, err - } - - } - - return 1, nil -} - -func (db *DB) sSetItem(key []byte, member []byte) (int64, error) { - t := db.setBatch - ek := db.sEncodeSetKey(key, member) - - var n int64 = 1 - if v, _ := db.bucket.Get(ek); v != nil { - n = 0 - } else { - if _, err := db.sIncrSize(key, 1); err != nil { - return 0, err - } - } - - t.Put(ek, nil) - return n, nil -} - -func (db *DB) SAdd(key []byte, args ...[]byte) (int64, error) { - t := db.setBatch - t.Lock() - defer t.Unlock() - - var err error - var ek []byte - var num int64 = 0 - for i := 0; i < len(args); i++ { - if err := checkSetKMSize(key, args[i]); err != nil { - return 0, err - } - - ek = db.sEncodeSetKey(key, args[i]) - - if v, err := db.bucket.Get(ek); err != nil { - return 0, err - } else if v == nil { - num++ - } - - t.Put(ek, nil) - } - - if _, err = db.sIncrSize(key, num); err != nil { - return 0, err - } - - err = t.Commit() - return num, err - -} - 
-func (db *DB) SCard(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - sk := db.sEncodeSizeKey(key) - - return Int64(db.bucket.Get(sk)) -} - -func (db *DB) sDiffGeneric(keys ...[]byte) ([][]byte, error) { - destMap := make(map[string]bool) - - members, err := db.SMembers(keys[0]) - if err != nil { - return nil, err - } - - for _, m := range members { - destMap[hack.String(m)] = true - } - - for _, k := range keys[1:] { - members, err := db.SMembers(k) - if err != nil { - return nil, err - } - - for _, m := range members { - if _, ok := destMap[hack.String(m)]; !ok { - continue - } else if ok { - delete(destMap, hack.String(m)) - } - } - // O - A = O, O is zero set. - if len(destMap) == 0 { - return nil, nil - } - } - - slice := make([][]byte, len(destMap)) - idx := 0 - for k, v := range destMap { - if !v { - continue - } - slice[idx] = []byte(k) - idx++ - } - - return slice, nil -} - -func (db *DB) SDiff(keys ...[]byte) ([][]byte, error) { - v, err := db.sDiffGeneric(keys...) - return v, err -} - -func (db *DB) SDiffStore(dstKey []byte, keys ...[]byte) (int64, error) { - n, err := db.sStoreGeneric(dstKey, DiffType, keys...) - return n, err -} - -func (db *DB) sInterGeneric(keys ...[]byte) ([][]byte, error) { - destMap := make(map[string]bool) - - members, err := db.SMembers(keys[0]) - if err != nil { - return nil, err - } - - for _, m := range members { - destMap[hack.String(m)] = true - } - - for _, key := range keys[1:] { - if err := checkKeySize(key); err != nil { - return nil, err - } - - members, err := db.SMembers(key) - if err != nil { - return nil, err - } else if len(members) == 0 { - return nil, err - } - - tempMap := make(map[string]bool) - for _, member := range members { - if err := checkKeySize(member); err != nil { - return nil, err - } - if _, ok := destMap[hack.String(member)]; ok { - tempMap[hack.String(member)] = true //mark this item as selected - } - } - destMap = tempMap //reduce the size of the result set - if len(destMap) == 0 { - return nil, nil - } - } - - slice := make([][]byte, len(destMap)) - idx := 0 - for k, v := range destMap { - if !v { - continue - } - - slice[idx] = []byte(k) - idx++ - } - - return slice, nil - -} - -func (db *DB) SInter(keys ...[]byte) ([][]byte, error) { - v, err := db.sInterGeneric(keys...) - return v, err - -} - -func (db *DB) SInterStore(dstKey []byte, keys ...[]byte) (int64, error) { - n, err := db.sStoreGeneric(dstKey, InterType, keys...) 
- return n, err -} - -func (db *DB) SIsMember(key []byte, member []byte) (int64, error) { - ek := db.sEncodeSetKey(key, member) - - var n int64 = 1 - if v, err := db.bucket.Get(ek); err != nil { - return 0, err - } else if v == nil { - n = 0 - } - return n, nil -} - -func (db *DB) SMembers(key []byte) ([][]byte, error) { - if err := checkKeySize(key); err != nil { - return nil, err - } - - start := db.sEncodeStartKey(key) - stop := db.sEncodeStopKey(key) - - v := make([][]byte, 0, 16) - - it := db.bucket.RangeLimitIterator(start, stop, store.RangeROpen, 0, -1) - for ; it.Valid(); it.Next() { - _, m, err := db.sDecodeSetKey(it.Key()) - if err != nil { - return nil, err - } - - v = append(v, m) - } - - it.Close() - - return v, nil -} - -func (db *DB) SRem(key []byte, args ...[]byte) (int64, error) { - t := db.setBatch - t.Lock() - defer t.Unlock() - - var ek []byte - var v []byte - var err error - - it := db.bucket.NewIterator() - defer it.Close() - - var num int64 = 0 - for i := 0; i < len(args); i++ { - if err := checkSetKMSize(key, args[i]); err != nil { - return 0, err - } - - ek = db.sEncodeSetKey(key, args[i]) - - v = it.RawFind(ek) - if v == nil { - continue - } else { - num++ - t.Delete(ek) - } - } - - if _, err = db.sIncrSize(key, -num); err != nil { - return 0, err - } - - err = t.Commit() - return num, err - -} - -func (db *DB) sUnionGeneric(keys ...[]byte) ([][]byte, error) { - dstMap := make(map[string]bool) - - for _, key := range keys { - if err := checkKeySize(key); err != nil { - return nil, err - } - - members, err := db.SMembers(key) - if err != nil { - return nil, err - } - - for _, member := range members { - dstMap[hack.String(member)] = true - } - } - - slice := make([][]byte, len(dstMap)) - idx := 0 - for k, v := range dstMap { - if !v { - continue - } - slice[idx] = []byte(k) - idx++ - } - - return slice, nil -} - -func (db *DB) SUnion(keys ...[]byte) ([][]byte, error) { - v, err := db.sUnionGeneric(keys...) - return v, err -} - -func (db *DB) SUnionStore(dstKey []byte, keys ...[]byte) (int64, error) { - n, err := db.sStoreGeneric(dstKey, UnionType, keys...) - return n, err -} - -func (db *DB) sStoreGeneric(dstKey []byte, optType byte, keys ...[]byte) (int64, error) { - if err := checkKeySize(dstKey); err != nil { - return 0, err - } - - t := db.setBatch - t.Lock() - defer t.Unlock() - - db.sDelete(t, dstKey) - - var err error - var ek []byte - var v [][]byte - - switch optType { - case UnionType: - v, err = db.sUnionGeneric(keys...) - case DiffType: - v, err = db.sDiffGeneric(keys...) - case InterType: - v, err = db.sInterGeneric(keys...) 
- } - - if err != nil { - return 0, err - } - - for _, m := range v { - if err := checkSetKMSize(dstKey, m); err != nil { - return 0, err - } - - ek = db.sEncodeSetKey(dstKey, m) - - if _, err := db.bucket.Get(ek); err != nil { - return 0, err - } - - t.Put(ek, nil) - } - - var n = int64(len(v)) - sk := db.sEncodeSizeKey(dstKey) - t.Put(sk, PutInt64(n)) - - if err = t.Commit(); err != nil { - return 0, err - } - return n, nil -} - -func (db *DB) SClear(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - t := db.setBatch - t.Lock() - defer t.Unlock() - - num := db.sDelete(t, key) - db.rmExpire(t, SetType, key) - - err := t.Commit() - return num, err -} - -func (db *DB) SMclear(keys ...[]byte) (int64, error) { - t := db.setBatch - t.Lock() - defer t.Unlock() - - for _, key := range keys { - if err := checkKeySize(key); err != nil { - return 0, err - } - - db.sDelete(t, key) - db.rmExpire(t, SetType, key) - } - - err := t.Commit() - return int64(len(keys)), err -} - -func (db *DB) SExpire(key []byte, duration int64) (int64, error) { - if duration <= 0 { - return 0, errExpireValue - } - - return db.sExpireAt(key, time.Now().Unix()+duration) - -} - -func (db *DB) SExpireAt(key []byte, when int64) (int64, error) { - if when <= time.Now().Unix() { - return 0, errExpireValue - } - - return db.sExpireAt(key, when) - -} - -func (db *DB) STTL(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return -1, err - } - - return db.ttl(SetType, key) -} - -func (db *DB) SPersist(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - t := db.setBatch - t.Lock() - defer t.Unlock() - - n, err := db.rmExpire(t, SetType, key) - if err != nil { - return 0, err - } - err = t.Commit() - return n, err -} - -func (db *DB) SScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.scan(SSizeType, key, count, inclusive, match) -} - -func (db *DB) SRevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.revscan(SSizeType, key, count, inclusive, match) -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_ttl.go b/vendor/github.com/siddontang/ledisdb/ledis/t_ttl.go deleted file mode 100644 index f16a735d026a..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/t_ttl.go +++ /dev/null @@ -1,210 +0,0 @@ -package ledis - -import ( - "encoding/binary" - "errors" - "github.com/siddontang/ledisdb/store" - "sync" - "time" -) - -var ( - errExpMetaKey = errors.New("invalid expire meta key") - errExpTimeKey = errors.New("invalid expire time key") -) - -type onExpired func(*batch, []byte) int64 - -type ttlChecker struct { - sync.Mutex - db *DB - txs []*batch - cbs []onExpired - - //next check time - nc int64 -} - -var errExpType = errors.New("invalid expire type") - -func (db *DB) expEncodeTimeKey(dataType byte, key []byte, when int64) []byte { - buf := make([]byte, len(key)+11) - - buf[0] = db.index - buf[1] = ExpTimeType - pos := 2 - - binary.BigEndian.PutUint64(buf[pos:], uint64(when)) - pos += 8 - - buf[pos] = dataType - pos++ - - copy(buf[pos:], key) - - return buf -} - -func (db *DB) expEncodeMetaKey(dataType byte, key []byte) []byte { - buf := make([]byte, len(key)+3) - - buf[0] = db.index - buf[1] = ExpMetaType - buf[2] = dataType - pos := 3 - - copy(buf[pos:], key) - - return buf -} - -func (db *DB) expDecodeMetaKey(mk []byte) (byte, []byte, error) { - if len(mk) <= 3 || mk[0] != db.index || mk[1] != ExpMetaType { - return 0, nil, 
errExpMetaKey - } - - return mk[2], mk[3:], nil -} - -func (db *DB) expDecodeTimeKey(tk []byte) (byte, []byte, int64, error) { - if len(tk) < 11 || tk[0] != db.index || tk[1] != ExpTimeType { - return 0, nil, 0, errExpTimeKey - } - - return tk[10], tk[11:], int64(binary.BigEndian.Uint64(tk[2:])), nil -} - -func (db *DB) expire(t *batch, dataType byte, key []byte, duration int64) { - db.expireAt(t, dataType, key, time.Now().Unix()+duration) -} - -func (db *DB) expireAt(t *batch, dataType byte, key []byte, when int64) { - mk := db.expEncodeMetaKey(dataType, key) - tk := db.expEncodeTimeKey(dataType, key, when) - - t.Put(tk, mk) - t.Put(mk, PutInt64(when)) - - tc := db.l.tcs[db.index] - tc.setNextCheckTime(when, false) -} - -func (db *DB) ttl(dataType byte, key []byte) (t int64, err error) { - mk := db.expEncodeMetaKey(dataType, key) - - if t, err = Int64(db.bucket.Get(mk)); err != nil || t == 0 { - t = -1 - } else { - t -= time.Now().Unix() - if t <= 0 { - t = -1 - } - // if t == -1 : to remove ???? - } - - return t, err -} - -func (db *DB) rmExpire(t *batch, dataType byte, key []byte) (int64, error) { - mk := db.expEncodeMetaKey(dataType, key) - if v, err := db.bucket.Get(mk); err != nil { - return 0, err - } else if v == nil { - return 0, nil - } else if when, err2 := Int64(v, nil); err2 != nil { - return 0, err2 - } else { - tk := db.expEncodeTimeKey(dataType, key, when) - t.Delete(mk) - t.Delete(tk) - return 1, nil - } -} - -func newTTLChecker(db *DB) *ttlChecker { - c := new(ttlChecker) - c.db = db - c.txs = make([]*batch, maxDataType) - c.cbs = make([]onExpired, maxDataType) - c.nc = 0 - return c -} - -func (c *ttlChecker) register(dataType byte, t *batch, f onExpired) { - c.txs[dataType] = t - c.cbs[dataType] = f -} - -func (c *ttlChecker) setNextCheckTime(when int64, force bool) { - c.Lock() - if force { - c.nc = when - } else if !force && c.nc > when { - c.nc = when - } - c.Unlock() -} - -func (c *ttlChecker) check() { - now := time.Now().Unix() - - c.Lock() - nc := c.nc - c.Unlock() - - if now < nc { - return - } - - nc = now + 3600 - - db := c.db - dbGet := db.bucket.Get - - minKey := db.expEncodeTimeKey(NoneType, nil, 0) - maxKey := db.expEncodeTimeKey(maxDataType, nil, nc) - - it := db.bucket.RangeLimitIterator(minKey, maxKey, store.RangeROpen, 0, -1) - for ; it.Valid(); it.Next() { - tk := it.RawKey() - mk := it.RawValue() - - dt, k, nt, err := db.expDecodeTimeKey(tk) - if err != nil { - continue - } - - if nt > now { - //the next ttl check time is nt! 
- nc = nt - break - } - - t := c.txs[dt] - cb := c.cbs[dt] - if tk == nil || cb == nil { - continue - } - - t.Lock() - - if exp, err := Int64(dbGet(mk)); err == nil { - // check expire again - if exp <= now { - cb(t, k) - t.Delete(tk) - t.Delete(mk) - - t.Commit() - } - - } - - t.Unlock() - } - it.Close() - - c.setNextCheckTime(nc, true) - - return -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/t_zset.go b/vendor/github.com/siddontang/ledisdb/ledis/t_zset.go deleted file mode 100644 index 47191710555e..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/t_zset.go +++ /dev/null @@ -1,1025 +0,0 @@ -package ledis - -import ( - "bytes" - "encoding/binary" - "errors" - "github.com/siddontang/go/hack" - "github.com/siddontang/ledisdb/store" - "time" -) - -const ( - MinScore int64 = -1<<63 + 1 - MaxScore int64 = 1<<63 - 1 - InvalidScore int64 = -1 << 63 - - AggregateSum byte = 0 - AggregateMin byte = 1 - AggregateMax byte = 2 -) - -type ScorePair struct { - Score int64 - Member []byte -} - -var errZSizeKey = errors.New("invalid zsize key") -var errZSetKey = errors.New("invalid zset key") -var errZScoreKey = errors.New("invalid zscore key") -var errScoreOverflow = errors.New("zset score overflow") -var errInvalidAggregate = errors.New("invalid aggregate") -var errInvalidWeightNum = errors.New("invalid weight number") -var errInvalidSrcKeyNum = errors.New("invalid src key number") - -const ( - zsetNScoreSep byte = '<' - zsetPScoreSep byte = zsetNScoreSep + 1 - zsetStopScoreSep byte = zsetPScoreSep + 1 - - zsetStartMemSep byte = ':' - zsetStopMemSep byte = zsetStartMemSep + 1 -) - -func checkZSetKMSize(key []byte, member []byte) error { - if len(key) > MaxKeySize || len(key) == 0 { - return errKeySize - } else if len(member) > MaxZSetMemberSize || len(member) == 0 { - return errZSetMemberSize - } - return nil -} - -func (db *DB) zEncodeSizeKey(key []byte) []byte { - buf := make([]byte, len(key)+2) - buf[0] = db.index - buf[1] = ZSizeType - - copy(buf[2:], key) - return buf -} - -func (db *DB) zDecodeSizeKey(ek []byte) ([]byte, error) { - if len(ek) < 2 || ek[0] != db.index || ek[1] != ZSizeType { - return nil, errZSizeKey - } - - return ek[2:], nil -} - -func (db *DB) zEncodeSetKey(key []byte, member []byte) []byte { - buf := make([]byte, len(key)+len(member)+5) - - pos := 0 - buf[pos] = db.index - pos++ - - buf[pos] = ZSetType - pos++ - - binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) - pos += 2 - - copy(buf[pos:], key) - pos += len(key) - - buf[pos] = zsetStartMemSep - pos++ - - copy(buf[pos:], member) - - return buf -} - -func (db *DB) zDecodeSetKey(ek []byte) ([]byte, []byte, error) { - if len(ek) < 5 || ek[0] != db.index || ek[1] != ZSetType { - return nil, nil, errZSetKey - } - - keyLen := int(binary.BigEndian.Uint16(ek[2:])) - if keyLen+5 > len(ek) { - return nil, nil, errZSetKey - } - - key := ek[4 : 4+keyLen] - - if ek[4+keyLen] != zsetStartMemSep { - return nil, nil, errZSetKey - } - - member := ek[5+keyLen:] - return key, member, nil -} - -func (db *DB) zEncodeStartSetKey(key []byte) []byte { - k := db.zEncodeSetKey(key, nil) - return k -} - -func (db *DB) zEncodeStopSetKey(key []byte) []byte { - k := db.zEncodeSetKey(key, nil) - k[len(k)-1] = zsetStartMemSep + 1 - return k -} - -func (db *DB) zEncodeScoreKey(key []byte, member []byte, score int64) []byte { - buf := make([]byte, len(key)+len(member)+14) - - pos := 0 - buf[pos] = db.index - pos++ - - buf[pos] = ZScoreType - pos++ - - binary.BigEndian.PutUint16(buf[pos:], uint16(len(key))) - pos += 2 - - 
copy(buf[pos:], key) - pos += len(key) - - if score < 0 { - buf[pos] = zsetNScoreSep - } else { - buf[pos] = zsetPScoreSep - } - - pos++ - binary.BigEndian.PutUint64(buf[pos:], uint64(score)) - pos += 8 - - buf[pos] = zsetStartMemSep - pos++ - - copy(buf[pos:], member) - return buf -} - -func (db *DB) zEncodeStartScoreKey(key []byte, score int64) []byte { - return db.zEncodeScoreKey(key, nil, score) -} - -func (db *DB) zEncodeStopScoreKey(key []byte, score int64) []byte { - k := db.zEncodeScoreKey(key, nil, score) - k[len(k)-1] = zsetStopMemSep - return k -} - -func (db *DB) zDecodeScoreKey(ek []byte) (key []byte, member []byte, score int64, err error) { - if len(ek) < 14 || ek[0] != db.index || ek[1] != ZScoreType { - err = errZScoreKey - return - } - - keyLen := int(binary.BigEndian.Uint16(ek[2:])) - if keyLen+14 > len(ek) { - err = errZScoreKey - return - } - - key = ek[4 : 4+keyLen] - pos := 4 + keyLen - - if (ek[pos] != zsetNScoreSep) && (ek[pos] != zsetPScoreSep) { - err = errZScoreKey - return - } - pos++ - - score = int64(binary.BigEndian.Uint64(ek[pos:])) - pos += 8 - - if ek[pos] != zsetStartMemSep { - err = errZScoreKey - return - } - - pos++ - - member = ek[pos:] - return -} - -func (db *DB) zSetItem(t *batch, key []byte, score int64, member []byte) (int64, error) { - if score <= MinScore || score >= MaxScore { - return 0, errScoreOverflow - } - - var exists int64 = 0 - ek := db.zEncodeSetKey(key, member) - - if v, err := db.bucket.Get(ek); err != nil { - return 0, err - } else if v != nil { - exists = 1 - - if s, err := Int64(v, err); err != nil { - return 0, err - } else { - sk := db.zEncodeScoreKey(key, member, s) - t.Delete(sk) - } - } - - t.Put(ek, PutInt64(score)) - - sk := db.zEncodeScoreKey(key, member, score) - t.Put(sk, []byte{}) - - return exists, nil -} - -func (db *DB) zDelItem(t *batch, key []byte, member []byte, skipDelScore bool) (int64, error) { - ek := db.zEncodeSetKey(key, member) - if v, err := db.bucket.Get(ek); err != nil { - return 0, err - } else if v == nil { - //not exists - return 0, nil - } else { - //exists - if !skipDelScore { - //we must del score - if s, err := Int64(v, err); err != nil { - return 0, err - } else { - sk := db.zEncodeScoreKey(key, member, s) - t.Delete(sk) - } - } - } - - t.Delete(ek) - - return 1, nil -} - -func (db *DB) zDelete(t *batch, key []byte) int64 { - delMembCnt, _ := db.zRemRange(t, key, MinScore, MaxScore, 0, -1) - // todo : log err - return delMembCnt -} - -func (db *DB) zExpireAt(key []byte, when int64) (int64, error) { - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - if zcnt, err := db.ZCard(key); err != nil || zcnt == 0 { - return 0, err - } else { - db.expireAt(t, ZSetType, key, when) - if err := t.Commit(); err != nil { - return 0, err - } - } - return 1, nil -} - -func (db *DB) ZAdd(key []byte, args ...ScorePair) (int64, error) { - if len(args) == 0 { - return 0, nil - } - - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - var num int64 = 0 - for i := 0; i < len(args); i++ { - score := args[i].Score - member := args[i].Member - - if err := checkZSetKMSize(key, member); err != nil { - return 0, err - } - - if n, err := db.zSetItem(t, key, score, member); err != nil { - return 0, err - } else if n == 0 { - //add new - num++ - } - } - - if _, err := db.zIncrSize(t, key, num); err != nil { - return 0, err - } - - err := t.Commit() - return num, err -} - -func (db *DB) zIncrSize(t *batch, key []byte, delta int64) (int64, error) { - sk := db.zEncodeSizeKey(key) - - size, err := Int64(db.bucket.Get(sk)) - if 
err != nil { - return 0, err - } else { - size += delta - if size <= 0 { - size = 0 - t.Delete(sk) - db.rmExpire(t, ZSetType, key) - } else { - t.Put(sk, PutInt64(size)) - } - } - - return size, nil -} - -func (db *DB) ZCard(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - sk := db.zEncodeSizeKey(key) - return Int64(db.bucket.Get(sk)) -} - -func (db *DB) ZScore(key []byte, member []byte) (int64, error) { - if err := checkZSetKMSize(key, member); err != nil { - return InvalidScore, err - } - - var score int64 = InvalidScore - - k := db.zEncodeSetKey(key, member) - if v, err := db.bucket.Get(k); err != nil { - return InvalidScore, err - } else if v == nil { - return InvalidScore, ErrScoreMiss - } else { - if score, err = Int64(v, nil); err != nil { - return InvalidScore, err - } - } - - return score, nil -} - -func (db *DB) ZRem(key []byte, members ...[]byte) (int64, error) { - if len(members) == 0 { - return 0, nil - } - - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - var num int64 = 0 - for i := 0; i < len(members); i++ { - if err := checkZSetKMSize(key, members[i]); err != nil { - return 0, err - } - - if n, err := db.zDelItem(t, key, members[i], false); err != nil { - return 0, err - } else if n == 1 { - num++ - } - } - - if _, err := db.zIncrSize(t, key, -num); err != nil { - return 0, err - } - - err := t.Commit() - return num, err -} - -func (db *DB) ZIncrBy(key []byte, delta int64, member []byte) (int64, error) { - if err := checkZSetKMSize(key, member); err != nil { - return InvalidScore, err - } - - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - ek := db.zEncodeSetKey(key, member) - - var oldScore int64 = 0 - v, err := db.bucket.Get(ek) - if err != nil { - return InvalidScore, err - } else if v == nil { - db.zIncrSize(t, key, 1) - } else { - if oldScore, err = Int64(v, err); err != nil { - return InvalidScore, err - } - } - - newScore := oldScore + delta - if newScore >= MaxScore || newScore <= MinScore { - return InvalidScore, errScoreOverflow - } - - sk := db.zEncodeScoreKey(key, member, newScore) - t.Put(sk, []byte{}) - t.Put(ek, PutInt64(newScore)) - - if v != nil { - // so as to update score, we must delete the old one - oldSk := db.zEncodeScoreKey(key, member, oldScore) - t.Delete(oldSk) - } - - err = t.Commit() - return newScore, err -} - -func (db *DB) ZCount(key []byte, min int64, max int64) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - minKey := db.zEncodeStartScoreKey(key, min) - maxKey := db.zEncodeStopScoreKey(key, max) - - rangeType := store.RangeROpen - - it := db.bucket.RangeLimitIterator(minKey, maxKey, rangeType, 0, -1) - var n int64 = 0 - for ; it.Valid(); it.Next() { - n++ - } - it.Close() - - return n, nil -} - -func (db *DB) zrank(key []byte, member []byte, reverse bool) (int64, error) { - if err := checkZSetKMSize(key, member); err != nil { - return 0, err - } - - k := db.zEncodeSetKey(key, member) - - it := db.bucket.NewIterator() - defer it.Close() - - if v := it.Find(k); v == nil { - return -1, nil - } else { - if s, err := Int64(v, nil); err != nil { - return 0, err - } else { - var rit *store.RangeLimitIterator - - sk := db.zEncodeScoreKey(key, member, s) - - if !reverse { - minKey := db.zEncodeStartScoreKey(key, MinScore) - - rit = store.NewRangeIterator(it, &store.Range{minKey, sk, store.RangeClose}) - } else { - maxKey := db.zEncodeStopScoreKey(key, MaxScore) - rit = store.NewRevRangeIterator(it, &store.Range{sk, maxKey, store.RangeClose}) - } - - var 
lastKey []byte = nil - var n int64 = 0 - - for ; rit.Valid(); rit.Next() { - n++ - - lastKey = rit.BufKey(lastKey) - } - - if _, m, _, err := db.zDecodeScoreKey(lastKey); err == nil && bytes.Equal(m, member) { - n-- - return n, nil - } - } - } - - return -1, nil -} - -func (db *DB) zIterator(key []byte, min int64, max int64, offset int, count int, reverse bool) *store.RangeLimitIterator { - minKey := db.zEncodeStartScoreKey(key, min) - maxKey := db.zEncodeStopScoreKey(key, max) - - if !reverse { - return db.bucket.RangeLimitIterator(minKey, maxKey, store.RangeClose, offset, count) - } else { - return db.bucket.RevRangeLimitIterator(minKey, maxKey, store.RangeClose, offset, count) - } -} - -func (db *DB) zRemRange(t *batch, key []byte, min int64, max int64, offset int, count int) (int64, error) { - if len(key) > MaxKeySize { - return 0, errKeySize - } - - it := db.zIterator(key, min, max, offset, count, false) - var num int64 = 0 - for ; it.Valid(); it.Next() { - sk := it.RawKey() - _, m, _, err := db.zDecodeScoreKey(sk) - if err != nil { - continue - } - - if n, err := db.zDelItem(t, key, m, true); err != nil { - return 0, err - } else if n == 1 { - num++ - } - - t.Delete(sk) - } - it.Close() - - if _, err := db.zIncrSize(t, key, -num); err != nil { - return 0, err - } - - return num, nil -} - -func (db *DB) zRange(key []byte, min int64, max int64, offset int, count int, reverse bool) ([]ScorePair, error) { - if len(key) > MaxKeySize { - return nil, errKeySize - } - - if offset < 0 { - return []ScorePair{}, nil - } - - nv := 64 - if count > 0 { - nv = count - } - - v := make([]ScorePair, 0, nv) - - var it *store.RangeLimitIterator - - //if reverse and offset is 0, count < 0, we may use forward iterator then reverse - //because store iterator prev is slower than next - if !reverse || (offset == 0 && count < 0) { - it = db.zIterator(key, min, max, offset, count, false) - } else { - it = db.zIterator(key, min, max, offset, count, true) - } - - for ; it.Valid(); it.Next() { - _, m, s, err := db.zDecodeScoreKey(it.Key()) - //may be we will check key equal? 
- if err != nil { - continue - } - - v = append(v, ScorePair{Member: m, Score: s}) - } - it.Close() - - if reverse && (offset == 0 && count < 0) { - for i, j := 0, len(v)-1; i < j; i, j = i+1, j-1 { - v[i], v[j] = v[j], v[i] - } - } - - return v, nil -} - -func (db *DB) zParseLimit(key []byte, start int, stop int) (offset int, count int, err error) { - if start < 0 || stop < 0 { - //refer redis implementation - var size int64 - size, err = db.ZCard(key) - if err != nil { - return - } - - llen := int(size) - - if start < 0 { - start = llen + start - } - if stop < 0 { - stop = llen + stop - } - - if start < 0 { - start = 0 - } - - if start >= llen { - offset = -1 - return - } - } - - if start > stop { - offset = -1 - return - } - - offset = start - count = (stop - start) + 1 - return -} - -func (db *DB) ZClear(key []byte) (int64, error) { - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - rmCnt, err := db.zRemRange(t, key, MinScore, MaxScore, 0, -1) - if err == nil { - err = t.Commit() - } - - return rmCnt, err -} - -func (db *DB) ZMclear(keys ...[]byte) (int64, error) { - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - for _, key := range keys { - if _, err := db.zRemRange(t, key, MinScore, MaxScore, 0, -1); err != nil { - return 0, err - } - } - - err := t.Commit() - - return int64(len(keys)), err -} - -func (db *DB) ZRange(key []byte, start int, stop int) ([]ScorePair, error) { - return db.ZRangeGeneric(key, start, stop, false) -} - -//min and max must be inclusive -//if no limit, set offset = 0 and count = -1 -func (db *DB) ZRangeByScore(key []byte, min int64, max int64, - offset int, count int) ([]ScorePair, error) { - return db.ZRangeByScoreGeneric(key, min, max, offset, count, false) -} - -func (db *DB) ZRank(key []byte, member []byte) (int64, error) { - return db.zrank(key, member, false) -} - -func (db *DB) ZRemRangeByRank(key []byte, start int, stop int) (int64, error) { - offset, count, err := db.zParseLimit(key, start, stop) - if err != nil { - return 0, err - } - - var rmCnt int64 - - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - rmCnt, err = db.zRemRange(t, key, MinScore, MaxScore, offset, count) - if err == nil { - err = t.Commit() - } - - return rmCnt, err -} - -//min and max must be inclusive -func (db *DB) ZRemRangeByScore(key []byte, min int64, max int64) (int64, error) { - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - rmCnt, err := db.zRemRange(t, key, min, max, 0, -1) - if err == nil { - err = t.Commit() - } - - return rmCnt, err -} - -func (db *DB) ZRevRange(key []byte, start int, stop int) ([]ScorePair, error) { - return db.ZRangeGeneric(key, start, stop, true) -} - -func (db *DB) ZRevRank(key []byte, member []byte) (int64, error) { - return db.zrank(key, member, true) -} - -//min and max must be inclusive -//if no limit, set offset = 0 and count = -1 -func (db *DB) ZRevRangeByScore(key []byte, min int64, max int64, offset int, count int) ([]ScorePair, error) { - return db.ZRangeByScoreGeneric(key, min, max, offset, count, true) -} - -func (db *DB) ZRangeGeneric(key []byte, start int, stop int, reverse bool) ([]ScorePair, error) { - offset, count, err := db.zParseLimit(key, start, stop) - if err != nil { - return nil, err - } - - return db.zRange(key, MinScore, MaxScore, offset, count, reverse) -} - -//min and max must be inclusive -//if no limit, set offset = 0 and count = -1 -func (db *DB) ZRangeByScoreGeneric(key []byte, min int64, max int64, - offset int, count int, reverse bool) ([]ScorePair, error) { - - return db.zRange(key, min, max, offset, 
count, reverse) -} - -func (db *DB) zFlush() (drop int64, err error) { - t := db.zsetBatch - t.Lock() - defer t.Unlock() - return db.flushType(t, ZSetType) -} - -func (db *DB) ZExpire(key []byte, duration int64) (int64, error) { - if duration <= 0 { - return 0, errExpireValue - } - - return db.zExpireAt(key, time.Now().Unix()+duration) -} - -func (db *DB) ZExpireAt(key []byte, when int64) (int64, error) { - if when <= time.Now().Unix() { - return 0, errExpireValue - } - - return db.zExpireAt(key, when) -} - -func (db *DB) ZTTL(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return -1, err - } - - return db.ttl(ZSetType, key) -} - -func (db *DB) ZPersist(key []byte) (int64, error) { - if err := checkKeySize(key); err != nil { - return 0, err - } - - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - n, err := db.rmExpire(t, ZSetType, key) - if err != nil { - return 0, err - } - - err = t.Commit() - return n, err -} - -func getAggregateFunc(aggregate byte) func(int64, int64) int64 { - switch aggregate { - case AggregateSum: - return func(a int64, b int64) int64 { - return a + b - } - case AggregateMax: - return func(a int64, b int64) int64 { - if a > b { - return a - } - return b - } - case AggregateMin: - return func(a int64, b int64) int64 { - if a > b { - return b - } - return a - } - } - return nil -} - -func (db *DB) ZUnionStore(destKey []byte, srcKeys [][]byte, weights []int64, aggregate byte) (int64, error) { - - var destMap = map[string]int64{} - aggregateFunc := getAggregateFunc(aggregate) - if aggregateFunc == nil { - return 0, errInvalidAggregate - } - if len(srcKeys) < 1 { - return 0, errInvalidSrcKeyNum - } - if weights != nil { - if len(srcKeys) != len(weights) { - return 0, errInvalidWeightNum - } - } else { - weights = make([]int64, len(srcKeys)) - for i := 0; i < len(weights); i++ { - weights[i] = 1 - } - } - - for i, key := range srcKeys { - scorePairs, err := db.ZRange(key, 0, -1) - if err != nil { - return 0, err - } - for _, pair := range scorePairs { - if score, ok := destMap[hack.String(pair.Member)]; !ok { - destMap[hack.String(pair.Member)] = pair.Score * weights[i] - } else { - destMap[hack.String(pair.Member)] = aggregateFunc(score, pair.Score*weights[i]) - } - } - } - - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - db.zDelete(t, destKey) - - for member, score := range destMap { - if err := checkZSetKMSize(destKey, []byte(member)); err != nil { - return 0, err - } - - if _, err := db.zSetItem(t, destKey, score, []byte(member)); err != nil { - return 0, err - } - } - - var n = int64(len(destMap)) - sk := db.zEncodeSizeKey(destKey) - t.Put(sk, PutInt64(n)) - - if err := t.Commit(); err != nil { - return 0, err - } - return n, nil -} - -func (db *DB) ZInterStore(destKey []byte, srcKeys [][]byte, weights []int64, aggregate byte) (int64, error) { - - aggregateFunc := getAggregateFunc(aggregate) - if aggregateFunc == nil { - return 0, errInvalidAggregate - } - if len(srcKeys) < 1 { - return 0, errInvalidSrcKeyNum - } - if weights != nil { - if len(srcKeys) != len(weights) { - return 0, errInvalidWeightNum - } - } else { - weights = make([]int64, len(srcKeys)) - for i := 0; i < len(weights); i++ { - weights[i] = 1 - } - } - - var destMap = map[string]int64{} - scorePairs, err := db.ZRange(srcKeys[0], 0, -1) - if err != nil { - return 0, err - } - for _, pair := range scorePairs { - destMap[hack.String(pair.Member)] = pair.Score * weights[0] - } - - for i, key := range srcKeys[1:] { - scorePairs, err := db.ZRange(key, 0, -1) - if err != 
nil { - return 0, err - } - tmpMap := map[string]int64{} - for _, pair := range scorePairs { - if score, ok := destMap[hack.String(pair.Member)]; ok { - tmpMap[hack.String(pair.Member)] = aggregateFunc(score, pair.Score*weights[i+1]) - } - } - destMap = tmpMap - } - - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - db.zDelete(t, destKey) - - for member, score := range destMap { - if err := checkZSetKMSize(destKey, []byte(member)); err != nil { - return 0, err - } - if _, err := db.zSetItem(t, destKey, score, []byte(member)); err != nil { - return 0, err - } - } - - var n int64 = int64(len(destMap)) - sk := db.zEncodeSizeKey(destKey) - t.Put(sk, PutInt64(n)) - - if err := t.Commit(); err != nil { - return 0, err - } - return n, nil -} - -func (db *DB) ZScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.scan(ZSizeType, key, count, inclusive, match) -} - -func (db *DB) ZRevScan(key []byte, count int, inclusive bool, match string) ([][]byte, error) { - return db.revscan(ZSizeType, key, count, inclusive, match) -} - -func (db *DB) ZRangeByLex(key []byte, min []byte, max []byte, rangeType uint8, offset int, count int) ([][]byte, error) { - if min == nil { - min = db.zEncodeStartSetKey(key) - } else { - min = db.zEncodeSetKey(key, min) - } - if max == nil { - max = db.zEncodeStopSetKey(key) - } else { - max = db.zEncodeSetKey(key, max) - } - - it := db.bucket.RangeLimitIterator(min, max, rangeType, offset, count) - defer it.Close() - - ay := make([][]byte, 0, 16) - for ; it.Valid(); it.Next() { - if _, m, err := db.zDecodeSetKey(it.Key()); err == nil { - ay = append(ay, m) - } - } - - return ay, nil -} - -func (db *DB) ZRemRangeByLex(key []byte, min []byte, max []byte, rangeType uint8) (int64, error) { - if min == nil { - min = db.zEncodeStartSetKey(key) - } else { - min = db.zEncodeSetKey(key, min) - } - if max == nil { - max = db.zEncodeStopSetKey(key) - } else { - max = db.zEncodeSetKey(key, max) - } - - t := db.zsetBatch - t.Lock() - defer t.Unlock() - - it := db.bucket.RangeIterator(min, max, rangeType) - defer it.Close() - - var n int64 = 0 - for ; it.Valid(); it.Next() { - t.Delete(it.RawKey()) - n++ - } - - if err := t.Commit(); err != nil { - return 0, err - } - - return n, nil -} - -func (db *DB) ZLexCount(key []byte, min []byte, max []byte, rangeType uint8) (int64, error) { - if min == nil { - min = db.zEncodeStartSetKey(key) - } else { - min = db.zEncodeSetKey(key, min) - } - if max == nil { - max = db.zEncodeStopSetKey(key) - } else { - max = db.zEncodeSetKey(key, max) - } - - it := db.bucket.RangeIterator(min, max, rangeType) - defer it.Close() - - var n int64 = 0 - for ; it.Valid(); it.Next() { - n++ - } - - return n, nil -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/tx.go b/vendor/github.com/siddontang/ledisdb/ledis/tx.go deleted file mode 100644 index 03d6c5b7ba34..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/tx.go +++ /dev/null @@ -1,112 +0,0 @@ -package ledis - -import ( - "errors" - "fmt" - "github.com/siddontang/ledisdb/store" -) - -var ( - ErrNestTx = errors.New("nest transaction not supported") - ErrTxDone = errors.New("Transaction has already been committed or rolled back") -) - -type Tx struct { - *DB - - tx *store.Tx - - data *store.BatchData -} - -func (db *DB) IsTransaction() bool { - return db.status == DBInTransaction -} - -// Begin a transaction, it will block all other write operations before calling Commit or Rollback. -// You must be very careful to prevent long-time transaction. 
-func (db *DB) Begin() (*Tx, error) { - if db.IsTransaction() { - return nil, ErrNestTx - } - - tx := new(Tx) - - tx.data = &store.BatchData{} - - tx.DB = new(DB) - tx.DB.l = db.l - - tx.l.wLock.Lock() - - tx.DB.sdb = db.sdb - - var err error - tx.tx, err = db.sdb.Begin() - if err != nil { - tx.l.wLock.Unlock() - return nil, err - } - - tx.DB.bucket = tx.tx - - tx.DB.status = DBInTransaction - - tx.DB.index = db.index - - tx.DB.kvBatch = tx.newBatch() - tx.DB.listBatch = tx.newBatch() - tx.DB.hashBatch = tx.newBatch() - tx.DB.zsetBatch = tx.newBatch() - tx.DB.binBatch = tx.newBatch() - tx.DB.setBatch = tx.newBatch() - - tx.DB.lbkeys = db.lbkeys - - return tx, nil -} - -func (tx *Tx) Commit() error { - if tx.tx == nil { - return ErrTxDone - } - - err := tx.l.handleCommit(tx.data, tx.tx) - tx.data.Reset() - - tx.tx = nil - - tx.l.wLock.Unlock() - - tx.DB.bucket = nil - - return err -} - -func (tx *Tx) Rollback() error { - if tx.tx == nil { - return ErrTxDone - } - - err := tx.tx.Rollback() - tx.data.Reset() - tx.tx = nil - - tx.l.wLock.Unlock() - tx.DB.bucket = nil - - return err -} - -func (tx *Tx) newBatch() *batch { - return tx.l.newBatch(tx.tx.NewWriteBatch(), &txBatchLocker{}, tx) -} - -func (tx *Tx) Select(index int) error { - if index < 0 || index >= int(MaxDBNumber) { - return fmt.Errorf("invalid db index %d", index) - } - - tx.DB.index = uint8(index) - return nil -} diff --git a/vendor/github.com/siddontang/ledisdb/ledis/util.go b/vendor/github.com/siddontang/ledisdb/ledis/util.go deleted file mode 100644 index a0abdd02228f..000000000000 --- a/vendor/github.com/siddontang/ledisdb/ledis/util.go +++ /dev/null @@ -1,94 +0,0 @@ -package ledis - -import ( - "encoding/binary" - "errors" - "github.com/siddontang/go/hack" - "strconv" -) - -var errIntNumber = errors.New("invalid integer") - -/* - Below I forget why I use little endian to store int. - Maybe I was foolish at that time. 
-*/ - -func Int64(v []byte, err error) (int64, error) { - if err != nil { - return 0, err - } else if v == nil || len(v) == 0 { - return 0, nil - } else if len(v) != 8 { - return 0, errIntNumber - } - - return int64(binary.LittleEndian.Uint64(v)), nil -} - -func Uint64(v []byte, err error) (uint64, error) { - if err != nil { - return 0, err - } else if v == nil || len(v) == 0 { - return 0, nil - } else if len(v) != 8 { - return 0, errIntNumber - } - - return binary.LittleEndian.Uint64(v), nil -} - -func PutInt64(v int64) []byte { - b := make([]byte, 8) - binary.LittleEndian.PutUint64(b, uint64(v)) - return b -} - -func StrInt64(v []byte, err error) (int64, error) { - if err != nil { - return 0, err - } else if v == nil { - return 0, nil - } else { - return strconv.ParseInt(hack.String(v), 10, 64) - } -} - -func StrUint64(v []byte, err error) (uint64, error) { - if err != nil { - return 0, err - } else if v == nil { - return 0, nil - } else { - return strconv.ParseUint(hack.String(v), 10, 64) - } -} - -func StrInt32(v []byte, err error) (int32, error) { - if err != nil { - return 0, err - } else if v == nil { - return 0, nil - } else { - res, err := strconv.ParseInt(hack.String(v), 10, 32) - return int32(res), err - } -} - -func StrInt8(v []byte, err error) (int8, error) { - if err != nil { - return 0, err - } else if v == nil { - return 0, nil - } else { - res, err := strconv.ParseInt(hack.String(v), 10, 8) - return int8(res), err - } -} - -func AsyncNotify(ch chan struct{}) { - select { - case ch <- struct{}{}: - default: - } -} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/file_io.go b/vendor/github.com/siddontang/ledisdb/rpl/file_io.go deleted file mode 100644 index 08e9d2eb3e19..000000000000 --- a/vendor/github.com/siddontang/ledisdb/rpl/file_io.go +++ /dev/null @@ -1,362 +0,0 @@ -package rpl - -import ( - "fmt" - "github.com/edsrzf/mmap-go" - "github.com/siddontang/go/log" - "io" - "os" -) - -//like leveldb or rocksdb file interface, haha! 
- -type writeFile interface { - Sync() error - Write(b []byte) (n int, err error) - Close() error - ReadAt(buf []byte, offset int64) (int, error) - Truncate(size int64) error - SetOffset(o int64) - Name() string - Size() int - Offset() int64 -} - -type readFile interface { - ReadAt(buf []byte, offset int64) (int, error) - Close() error - Size() int - Name() string -} - -type rawWriteFile struct { - writeFile - f *os.File - offset int64 - name string -} - -func newRawWriteFile(name string, size int64) (writeFile, error) { - m := new(rawWriteFile) - var err error - - m.name = name - - m.f, err = os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0644) - if err != nil { - return nil, err - } - - return m, nil -} - -func (m *rawWriteFile) Close() error { - if err := m.f.Truncate(m.offset); err != nil { - return fmt.Errorf("close truncate %s error %s", m.name, err.Error()) - } - - if err := m.f.Close(); err != nil { - return fmt.Errorf("close %s error %s", m.name, err.Error()) - } - - return nil -} - -func (m *rawWriteFile) Sync() error { - return m.f.Sync() -} - -func (m *rawWriteFile) Write(b []byte) (n int, err error) { - n, err = m.f.WriteAt(b, m.offset) - if err != nil { - return - } else if n != len(b) { - err = io.ErrShortWrite - return - } - - m.offset += int64(n) - return -} - -func (m *rawWriteFile) ReadAt(buf []byte, offset int64) (int, error) { - return m.f.ReadAt(buf, offset) -} - -func (m *rawWriteFile) Truncate(size int64) error { - var err error - if err = m.f.Truncate(size); err != nil { - return err - } - - if m.offset > size { - m.offset = size - } - return nil -} - -func (m *rawWriteFile) SetOffset(o int64) { - m.offset = o -} - -func (m *rawWriteFile) Offset() int64 { - return m.offset -} - -func (m *rawWriteFile) Name() string { - return m.name -} - -func (m *rawWriteFile) Size() int { - st, _ := m.f.Stat() - return int(st.Size()) -} - -type rawReadFile struct { - readFile - - f *os.File - name string -} - -func newRawReadFile(name string) (readFile, error) { - m := new(rawReadFile) - - var err error - m.f, err = os.Open(name) - m.name = name - - if err != nil { - return nil, err - } - - return m, err -} - -func (m *rawReadFile) Close() error { - return m.f.Close() -} - -func (m *rawReadFile) Size() int { - st, _ := m.f.Stat() - return int(st.Size()) -} - -func (m *rawReadFile) ReadAt(b []byte, offset int64) (int, error) { - return m.f.ReadAt(b, offset) -} - -func (m *rawReadFile) Name() string { - return m.name -} - -///////////////////////////////////////////////// - -type mmapWriteFile struct { - writeFile - - f *os.File - m mmap.MMap - name string - size int64 - offset int64 -} - -func newMmapWriteFile(name string, size int64) (writeFile, error) { - m := new(mmapWriteFile) - - m.name = name - - var err error - - m.f, err = os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0644) - if err != nil { - return nil, err - } - - if size == 0 { - st, _ := m.f.Stat() - size = st.Size() - } - - if err = m.f.Truncate(size); err != nil { - return nil, err - } - - if m.m, err = mmap.Map(m.f, mmap.RDWR, 0); err != nil { - return nil, err - } - - m.size = size - m.offset = 0 - return m, nil -} - -func (m *mmapWriteFile) Size() int { - return int(m.size) -} - -func (m *mmapWriteFile) Sync() error { - return m.m.Flush() -} - -func (m *mmapWriteFile) Close() error { - if err := m.m.Unmap(); err != nil { - return fmt.Errorf("unmap %s error %s", m.name, err.Error()) - } - - if err := m.f.Truncate(m.offset); err != nil { - return fmt.Errorf("close truncate %s error %s", m.name, err.Error()) - } - - if err := 
m.f.Close(); err != nil { - return fmt.Errorf("close %s error %s", m.name, err.Error()) - } - - return nil -} - -func (m *mmapWriteFile) Write(b []byte) (n int, err error) { - extra := int64(len(b)) - (m.size - m.offset) - if extra > 0 { - newSize := m.size + extra + m.size/10 - if err = m.Truncate(newSize); err != nil { - return - } - m.size = newSize - } - - n = copy(m.m[m.offset:], b) - if n != len(b) { - return 0, io.ErrShortWrite - } - - m.offset += int64(len(b)) - return len(b), nil -} - -func (m *mmapWriteFile) ReadAt(buf []byte, offset int64) (int, error) { - if offset > m.offset { - return 0, fmt.Errorf("invalid offset %d", offset) - } - - n := copy(buf, m.m[offset:m.offset]) - if n != len(buf) { - return n, io.ErrUnexpectedEOF - } - - return n, nil -} - -func (m *mmapWriteFile) Truncate(size int64) error { - var err error - if err = m.m.Unmap(); err != nil { - return err - } - - if err = m.f.Truncate(size); err != nil { - return err - } - - if m.m, err = mmap.Map(m.f, mmap.RDWR, 0); err != nil { - return err - } - - m.size = size - if m.offset > m.size { - m.offset = m.size - } - return nil -} - -func (m *mmapWriteFile) SetOffset(o int64) { - m.offset = o -} - -func (m *mmapWriteFile) Offset() int64 { - return m.offset -} - -func (m *mmapWriteFile) Name() string { - return m.name -} - -type mmapReadFile struct { - readFile - - f *os.File - m mmap.MMap - name string -} - -func newMmapReadFile(name string) (readFile, error) { - m := new(mmapReadFile) - - m.name = name - - var err error - m.f, err = os.Open(name) - if err != nil { - return nil, err - } - - m.m, err = mmap.Map(m.f, mmap.RDONLY, 0) - return m, err -} - -func (m *mmapReadFile) ReadAt(buf []byte, offset int64) (int, error) { - if int64(offset) > int64(len(m.m)) { - return 0, fmt.Errorf("invalid offset %d", offset) - } - - n := copy(buf, m.m[offset:]) - if n != len(buf) { - return n, io.ErrUnexpectedEOF - } - - return n, nil -} - -func (m *mmapReadFile) Close() error { - if m.m != nil { - if err := m.m.Unmap(); err != nil { - log.Error("unmap %s error %s", m.name, err.Error()) - } - m.m = nil - } - - if m.f != nil { - if err := m.f.Close(); err != nil { - log.Error("close %s error %s", m.name, err.Error()) - } - m.f = nil - } - - return nil -} - -func (m *mmapReadFile) Size() int { - return len(m.m) -} - -func (m *mmapReadFile) Name() string { - return m.name -} - -///////////////////////////////////// - -func newWriteFile(useMmap bool, name string, size int64) (writeFile, error) { - if useMmap { - return newMmapWriteFile(name, size) - } else { - return newRawWriteFile(name, size) - } -} - -func newReadFile(useMmap bool, name string) (readFile, error) { - if useMmap { - return newMmapReadFile(name) - } else { - return newRawReadFile(name) - } -} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/file_store.go b/vendor/github.com/siddontang/ledisdb/rpl/file_store.go deleted file mode 100644 index 161ab8d923d8..000000000000 --- a/vendor/github.com/siddontang/ledisdb/rpl/file_store.go +++ /dev/null @@ -1,414 +0,0 @@ -package rpl - -import ( - "fmt" - "github.com/siddontang/go/log" - "github.com/siddontang/go/num" - "github.com/siddontang/ledisdb/config" - "io/ioutil" - "os" - "sort" - "sync" - "time" -) - -const ( - defaultMaxLogFileSize = int64(256 * 1024 * 1024) - - maxLogFileSize = int64(1024 * 1024 * 1024) - - defaultLogNumInFile = int64(1024 * 1024) -) - -/* - File Store: - 00000001.data - 00000001.meta - 00000002.data - 00000002.meta - - data: log1 data | log2 data | magic data - - if data has no magic data, it 
means that we don't close replication gracefully. - so we must repair the log data - log data: id (bigendian uint64), create time (bigendian uint32), compression (byte), data len(bigendian uint32), data - split data = log0 data + [padding 0] -> file % pagesize() == 0 - - meta: log1 offset | log2 offset - log offset: bigendian uint32 | bigendian uint32 - - //sha1 of github.com/siddontang/ledisdb 20 bytes - magic data = "\x1c\x1d\xb8\x88\xff\x9e\x45\x55\x40\xf0\x4c\xda\xe0\xce\x47\xde\x65\x48\x71\x17" - - we must guarantee that the log id is monotonic increment strictly. - if log1's id is 1, log2 must be 2 -*/ - -type FileStore struct { - LogStore - - cfg *config.Config - - base string - - rm sync.RWMutex - wm sync.Mutex - - rs tableReaders - w *tableWriter - - quit chan struct{} -} - -func NewFileStore(base string, cfg *config.Config) (*FileStore, error) { - s := new(FileStore) - - s.quit = make(chan struct{}) - - var err error - - if err = os.MkdirAll(base, 0755); err != nil { - return nil, err - } - - s.base = base - - if cfg.Replication.MaxLogFileSize == 0 { - cfg.Replication.MaxLogFileSize = defaultMaxLogFileSize - } - - cfg.Replication.MaxLogFileSize = num.MinInt64(cfg.Replication.MaxLogFileSize, maxLogFileSize) - - s.cfg = cfg - - if err = s.load(); err != nil { - return nil, err - } - - index := int64(1) - if len(s.rs) != 0 { - index = s.rs[len(s.rs)-1].index + 1 - } - - s.w = newTableWriter(s.base, index, cfg.Replication.MaxLogFileSize, cfg.Replication.UseMmap) - s.w.SetSyncType(cfg.Replication.SyncLog) - - go s.checkTableReaders() - - return s, nil -} - -func (s *FileStore) GetLog(id uint64, l *Log) error { - //first search in table writer - if err := s.w.GetLog(id, l); err == nil { - return nil - } else if err != ErrLogNotFound { - return err - } - - s.rm.RLock() - t := s.rs.Search(id) - - if t == nil { - s.rm.RUnlock() - - return ErrLogNotFound - } - - err := t.GetLog(id, l) - s.rm.RUnlock() - - return err -} - -func (s *FileStore) FirstID() (uint64, error) { - id := uint64(0) - - s.rm.RLock() - if len(s.rs) > 0 { - id = s.rs[0].first - } else { - id = 0 - } - s.rm.RUnlock() - - if id > 0 { - return id, nil - } - - //if id = 0, - - return s.w.First(), nil -} - -func (s *FileStore) LastID() (uint64, error) { - id := s.w.Last() - if id > 0 { - return id, nil - } - - //if table writer has no last id, we may find in the last table reader - - s.rm.RLock() - if len(s.rs) > 0 { - id = s.rs[len(s.rs)-1].last - } - s.rm.RUnlock() - - return id, nil -} - -func (s *FileStore) StoreLog(l *Log) error { - s.wm.Lock() - err := s.storeLog(l) - s.wm.Unlock() - return err -} - -func (s *FileStore) storeLog(l *Log) error { - err := s.w.StoreLog(l) - if err == nil { - return nil - } else if err != errTableNeedFlush { - return err - } - - var r *tableReader - r, err = s.w.Flush() - - if err != nil { - log.Fatal("write table flush error %s, can not store!!!", err.Error()) - - s.w.Close() - - return err - } - - s.rm.Lock() - s.rs = append(s.rs, r) - s.rm.Unlock() - - err = s.w.StoreLog(l) - - return err -} - -func (s *FileStore) PurgeExpired(n int64) error { - s.rm.Lock() - - purges := []*tableReader{} - - t := uint32(time.Now().Unix() - int64(n)) - - for i, r := range s.rs { - if r.lastTime > t { - purges = s.rs[0:i] - s.rs = s.rs[i:] - break - } - } - - s.rm.Unlock() - - s.purgeTableReaders(purges) - - return nil -} - -func (s *FileStore) Sync() error { - return s.w.Sync() -} - -func (s *FileStore) Clear() error { - s.wm.Lock() - s.rm.Lock() - - defer func() { - s.rm.Unlock() - s.wm.Unlock() - }() - 
- s.w.Close() - - for i := range s.rs { - s.rs[i].Close() - } - - s.rs = tableReaders{} - - if err := os.RemoveAll(s.base); err != nil { - return err - } - - if err := os.MkdirAll(s.base, 0755); err != nil { - return err - } - - s.w = newTableWriter(s.base, 1, s.cfg.Replication.MaxLogFileSize, s.cfg.Replication.UseMmap) - - return nil -} - -func (s *FileStore) Close() error { - close(s.quit) - - s.wm.Lock() - s.rm.Lock() - - if r, err := s.w.Flush(); err != nil { - if err != errNilHandler { - log.Error("close err: %s", err.Error()) - } - } else { - r.Close() - s.w.Close() - } - - for i := range s.rs { - s.rs[i].Close() - } - - s.rs = tableReaders{} - - s.rm.Unlock() - s.wm.Unlock() - - return nil -} - -func (s *FileStore) checkTableReaders() { - t := time.NewTicker(60 * time.Second) - defer t.Stop() - for { - select { - case <-t.C: - s.rm.Lock() - - for _, r := range s.rs { - if !r.Keepalived() { - r.Close() - } - } - - purges := []*tableReader{} - maxNum := s.cfg.Replication.MaxLogFileNum - num := len(s.rs) - if num > maxNum { - purges = s.rs[:num-maxNum] - s.rs = s.rs[num-maxNum:] - } - - s.rm.Unlock() - - s.purgeTableReaders(purges) - - case <-s.quit: - return - } - } -} - -func (s *FileStore) purgeTableReaders(purges []*tableReader) { - for _, r := range purges { - dataName := fmtTableDataName(r.base, r.index) - metaName := fmtTableMetaName(r.base, r.index) - r.Close() - if err := os.Remove(dataName); err != nil { - log.Error("purge table data %s err: %s", dataName, err.Error()) - } - if err := os.Remove(metaName); err != nil { - log.Error("purge table meta %s err: %s", metaName, err.Error()) - } - - } -} - -func (s *FileStore) load() error { - fs, err := ioutil.ReadDir(s.base) - if err != nil { - return err - } - - s.rs = make(tableReaders, 0, len(fs)) - - var r *tableReader - var index int64 - for _, f := range fs { - if _, err := fmt.Sscanf(f.Name(), "%08d.data", &index); err == nil { - if r, err = newTableReader(s.base, index, s.cfg.Replication.UseMmap); err != nil { - log.Error("load table %s err: %s", f.Name(), err.Error()) - } else { - s.rs = append(s.rs, r) - } - } - } - - if err := s.rs.check(); err != nil { - return err - } - - return nil -} - -type tableReaders []*tableReader - -func (ts tableReaders) Len() int { - return len(ts) -} - -func (ts tableReaders) Swap(i, j int) { - ts[i], ts[j] = ts[j], ts[i] -} - -func (ts tableReaders) Less(i, j int) bool { - return ts[i].first < ts[j].first -} - -func (ts tableReaders) Search(id uint64) *tableReader { - i, j := 0, len(ts)-1 - - for i <= j { - h := i + (j-i)/2 - - if ts[h].first <= id && id <= ts[h].last { - return ts[h] - } else if ts[h].last < id { - i = h + 1 - } else { - j = h - 1 - } - } - - return nil -} - -func (ts tableReaders) check() error { - if len(ts) == 0 { - return nil - } - - sort.Sort(ts) - - first := ts[0].first - last := ts[0].last - index := ts[0].index - - if first == 0 || first > last { - return fmt.Errorf("invalid log in table %s", ts[0]) - } - - for i := 1; i < len(ts); i++ { - if ts[i].first <= last { - return fmt.Errorf("invalid first log id %d in table %s", ts[i].first, ts[i]) - } - - if ts[i].index <= index { - return fmt.Errorf("invalid index %d in table %s", ts[i].index, ts[i]) - } - - first = ts[i].first - last = ts[i].last - index = ts[i].index - } - return nil -} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/file_table.go b/vendor/github.com/siddontang/ledisdb/rpl/file_table.go deleted file mode 100644 index b4dcbfd9536c..000000000000 --- 
a/vendor/github.com/siddontang/ledisdb/rpl/file_table.go +++ /dev/null @@ -1,570 +0,0 @@ -package rpl - -import ( - "bytes" - "encoding/binary" - "errors" - "fmt" - "github.com/siddontang/go/log" - "github.com/siddontang/go/sync2" - "io" - "path" - "sync" - "time" -) - -var ( - magic = []byte("\x1c\x1d\xb8\x88\xff\x9e\x45\x55\x40\xf0\x4c\xda\xe0\xce\x47\xde\x65\x48\x71\x17") - errTableNeedFlush = errors.New("write table need flush") - errNilHandler = errors.New("nil write handler") -) - -const tableReaderKeepaliveInterval int64 = 30 - -func fmtTableDataName(base string, index int64) string { - return path.Join(base, fmt.Sprintf("%08d.data", index)) -} - -func fmtTableMetaName(base string, index int64) string { - return path.Join(base, fmt.Sprintf("%08d.meta", index)) -} - -type tableReader struct { - sync.Mutex - - base string - index int64 - - data readFile - meta readFile - - first uint64 - last uint64 - - lastTime uint32 - - lastReadTime sync2.AtomicInt64 - - useMmap bool -} - -func newTableReader(base string, index int64, useMmap bool) (*tableReader, error) { - if index <= 0 { - return nil, fmt.Errorf("invalid index %d", index) - } - t := new(tableReader) - t.base = base - t.index = index - - t.useMmap = useMmap - - var err error - - if err = t.check(); err != nil { - log.Error("check %d error: %s, try to repair", t.index, err.Error()) - - if err = t.repair(); err != nil { - log.Error("repair %d error: %s", t.index, err.Error()) - return nil, err - } - } - - t.close() - - return t, nil -} - -func (t *tableReader) String() string { - return fmt.Sprintf("%d", t.index) -} - -func (t *tableReader) Close() { - t.Lock() - - t.close() - - t.Unlock() -} - -func (t *tableReader) close() { - if t.data != nil { - t.data.Close() - t.data = nil - } - - if t.meta != nil { - t.meta.Close() - t.meta = nil - } -} - -func (t *tableReader) Keepalived() bool { - l := t.lastReadTime.Get() - if l > 0 && time.Now().Unix()-l > tableReaderKeepaliveInterval { - return false - } - - return true -} - -func (t *tableReader) getLogPos(index int) (uint32, error) { - var buf [4]byte - if _, err := t.meta.ReadAt(buf[0:4], int64(index)*4); err != nil { - return 0, err - } - - return binary.BigEndian.Uint32(buf[0:4]), nil -} - -func (t *tableReader) checkData() error { - var err error - //check will use raw file mode - if t.data, err = newReadFile(false, fmtTableDataName(t.base, t.index)); err != nil { - return err - } - - if t.data.Size() < len(magic) { - return fmt.Errorf("data file %s size %d too short", t.data.Name(), t.data.Size()) - } - - buf := make([]byte, len(magic)) - if _, err := t.data.ReadAt(buf, int64(t.data.Size()-len(magic))); err != nil { - return err - } - - if !bytes.Equal(magic, buf) { - return fmt.Errorf("data file %s invalid magic data %q", t.data.Name(), buf) - } - - return nil -} - -func (t *tableReader) checkMeta() error { - var err error - //check will use raw file mode - if t.meta, err = newReadFile(false, fmtTableMetaName(t.base, t.index)); err != nil { - return err - } - - if t.meta.Size()%4 != 0 || t.meta.Size() == 0 { - return fmt.Errorf("meta file %s invalid offset len %d, must 4 multiple and not 0", t.meta.Name(), t.meta.Size()) - } - - return nil -} - -func (t *tableReader) check() error { - var err error - - if err := t.checkMeta(); err != nil { - return err - } - - if err := t.checkData(); err != nil { - return err - } - - firstLogPos, _ := t.getLogPos(0) - lastLogPos, _ := t.getLogPos(t.meta.Size()/4 - 1) - - if firstLogPos != 0 { - return fmt.Errorf("invalid first log pos %d, must 
0", firstLogPos) - } - - var l Log - if _, err = t.decodeLogHead(&l, t.data, int64(firstLogPos)); err != nil { - return fmt.Errorf("decode first log err %s", err.Error()) - } - - t.first = l.ID - var n int64 - if n, err = t.decodeLogHead(&l, t.data, int64(lastLogPos)); err != nil { - return fmt.Errorf("decode last log err %s", err.Error()) - } else if n+int64(len(magic)) != int64(t.data.Size()) { - return fmt.Errorf("extra log data at offset %d", n) - } - - t.last = l.ID - t.lastTime = l.CreateTime - - if t.first > t.last { - return fmt.Errorf("invalid log table first %d > last %d", t.first, t.last) - } else if (t.last - t.first + 1) != uint64(t.meta.Size()/4) { - return fmt.Errorf("invalid log table, first %d, last %d, and log num %d", t.first, t.last, t.meta.Size()/4) - } - - return nil -} - -func (t *tableReader) repair() error { - t.close() - - var err error - var data writeFile - var meta writeFile - - //repair will use raw file mode - data, err = newWriteFile(false, fmtTableDataName(t.base, t.index), 0) - data.SetOffset(int64(data.Size())) - - meta, err = newWriteFile(false, fmtTableMetaName(t.base, t.index), int64(defaultLogNumInFile*4)) - - var l Log - var pos int64 = 0 - var nextPos int64 = 0 - b := make([]byte, 4) - - t.first = 0 - t.last = 0 - - for { - nextPos, err = t.decodeLogHead(&l, data, pos) - if err != nil { - //if error, we may lost all logs from pos - log.Error("%s may lost logs from %d", data.Name(), pos) - break - } - - if l.ID == 0 { - log.Error("%s may lost logs from %d, invalid log 0", data.Name(), pos) - break - } - - if t.first == 0 { - t.first = l.ID - } - - if t.last == 0 { - t.last = l.ID - } else if l.ID <= t.last { - log.Error("%s may lost logs from %d, invalid logid %d", t.data.Name(), pos, l.ID) - break - } - - t.last = l.ID - t.lastTime = l.CreateTime - - binary.BigEndian.PutUint32(b, uint32(pos)) - meta.Write(b) - - pos = nextPos - - t.lastTime = l.CreateTime - } - - var e error - if err := meta.Close(); err != nil { - e = err - } - - data.SetOffset(pos) - - if _, err = data.Write(magic); err != nil { - log.Error("write magic error %s", err.Error()) - } - - if err = data.Close(); err != nil { - return err - } - - return e -} - -func (t *tableReader) decodeLogHead(l *Log, r io.ReaderAt, pos int64) (int64, error) { - dataLen, err := l.DecodeHeadAt(r, pos) - if err != nil { - return 0, err - } - - return pos + int64(l.HeadSize()) + int64(dataLen), nil -} - -func (t *tableReader) GetLog(id uint64, l *Log) error { - if id < t.first || id > t.last { - return ErrLogNotFound - } - - t.lastReadTime.Set(time.Now().Unix()) - - t.Lock() - - if err := t.openTable(); err != nil { - t.close() - t.Unlock() - return err - } - t.Unlock() - - pos, err := t.getLogPos(int(id - t.first)) - if err != nil { - return err - } - - if err := l.DecodeAt(t.data, int64(pos)); err != nil { - return err - } else if l.ID != id { - return fmt.Errorf("invalid log id %d != %d", l.ID, id) - } - - return nil -} - -func (t *tableReader) openTable() error { - var err error - if t.data == nil { - if t.data, err = newReadFile(t.useMmap, fmtTableDataName(t.base, t.index)); err != nil { - return err - } - } - - if t.meta == nil { - if t.meta, err = newReadFile(t.useMmap, fmtTableMetaName(t.base, t.index)); err != nil { - return err - } - - } - - return nil -} - -type tableWriter struct { - sync.RWMutex - - data writeFile - meta writeFile - - base string - index int64 - - first uint64 - last uint64 - lastTime uint32 - - maxLogSize int64 - - closed bool - - syncType int - - posBuf []byte - - useMmap 
bool -} - -func newTableWriter(base string, index int64, maxLogSize int64, useMmap bool) *tableWriter { - if index <= 0 { - panic(fmt.Errorf("invalid index %d", index)) - } - - t := new(tableWriter) - - t.base = base - t.index = index - - t.maxLogSize = maxLogSize - - t.closed = false - - t.posBuf = make([]byte, 4) - - t.useMmap = useMmap - - return t -} - -func (t *tableWriter) String() string { - return fmt.Sprintf("%d", t.index) -} - -func (t *tableWriter) SetMaxLogSize(s int64) { - t.maxLogSize = s -} - -func (t *tableWriter) SetSyncType(tp int) { - t.syncType = tp -} - -func (t *tableWriter) close() { - if t.meta != nil { - if err := t.meta.Close(); err != nil { - log.Fatal("close log meta error %s", err.Error()) - } - t.meta = nil - } - - if t.data != nil { - if _, err := t.data.Write(magic); err != nil { - log.Fatal("write magic error %s", err.Error()) - } - - if err := t.data.Close(); err != nil { - log.Fatal("close log data error %s", err.Error()) - } - t.data = nil - } -} - -func (t *tableWriter) Close() { - t.Lock() - t.closed = true - - t.close() - t.Unlock() -} - -func (t *tableWriter) First() uint64 { - t.Lock() - id := t.first - t.Unlock() - return id -} - -func (t *tableWriter) Last() uint64 { - t.Lock() - id := t.last - t.Unlock() - return id -} - -func (t *tableWriter) Flush() (*tableReader, error) { - t.Lock() - - if t.data == nil || t.meta == nil { - t.Unlock() - return nil, errNilHandler - } - - tr := new(tableReader) - tr.base = t.base - tr.index = t.index - - tr.first = t.first - tr.last = t.last - tr.lastTime = t.lastTime - tr.useMmap = t.useMmap - - t.close() - - t.first = 0 - t.last = 0 - t.index = t.index + 1 - - t.Unlock() - - return tr, nil -} - -func (t *tableWriter) StoreLog(l *Log) error { - t.Lock() - err := t.storeLog(l) - t.Unlock() - - return err -} - -func (t *tableWriter) openFile() error { - var err error - if t.data == nil { - if t.data, err = newWriteFile(t.useMmap, fmtTableDataName(t.base, t.index), t.maxLogSize+t.maxLogSize/10+int64(len(magic))); err != nil { - return err - } - } - - if t.meta == nil { - if t.meta, err = newWriteFile(t.useMmap, fmtTableMetaName(t.base, t.index), int64(defaultLogNumInFile*4)); err != nil { - return err - } - } - return err -} - -func (t *tableWriter) storeLog(l *Log) error { - if l.ID == 0 { - return ErrStoreLogID - } - - if t.closed { - return fmt.Errorf("table writer is closed") - } - - if t.last > 0 && l.ID != t.last+1 { - return ErrStoreLogID - } - - if t.data != nil && t.data.Offset() > t.maxLogSize { - return errTableNeedFlush - } - - var err error - if err = t.openFile(); err != nil { - return err - } - - offsetPos := t.data.Offset() - if err = l.Encode(t.data); err != nil { - return err - } - - binary.BigEndian.PutUint32(t.posBuf, uint32(offsetPos)) - if _, err = t.meta.Write(t.posBuf); err != nil { - return err - } - - if t.first == 0 { - t.first = l.ID - } - - t.last = l.ID - t.lastTime = l.CreateTime - - if t.syncType == 2 { - if err := t.data.Sync(); err != nil { - log.Error("sync table error %s", err.Error()) - } - } - - return nil -} - -func (t *tableWriter) GetLog(id uint64, l *Log) error { - t.RLock() - defer t.RUnlock() - - if id < t.first || id > t.last { - return ErrLogNotFound - } - - var buf [4]byte - if _, err := t.meta.ReadAt(buf[0:4], int64((id-t.first)*4)); err != nil { - return err - } - - offset := binary.BigEndian.Uint32(buf[0:4]) - - if err := l.DecodeAt(t.data, int64(offset)); err != nil { - return err - } else if l.ID != id { - return fmt.Errorf("invalid log id %d != %d", id, l.ID) - 
} - - return nil -} - -func (t *tableWriter) Sync() error { - t.Lock() - - var err error - if t.data != nil { - err = t.data.Sync() - t.Unlock() - return err - } - - if t.meta != nil { - err = t.meta.Sync() - } - - t.Unlock() - - return err -} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/goleveldb_store.go b/vendor/github.com/siddontang/ledisdb/rpl/goleveldb_store.go deleted file mode 100644 index 5ece8d511b15..000000000000 --- a/vendor/github.com/siddontang/ledisdb/rpl/goleveldb_store.go +++ /dev/null @@ -1,224 +0,0 @@ -package rpl - -import ( - "bytes" - "fmt" - "github.com/siddontang/go/num" - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/store" - "os" - "sync" - "time" -) - -type GoLevelDBStore struct { - LogStore - - m sync.Mutex - db *store.DB - - cfg *config.Config - - first uint64 - last uint64 - - buf bytes.Buffer -} - -func (s *GoLevelDBStore) FirstID() (uint64, error) { - s.m.Lock() - id, err := s.firstID() - s.m.Unlock() - - return id, err -} - -func (s *GoLevelDBStore) LastID() (uint64, error) { - s.m.Lock() - id, err := s.lastID() - s.m.Unlock() - - return id, err -} - -func (s *GoLevelDBStore) firstID() (uint64, error) { - if s.first != InvalidLogID { - return s.first, nil - } - - it := s.db.NewIterator() - defer it.Close() - - it.SeekToFirst() - - if it.Valid() { - s.first = num.BytesToUint64(it.RawKey()) - } - - return s.first, nil -} - -func (s *GoLevelDBStore) lastID() (uint64, error) { - if s.last != InvalidLogID { - return s.last, nil - } - - it := s.db.NewIterator() - defer it.Close() - - it.SeekToLast() - - if it.Valid() { - s.last = num.BytesToUint64(it.RawKey()) - } - - return s.last, nil -} - -func (s *GoLevelDBStore) GetLog(id uint64, log *Log) error { - v, err := s.db.Get(num.Uint64ToBytes(id)) - if err != nil { - return err - } else if v == nil { - return ErrLogNotFound - } else { - return log.Decode(bytes.NewBuffer(v)) - } -} - -func (s *GoLevelDBStore) StoreLog(log *Log) error { - s.m.Lock() - defer s.m.Unlock() - - last, err := s.lastID() - if err != nil { - return err - } - - s.last = InvalidLogID - - s.buf.Reset() - - if log.ID != last+1 { - return ErrStoreLogID - } - - last = log.ID - key := num.Uint64ToBytes(log.ID) - - if err := log.Encode(&s.buf); err != nil { - return err - } - - if err = s.db.Put(key, s.buf.Bytes()); err != nil { - return err - } - - s.last = last - return nil -} - -func (s *GoLevelDBStore) PurgeExpired(n int64) error { - if n <= 0 { - return fmt.Errorf("invalid expired time %d", n) - } - - t := uint32(time.Now().Unix() - int64(n)) - - s.m.Lock() - defer s.m.Unlock() - - s.reset() - - it := s.db.NewIterator() - it.SeekToFirst() - - w := s.db.NewWriteBatch() - defer w.Rollback() - - l := new(Log) - for ; it.Valid(); it.Next() { - v := it.RawValue() - - if err := l.Unmarshal(v); err != nil { - return err - } else if l.CreateTime > t { - break - } else { - w.Delete(it.RawKey()) - } - } - - if err := w.Commit(); err != nil { - return err - } - - return nil -} - -func (s *GoLevelDBStore) Sync() error { - //no other way for sync, so ignore here - return nil -} - -func (s *GoLevelDBStore) reset() { - s.first = InvalidLogID - s.last = InvalidLogID -} - -func (s *GoLevelDBStore) Clear() error { - s.m.Lock() - defer s.m.Unlock() - - if s.db != nil { - s.db.Close() - } - - s.reset() - os.RemoveAll(s.cfg.DBPath) - - return s.open() -} - -func (s *GoLevelDBStore) Close() error { - s.m.Lock() - defer s.m.Unlock() - - if s.db == nil { - return nil - } - - err := s.db.Close() - s.db = nil - return err -} - -func (s 
*GoLevelDBStore) open() error { - var err error - - s.first = InvalidLogID - s.last = InvalidLogID - - s.db, err = store.Open(s.cfg) - return err -} - -func NewGoLevelDBStore(base string, syncLog int) (*GoLevelDBStore, error) { - cfg := config.NewConfigDefault() - cfg.DBName = "goleveldb" - cfg.DBPath = base - cfg.LevelDB.BlockSize = 16 * 1024 * 1024 - cfg.LevelDB.CacheSize = 64 * 1024 * 1024 - cfg.LevelDB.WriteBufferSize = 64 * 1024 * 1024 - cfg.LevelDB.Compression = false - cfg.DBSyncCommit = syncLog - - s := new(GoLevelDBStore) - s.cfg = cfg - - if err := s.open(); err != nil { - return nil, err - } - - return s, nil -} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/log.go b/vendor/github.com/siddontang/ledisdb/rpl/log.go deleted file mode 100644 index ad0b48cd4f79..000000000000 --- a/vendor/github.com/siddontang/ledisdb/rpl/log.go +++ /dev/null @@ -1,167 +0,0 @@ -package rpl - -import ( - "bytes" - "encoding/binary" - "io" - "sync" -) - -const LogHeadSize = 17 - -type Log struct { - ID uint64 - CreateTime uint32 - Compression uint8 - - Data []byte -} - -func (l *Log) HeadSize() int { - return LogHeadSize -} - -func (l *Log) Size() int { - return l.HeadSize() + len(l.Data) -} - -func (l *Log) Marshal() ([]byte, error) { - buf := bytes.NewBuffer(make([]byte, l.Size())) - buf.Reset() - - if err := l.Encode(buf); err != nil { - return nil, err - } - - return buf.Bytes(), nil -} - -func (l *Log) Unmarshal(b []byte) error { - buf := bytes.NewBuffer(b) - - return l.Decode(buf) -} - -var headPool = sync.Pool{ - New: func() interface{} { return make([]byte, LogHeadSize) }, -} - -func (l *Log) Encode(w io.Writer) error { - b := headPool.Get().([]byte) - pos := 0 - - binary.BigEndian.PutUint64(b[pos:], l.ID) - pos += 8 - binary.BigEndian.PutUint32(b[pos:], uint32(l.CreateTime)) - pos += 4 - b[pos] = l.Compression - pos++ - binary.BigEndian.PutUint32(b[pos:], uint32(len(l.Data))) - - n, err := w.Write(b) - headPool.Put(b) - - if err != nil { - return err - } else if n != LogHeadSize { - return io.ErrShortWrite - } - - if n, err = w.Write(l.Data); err != nil { - return err - } else if n != len(l.Data) { - return io.ErrShortWrite - } - return nil -} - -func (l *Log) Decode(r io.Reader) error { - length, err := l.DecodeHead(r) - if err != nil { - return err - } - - l.growData(int(length)) - - if _, err := io.ReadFull(r, l.Data); err != nil { - return err - } - - return nil -} - -func (l *Log) DecodeHead(r io.Reader) (uint32, error) { - buf := headPool.Get().([]byte) - - if _, err := io.ReadFull(r, buf); err != nil { - headPool.Put(buf) - return 0, err - } - - length := l.decodeHeadBuf(buf) - - headPool.Put(buf) - - return length, nil -} - -func (l *Log) DecodeAt(r io.ReaderAt, pos int64) error { - length, err := l.DecodeHeadAt(r, pos) - if err != nil { - return err - } - - l.growData(int(length)) - var n int - n, err = r.ReadAt(l.Data, pos+int64(LogHeadSize)) - if err == io.EOF && n == len(l.Data) { - err = nil - } - - return err -} - -func (l *Log) growData(length int) { - l.Data = l.Data[0:0] - - if cap(l.Data) >= length { - l.Data = l.Data[0:length] - } else { - l.Data = make([]byte, length) - } -} - -func (l *Log) DecodeHeadAt(r io.ReaderAt, pos int64) (uint32, error) { - buf := headPool.Get().([]byte) - - n, err := r.ReadAt(buf, pos) - if err != nil && err != io.EOF { - headPool.Put(buf) - - return 0, err - } - - length := l.decodeHeadBuf(buf) - headPool.Put(buf) - - if err == io.EOF && (length != 0 || n != len(buf)) { - return 0, err - } - - return length, nil -} - -func (l *Log) 
decodeHeadBuf(buf []byte) uint32 { - pos := 0 - l.ID = binary.BigEndian.Uint64(buf[pos:]) - pos += 8 - - l.CreateTime = binary.BigEndian.Uint32(buf[pos:]) - pos += 4 - - l.Compression = uint8(buf[pos]) - pos++ - - length := binary.BigEndian.Uint32(buf[pos:]) - return length -} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/rpl.go b/vendor/github.com/siddontang/ledisdb/rpl/rpl.go deleted file mode 100644 index d232992579d3..000000000000 --- a/vendor/github.com/siddontang/ledisdb/rpl/rpl.go +++ /dev/null @@ -1,331 +0,0 @@ -package rpl - -import ( - "encoding/binary" - "github.com/siddontang/go/log" - "github.com/siddontang/go/snappy" - "github.com/siddontang/ledisdb/config" - "os" - "path" - "sync" - "time" -) - -type Stat struct { - FirstID uint64 - LastID uint64 - CommitID uint64 -} - -type Replication struct { - m sync.Mutex - - cfg *config.Config - - s LogStore - - commitID uint64 - commitLog *os.File - - quit chan struct{} - - wg sync.WaitGroup - - nc chan struct{} - - ncm sync.Mutex -} - -func NewReplication(cfg *config.Config) (*Replication, error) { - if len(cfg.Replication.Path) == 0 { - cfg.Replication.Path = path.Join(cfg.DataDir, "rpl") - } - - base := cfg.Replication.Path - - r := new(Replication) - - r.quit = make(chan struct{}) - r.nc = make(chan struct{}) - - r.cfg = cfg - - var err error - - switch cfg.Replication.StoreName { - case "goleveldb": - if r.s, err = NewGoLevelDBStore(path.Join(base, "wal"), cfg.Replication.SyncLog); err != nil { - return nil, err - } - default: - if r.s, err = NewFileStore(path.Join(base, "ldb"), cfg); err != nil { - return nil, err - } - } - - if r.commitLog, err = os.OpenFile(path.Join(base, "commit.log"), os.O_RDWR|os.O_CREATE, 0644); err != nil { - return nil, err - } - - if s, _ := r.commitLog.Stat(); s.Size() == 0 { - r.commitID = 0 - } else if err = binary.Read(r.commitLog, binary.BigEndian, &r.commitID); err != nil { - return nil, err - } - - r.wg.Add(1) - go r.run() - - return r, nil -} - -func (r *Replication) Close() error { - close(r.quit) - - r.wg.Wait() - - r.m.Lock() - defer r.m.Unlock() - - if r.s != nil { - r.s.Close() - r.s = nil - } - - if err := r.updateCommitID(r.commitID, true); err != nil { - log.Error("update commit id err %s", err.Error()) - } - - if r.commitLog != nil { - r.commitLog.Close() - r.commitLog = nil - } - - return nil -} - -func (r *Replication) Log(data []byte) (*Log, error) { - if r.cfg.Replication.Compression { - //todo optimize - var err error - if data, err = snappy.Encode(nil, data); err != nil { - return nil, err - } - } - - r.m.Lock() - - lastID, err := r.s.LastID() - if err != nil { - r.m.Unlock() - return nil, err - } - - commitId := r.commitID - if lastID < commitId { - lastID = commitId - } else if lastID > commitId { - r.m.Unlock() - return nil, ErrCommitIDBehind - } - - l := new(Log) - l.ID = lastID + 1 - l.CreateTime = uint32(time.Now().Unix()) - - if r.cfg.Replication.Compression { - l.Compression = 1 - } else { - l.Compression = 0 - } - - l.Data = data - - if err = r.s.StoreLog(l); err != nil { - r.m.Unlock() - return nil, err - } - - r.m.Unlock() - - r.ncm.Lock() - close(r.nc) - r.nc = make(chan struct{}) - r.ncm.Unlock() - - return l, nil -} - -func (r *Replication) WaitLog() <-chan struct{} { - r.ncm.Lock() - ch := r.nc - r.ncm.Unlock() - return ch -} - -func (r *Replication) StoreLog(log *Log) error { - r.m.Lock() - err := r.s.StoreLog(log) - r.m.Unlock() - - return err -} - -func (r *Replication) FirstLogID() (uint64, error) { - r.m.Lock() - id, err := r.s.FirstID() - r.m.Unlock() 
- - return id, err -} - -func (r *Replication) LastLogID() (uint64, error) { - r.m.Lock() - id, err := r.s.LastID() - r.m.Unlock() - return id, err -} - -func (r *Replication) LastCommitID() (uint64, error) { - r.m.Lock() - id := r.commitID - r.m.Unlock() - return id, nil -} - -func (r *Replication) UpdateCommitID(id uint64) error { - r.m.Lock() - err := r.updateCommitID(id, r.cfg.Replication.SyncLog == 2) - r.m.Unlock() - - return err -} - -func (r *Replication) Stat() (*Stat, error) { - r.m.Lock() - defer r.m.Unlock() - - s := &Stat{} - var err error - - if s.FirstID, err = r.s.FirstID(); err != nil { - return nil, err - } - - if s.LastID, err = r.s.LastID(); err != nil { - return nil, err - } - - s.CommitID = r.commitID - return s, nil -} - -func (r *Replication) updateCommitID(id uint64, force bool) error { - if force { - if _, err := r.commitLog.Seek(0, os.SEEK_SET); err != nil { - return err - } - - if err := binary.Write(r.commitLog, binary.BigEndian, id); err != nil { - return err - } - } - - r.commitID = id - - return nil -} - -func (r *Replication) CommitIDBehind() (bool, error) { - r.m.Lock() - - id, err := r.s.LastID() - if err != nil { - r.m.Unlock() - return false, err - } - - behind := id > r.commitID - r.m.Unlock() - - return behind, nil -} - -func (r *Replication) GetLog(id uint64, log *Log) error { - return r.s.GetLog(id, log) -} - -func (r *Replication) NextNeedCommitLog(log *Log) error { - r.m.Lock() - defer r.m.Unlock() - - id, err := r.s.LastID() - if err != nil { - return err - } - - if id <= r.commitID { - return ErrNoBehindLog - } - - return r.s.GetLog(r.commitID+1, log) - -} - -func (r *Replication) Clear() error { - return r.ClearWithCommitID(0) -} - -func (r *Replication) ClearWithCommitID(id uint64) error { - r.m.Lock() - defer r.m.Unlock() - - if err := r.s.Clear(); err != nil { - return err - } - - return r.updateCommitID(id, true) -} - -func (r *Replication) run() { - defer r.wg.Done() - - syncTc := time.NewTicker(1 * time.Second) - purgeTc := time.NewTicker(1 * time.Hour) - - for { - select { - case <-purgeTc.C: - n := (r.cfg.Replication.ExpiredLogDays * 24 * 3600) - r.m.Lock() - err := r.s.PurgeExpired(int64(n)) - r.m.Unlock() - if err != nil { - log.Error("purge expired log error %s", err.Error()) - } - case <-syncTc.C: - if r.cfg.Replication.SyncLog == 1 { - r.m.Lock() - err := r.s.Sync() - r.m.Unlock() - if err != nil { - log.Error("sync store error %s", err.Error()) - } - } - if r.cfg.Replication.SyncLog != 2 { - //we will sync commit id every 1 second - r.m.Lock() - err := r.updateCommitID(r.commitID, true) - r.m.Unlock() - - if err != nil { - log.Error("sync commitid error %s", err.Error()) - } - } - case <-r.quit: - syncTc.Stop() - purgeTc.Stop() - return - } - } -} diff --git a/vendor/github.com/siddontang/ledisdb/rpl/store.go b/vendor/github.com/siddontang/ledisdb/rpl/store.go deleted file mode 100644 index 9f985ec6be0f..000000000000 --- a/vendor/github.com/siddontang/ledisdb/rpl/store.go +++ /dev/null @@ -1,36 +0,0 @@ -package rpl - -import ( - "errors" -) - -const ( - InvalidLogID uint64 = 0 -) - -var ( - ErrLogNotFound = errors.New("log not found") - ErrStoreLogID = errors.New("log id is less") - ErrNoBehindLog = errors.New("no behind commit log") - ErrCommitIDBehind = errors.New("commit id is behind last log id") -) - -type LogStore interface { - GetLog(id uint64, log *Log) error - - FirstID() (uint64, error) - LastID() (uint64, error) - - // if log id is less than current last id, return error - StoreLog(log *Log) error - - // Delete logs 
before n seconds - PurgeExpired(n int64) error - - Sync() error - - // Clear all logs - Clear() error - - Close() error -} diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/const.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/const.go deleted file mode 100644 index 1e7d0aecf983..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/boltdb/const.go +++ /dev/null @@ -1,3 +0,0 @@ -package boltdb - -const DBName = "boltdb" diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/db.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/db.go deleted file mode 100644 index ac8cc03f76dd..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/boltdb/db.go +++ /dev/null @@ -1,172 +0,0 @@ -package boltdb - -import ( - "github.com/boltdb/bolt" - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/store/driver" - "os" - "path" -) - -var bucketName = []byte("ledisdb") - -type Store struct { -} - -func (s Store) String() string { - return DBName -} - -func (s Store) Open(dbPath string, cfg *config.Config) (driver.IDB, error) { - os.MkdirAll(dbPath, 0755) - name := path.Join(dbPath, "ledis_bolt.db") - db := new(DB) - var err error - - db.path = name - db.cfg = cfg - - db.db, err = bolt.Open(name, 0600, nil) - if err != nil { - return nil, err - } - - var tx *bolt.Tx - tx, err = db.db.Begin(true) - if err != nil { - return nil, err - } - - _, err = tx.CreateBucketIfNotExists(bucketName) - if err != nil { - tx.Rollback() - return nil, err - } - - if err = tx.Commit(); err != nil { - return nil, err - } - - return db, nil -} - -func (s Store) Repair(path string, cfg *config.Config) error { - return nil -} - -type DB struct { - cfg *config.Config - db *bolt.DB - path string -} - -func (db *DB) Close() error { - return db.db.Close() -} - -func (db *DB) Get(key []byte) ([]byte, error) { - var value []byte - - t, err := db.db.Begin(false) - if err != nil { - return nil, err - } - defer t.Rollback() - - b := t.Bucket(bucketName) - - value = b.Get(key) - - if value == nil { - return nil, nil - } else { - return append([]byte{}, value...), nil - } -} - -func (db *DB) Put(key []byte, value []byte) error { - err := db.db.Update(func(tx *bolt.Tx) error { - b := tx.Bucket(bucketName) - return b.Put(key, value) - }) - return err -} - -func (db *DB) Delete(key []byte) error { - err := db.db.Update(func(tx *bolt.Tx) error { - b := tx.Bucket(bucketName) - return b.Delete(key) - }) - return err -} - -func (db *DB) SyncPut(key []byte, value []byte) error { - return db.Put(key, value) -} - -func (db *DB) SyncDelete(key []byte) error { - return db.Delete(key) -} - -func (db *DB) NewIterator() driver.IIterator { - tx, err := db.db.Begin(false) - if err != nil { - return &Iterator{} - } - b := tx.Bucket(bucketName) - - return &Iterator{ - tx: tx, - it: b.Cursor()} -} - -func (db *DB) NewWriteBatch() driver.IWriteBatch { - return driver.NewWriteBatch(db) -} - -func (db *DB) Begin() (driver.Tx, error) { - tx, err := db.db.Begin(true) - if err != nil { - return nil, err - } - - return &Tx{ - tx: tx, - b: tx.Bucket(bucketName), - }, nil -} - -func (db *DB) NewSnapshot() (driver.ISnapshot, error) { - return newSnapshot(db) -} - -func (db *DB) BatchPut(writes []driver.Write) error { - err := db.db.Update(func(tx *bolt.Tx) error { - b := tx.Bucket(bucketName) - var err error - for _, w := range writes { - if w.Value == nil { - err = b.Delete(w.Key) - } else { - err = b.Put(w.Key, w.Value) - } - if err != nil { - return err - } - } - return nil - }) - return err -} - -func 
(db *DB) SyncBatchPut(writes []driver.Write) error { - return db.BatchPut(writes) -} - -func (db *DB) Compact() error { - return nil -} - -func init() { - driver.Register(Store{}) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/iterator.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/iterator.go deleted file mode 100644 index aea750824137..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/boltdb/iterator.go +++ /dev/null @@ -1,50 +0,0 @@ -package boltdb - -import ( - "github.com/boltdb/bolt" -) - -type Iterator struct { - tx *bolt.Tx - it *bolt.Cursor - key []byte - value []byte -} - -func (it *Iterator) Close() error { - if it.tx != nil { - return it.tx.Rollback() - } else { - return nil - } -} - -func (it *Iterator) First() { - it.key, it.value = it.it.First() -} - -func (it *Iterator) Last() { - it.key, it.value = it.it.Last() -} - -func (it *Iterator) Seek(key []byte) { - it.key, it.value = it.it.Seek(key) -} - -func (it *Iterator) Next() { - it.key, it.value = it.it.Next() -} -func (it *Iterator) Prev() { - it.key, it.value = it.it.Prev() -} - -func (it *Iterator) Valid() bool { - return !(it.key == nil && it.value == nil) -} - -func (it *Iterator) Key() []byte { - return it.key -} -func (it *Iterator) Value() []byte { - return it.value -} diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/snapshot.go deleted file mode 100644 index ad4df90be555..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/boltdb/snapshot.go +++ /dev/null @@ -1,37 +0,0 @@ -package boltdb - -import ( - "github.com/boltdb/bolt" - "github.com/siddontang/ledisdb/store/driver" -) - -type Snapshot struct { - tx *bolt.Tx - b *bolt.Bucket -} - -func newSnapshot(db *DB) (*Snapshot, error) { - tx, err := db.db.Begin(false) - if err != nil { - return nil, err - } - - return &Snapshot{ - tx: tx, - b: tx.Bucket(bucketName)}, nil -} - -func (s *Snapshot) Get(key []byte) ([]byte, error) { - return s.b.Get(key), nil -} - -func (s *Snapshot) NewIterator() driver.IIterator { - return &Iterator{ - tx: nil, - it: s.b.Cursor(), - } -} - -func (s *Snapshot) Close() { - s.tx.Rollback() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/boltdb/tx.go b/vendor/github.com/siddontang/ledisdb/store/boltdb/tx.go deleted file mode 100644 index 8dba000f10ca..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/boltdb/tx.go +++ /dev/null @@ -1,61 +0,0 @@ -package boltdb - -import ( - "github.com/boltdb/bolt" - "github.com/siddontang/ledisdb/store/driver" -) - -type Tx struct { - tx *bolt.Tx - b *bolt.Bucket -} - -func (t *Tx) Get(key []byte) ([]byte, error) { - return t.b.Get(key), nil -} - -func (t *Tx) Put(key []byte, value []byte) error { - return t.b.Put(key, value) -} - -func (t *Tx) Delete(key []byte) error { - return t.b.Delete(key) -} - -func (t *Tx) NewIterator() driver.IIterator { - return &Iterator{ - tx: nil, - it: t.b.Cursor(), - } -} - -func (t *Tx) NewWriteBatch() driver.IWriteBatch { - return driver.NewWriteBatch(t) -} - -func (t *Tx) BatchPut(writes []driver.Write) error { - var err error - for _, w := range writes { - if w.Value == nil { - err = t.b.Delete(w.Key) - } else { - err = t.b.Put(w.Key, w.Value) - } - if err != nil { - return err - } - } - return nil -} - -func (t *Tx) SyncBatchPut(writes []driver.Write) error { - return t.BatchPut(writes) -} - -func (t *Tx) Rollback() error { - return t.tx.Rollback() -} - -func (t *Tx) Commit() error { - return t.tx.Commit() -} diff --git 
a/vendor/github.com/siddontang/ledisdb/store/db.go b/vendor/github.com/siddontang/ledisdb/store/db.go deleted file mode 100644 index 9964c5ee86d4..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/db.go +++ /dev/null @@ -1,179 +0,0 @@ -package store - -import ( - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/store/driver" - "sync" - "time" -) - -type DB struct { - db driver.IDB - name string - - st *Stat - - cfg *config.Config - - lastCommit time.Time - - m sync.Mutex -} - -func (db *DB) Close() error { - return db.db.Close() -} - -func (db *DB) String() string { - return db.name -} - -func (db *DB) NewIterator() *Iterator { - db.st.IterNum.Add(1) - - it := new(Iterator) - it.it = db.db.NewIterator() - it.st = db.st - - return it -} - -func (db *DB) Get(key []byte) ([]byte, error) { - t := time.Now() - v, err := db.db.Get(key) - db.st.statGet(v, err) - db.st.GetTotalTime.Add(time.Now().Sub(t)) - return v, err -} - -func (db *DB) Put(key []byte, value []byte) error { - db.st.PutNum.Add(1) - - if db.needSyncCommit() { - return db.db.SyncPut(key, value) - - } else { - return db.db.Put(key, value) - - } -} - -func (db *DB) Delete(key []byte) error { - db.st.DeleteNum.Add(1) - - if db.needSyncCommit() { - return db.db.SyncDelete(key) - } else { - return db.db.Delete(key) - } -} - -func (db *DB) NewWriteBatch() *WriteBatch { - db.st.BatchNum.Add(1) - wb := new(WriteBatch) - wb.wb = db.db.NewWriteBatch() - wb.st = db.st - wb.db = db - return wb -} - -func (db *DB) NewSnapshot() (*Snapshot, error) { - db.st.SnapshotNum.Add(1) - - var err error - s := &Snapshot{} - if s.ISnapshot, err = db.db.NewSnapshot(); err != nil { - return nil, err - } - s.st = db.st - - return s, nil -} - -func (db *DB) Compact() error { - db.st.CompactNum.Add(1) - - t := time.Now() - err := db.db.Compact() - - db.st.CompactTotalTime.Add(time.Now().Sub(t)) - - return err -} - -func (db *DB) RangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { - return NewRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) -} - -func (db *DB) RevRangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { - return NewRevRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) -} - -//count < 0, unlimit. -// -//offset must >= 0, if < 0, will get nothing. -func (db *DB) RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { - return NewRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) -} - -//count < 0, unlimit. -// -//offset must >= 0, if < 0, will get nothing. 
-func (db *DB) RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { - return NewRevRangeLimitIterator(db.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) -} - -func (db *DB) Begin() (*Tx, error) { - tx, err := db.db.Begin() - if err != nil { - return nil, err - } - - db.st.TxNum.Add(1) - - return &Tx{tx, db.st}, nil -} - -func (db *DB) Stat() *Stat { - return db.st -} - -func (db *DB) needSyncCommit() bool { - if db.cfg.DBSyncCommit == 0 { - return false - } else if db.cfg.DBSyncCommit == 2 { - return true - } else { - n := time.Now() - need := false - db.m.Lock() - - if n.Sub(db.lastCommit) > time.Second { - need = true - } - db.lastCommit = n - - db.m.Unlock() - return need - } - -} - -func (db *DB) GetSlice(key []byte) (Slice, error) { - if d, ok := db.db.(driver.ISliceGeter); ok { - t := time.Now() - v, err := d.GetSlice(key) - db.st.statGet(v, err) - db.st.GetTotalTime.Add(time.Now().Sub(t)) - return v, err - } else { - v, err := db.Get(key) - if err != nil { - return nil, err - } else if v == nil { - return nil, nil - } else { - return driver.GoSlice(v), nil - } - } -} diff --git a/vendor/github.com/siddontang/ledisdb/store/driver/batch.go b/vendor/github.com/siddontang/ledisdb/store/driver/batch.go deleted file mode 100644 index 1c1e8992125d..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/driver/batch.go +++ /dev/null @@ -1,70 +0,0 @@ -package driver - -import ( - "github.com/syndtr/goleveldb/leveldb" -) - -type BatchPuter interface { - BatchPut([]Write) error - SyncBatchPut([]Write) error -} - -type Write struct { - Key []byte - Value []byte -} - -type WriteBatch struct { - d *leveldb.Batch - - wb []Write - w BatchPuter -} - -func (wb *WriteBatch) Close() { - wb.d.Reset() - wb.wb = wb.wb[0:0] -} - -func (wb *WriteBatch) Put(key, value []byte) { - if value == nil { - value = []byte{} - } - wb.wb = append(wb.wb, Write{key, value}) -} - -func (wb *WriteBatch) Delete(key []byte) { - wb.wb = append(wb.wb, Write{key, nil}) -} - -func (wb *WriteBatch) Commit() error { - return wb.w.BatchPut(wb.wb) -} - -func (wb *WriteBatch) SyncCommit() error { - return wb.w.SyncBatchPut(wb.wb) -} - -func (wb *WriteBatch) Rollback() error { - wb.wb = wb.wb[0:0] - return nil -} - -func (wb *WriteBatch) Data() []byte { - wb.d.Reset() - for _, w := range wb.wb { - if w.Value == nil { - wb.d.Delete(w.Key) - } else { - wb.d.Put(w.Key, w.Value) - } - } - return wb.d.Dump() -} - -func NewWriteBatch(puter BatchPuter) *WriteBatch { - return &WriteBatch{ - &leveldb.Batch{}, - []Write{}, - puter} -} diff --git a/vendor/github.com/siddontang/ledisdb/store/driver/driver.go b/vendor/github.com/siddontang/ledisdb/store/driver/driver.go deleted file mode 100644 index b571738df1d0..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/driver/driver.go +++ /dev/null @@ -1,79 +0,0 @@ -package driver - -import ( - "errors" -) - -var ( - ErrTxSupport = errors.New("transaction is not supported") -) - -type IDB interface { - Close() error - - Get(key []byte) ([]byte, error) - - Put(key []byte, value []byte) error - Delete(key []byte) error - - SyncPut(key []byte, value []byte) error - SyncDelete(key []byte) error - - NewIterator() IIterator - - NewWriteBatch() IWriteBatch - - NewSnapshot() (ISnapshot, error) - - Begin() (Tx, error) - - Compact() error -} - -type ISnapshot interface { - Get(key []byte) ([]byte, error) - NewIterator() IIterator - Close() -} - -type IIterator interface { - Close() error - - First() - Last() - Seek(key 
[]byte) - - Next() - Prev() - - Valid() bool - - Key() []byte - Value() []byte -} - -type IWriteBatch interface { - Put(key []byte, value []byte) - Delete(key []byte) - Commit() error - SyncCommit() error - Rollback() error - Data() []byte - Close() -} - -type Tx interface { - Get(key []byte) ([]byte, error) - Put(key []byte, value []byte) error - Delete(key []byte) error - - NewIterator() IIterator - NewWriteBatch() IWriteBatch - - Commit() error - Rollback() error -} - -type ISliceGeter interface { - GetSlice(key []byte) (ISlice, error) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/driver/slice.go b/vendor/github.com/siddontang/ledisdb/store/driver/slice.go deleted file mode 100644 index d0c80e0b8fcf..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/driver/slice.go +++ /dev/null @@ -1,21 +0,0 @@ -package driver - -type ISlice interface { - Data() []byte - Size() int - Free() -} - -type GoSlice []byte - -func (s GoSlice) Data() []byte { - return []byte(s) -} - -func (s GoSlice) Size() int { - return len(s) -} - -func (s GoSlice) Free() { - -} diff --git a/vendor/github.com/siddontang/ledisdb/store/driver/store.go b/vendor/github.com/siddontang/ledisdb/store/driver/store.go deleted file mode 100644 index dffa670552bf..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/driver/store.go +++ /dev/null @@ -1,45 +0,0 @@ -package driver - -import ( - "fmt" - "github.com/siddontang/ledisdb/config" -) - -type Store interface { - String() string - Open(path string, cfg *config.Config) (IDB, error) - Repair(path string, cfg *config.Config) error -} - -var dbs = map[string]Store{} - -func Register(s Store) { - name := s.String() - if _, ok := dbs[name]; ok { - panic(fmt.Errorf("store %s is registered", s)) - } - - dbs[name] = s -} - -func ListStores() []string { - s := []string{} - for k, _ := range dbs { - s = append(s, k) - } - - return s -} - -func GetStore(cfg *config.Config) (Store, error) { - if len(cfg.DBName) == 0 { - cfg.DBName = config.DefaultDBName - } - - s, ok := dbs[cfg.DBName] - if !ok { - return nil, fmt.Errorf("store %s is not registered", cfg.DBName) - } - - return s, nil -} diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/batch.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/batch.go deleted file mode 100644 index 2032279a2a8e..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/goleveldb/batch.go +++ /dev/null @@ -1,39 +0,0 @@ -package goleveldb - -import ( - "github.com/syndtr/goleveldb/leveldb" -) - -type WriteBatch struct { - db *DB - wbatch *leveldb.Batch -} - -func (w *WriteBatch) Put(key, value []byte) { - w.wbatch.Put(key, value) -} - -func (w *WriteBatch) Delete(key []byte) { - w.wbatch.Delete(key) -} - -func (w *WriteBatch) Commit() error { - return w.db.db.Write(w.wbatch, nil) -} - -func (w *WriteBatch) SyncCommit() error { - return w.db.db.Write(w.wbatch, w.db.syncOpts) -} - -func (w *WriteBatch) Rollback() error { - w.wbatch.Reset() - return nil -} - -func (w *WriteBatch) Close() { - w.wbatch.Reset() -} - -func (w *WriteBatch) Data() []byte { - return w.wbatch.Dump() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/const.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/const.go deleted file mode 100644 index 2fffa7c82bb4..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/goleveldb/const.go +++ /dev/null @@ -1,4 +0,0 @@ -package goleveldb - -const DBName = "goleveldb" -const MemDBName = "memory" diff --git 
a/vendor/github.com/siddontang/ledisdb/store/goleveldb/db.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/db.go deleted file mode 100644 index af8633b8cd7b..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/goleveldb/db.go +++ /dev/null @@ -1,208 +0,0 @@ -package goleveldb - -import ( - "github.com/syndtr/goleveldb/leveldb" - "github.com/syndtr/goleveldb/leveldb/cache" - "github.com/syndtr/goleveldb/leveldb/filter" - "github.com/syndtr/goleveldb/leveldb/opt" - "github.com/syndtr/goleveldb/leveldb/storage" - "github.com/syndtr/goleveldb/leveldb/util" - - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/store/driver" - - "os" -) - -const defaultFilterBits int = 10 - -type Store struct { -} - -func (s Store) String() string { - return DBName -} - -type MemStore struct { -} - -func (s MemStore) String() string { - return MemDBName -} - -type DB struct { - path string - - cfg *config.LevelDBConfig - - db *leveldb.DB - - opts *opt.Options - - iteratorOpts *opt.ReadOptions - - syncOpts *opt.WriteOptions - - cache cache.Cache - - filter filter.Filter -} - -func (s Store) Open(path string, cfg *config.Config) (driver.IDB, error) { - if err := os.MkdirAll(path, 0755); err != nil { - return nil, err - } - - db := new(DB) - db.path = path - db.cfg = &cfg.LevelDB - - db.initOpts() - - var err error - db.db, err = leveldb.OpenFile(db.path, db.opts) - - if err != nil { - return nil, err - } - - return db, nil -} - -func (s Store) Repair(path string, cfg *config.Config) error { - db, err := leveldb.RecoverFile(path, newOptions(&cfg.LevelDB)) - if err != nil { - return err - } - - db.Close() - return nil -} - -func (s MemStore) Open(path string, cfg *config.Config) (driver.IDB, error) { - db := new(DB) - db.path = path - db.cfg = &cfg.LevelDB - - db.initOpts() - - var err error - db.db, err = leveldb.Open(storage.NewMemStorage(), db.opts) - if err != nil { - return nil, err - } - - return db, nil -} - -func (s MemStore) Repair(path string, cfg *config.Config) error { - return nil -} - -func (db *DB) initOpts() { - db.opts = newOptions(db.cfg) - - db.iteratorOpts = &opt.ReadOptions{} - db.iteratorOpts.DontFillCache = true - - db.syncOpts = &opt.WriteOptions{} - db.syncOpts.Sync = true -} - -func newOptions(cfg *config.LevelDBConfig) *opt.Options { - opts := &opt.Options{} - opts.ErrorIfMissing = false - - opts.BlockCache = cache.NewLRUCache(cfg.CacheSize) - - //we must use bloomfilter - opts.Filter = filter.NewBloomFilter(defaultFilterBits) - - if !cfg.Compression { - opts.Compression = opt.NoCompression - } else { - opts.Compression = opt.SnappyCompression - } - - opts.BlockSize = cfg.BlockSize - opts.WriteBuffer = cfg.WriteBufferSize - opts.CachedOpenFiles = cfg.MaxOpenFiles - - //here we use default value, later add config support - opts.CompactionTableSize = 32 * 1024 * 1024 - opts.WriteL0SlowdownTrigger = 16 - opts.WriteL0PauseTrigger = 64 - - return opts -} - -func (db *DB) Close() error { - return db.db.Close() -} - -func (db *DB) Put(key, value []byte) error { - return db.db.Put(key, value, nil) -} - -func (db *DB) Get(key []byte) ([]byte, error) { - v, err := db.db.Get(key, nil) - if err == leveldb.ErrNotFound { - return nil, nil - } - return v, nil -} - -func (db *DB) Delete(key []byte) error { - return db.db.Delete(key, nil) -} - -func (db *DB) SyncPut(key []byte, value []byte) error { - return db.db.Put(key, value, db.syncOpts) -} - -func (db *DB) SyncDelete(key []byte) error { - return db.db.Delete(key, db.syncOpts) -} - -func (db *DB) 
NewWriteBatch() driver.IWriteBatch { - wb := &WriteBatch{ - db: db, - wbatch: new(leveldb.Batch), - } - return wb -} - -func (db *DB) NewIterator() driver.IIterator { - it := &Iterator{ - db.db.NewIterator(nil, db.iteratorOpts), - } - - return it -} - -func (db *DB) Begin() (driver.Tx, error) { - return nil, driver.ErrTxSupport -} - -func (db *DB) NewSnapshot() (driver.ISnapshot, error) { - snapshot, err := db.db.GetSnapshot() - if err != nil { - return nil, err - } - - s := &Snapshot{ - db: db, - snp: snapshot, - } - - return s, nil -} - -func (db *DB) Compact() error { - return db.db.CompactRange(util.Range{nil, nil}) -} - -func init() { - driver.Register(Store{}) - driver.Register(MemStore{}) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/iterator.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/iterator.go deleted file mode 100644 index c1fd8b5573bb..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/goleveldb/iterator.go +++ /dev/null @@ -1,49 +0,0 @@ -package goleveldb - -import ( - "github.com/syndtr/goleveldb/leveldb/iterator" -) - -type Iterator struct { - it iterator.Iterator -} - -func (it *Iterator) Key() []byte { - return it.it.Key() -} - -func (it *Iterator) Value() []byte { - return it.it.Value() -} - -func (it *Iterator) Close() error { - if it.it != nil { - it.it.Release() - it.it = nil - } - return nil -} - -func (it *Iterator) Valid() bool { - return it.it.Valid() -} - -func (it *Iterator) Next() { - it.it.Next() -} - -func (it *Iterator) Prev() { - it.it.Prev() -} - -func (it *Iterator) First() { - it.it.First() -} - -func (it *Iterator) Last() { - it.it.Last() -} - -func (it *Iterator) Seek(key []byte) { - it.it.Seek(key) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/goleveldb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/goleveldb/snapshot.go deleted file mode 100644 index c615579bb89d..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/goleveldb/snapshot.go +++ /dev/null @@ -1,26 +0,0 @@ -package goleveldb - -import ( - "github.com/siddontang/ledisdb/store/driver" - "github.com/syndtr/goleveldb/leveldb" -) - -type Snapshot struct { - db *DB - snp *leveldb.Snapshot -} - -func (s *Snapshot) Get(key []byte) ([]byte, error) { - return s.snp.Get(key, s.db.iteratorOpts) -} - -func (s *Snapshot) NewIterator() driver.IIterator { - it := &Iterator{ - s.snp.NewIterator(nil, s.db.iteratorOpts), - } - return it -} - -func (s *Snapshot) Close() { - s.snp.Release() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/iterator.go b/vendor/github.com/siddontang/ledisdb/store/iterator.go deleted file mode 100644 index d81976bdc827..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/iterator.go +++ /dev/null @@ -1,333 +0,0 @@ -package store - -import ( - "bytes" - "github.com/siddontang/ledisdb/store/driver" -) - -const ( - IteratorForward uint8 = 0 - IteratorBackward uint8 = 1 -) - -const ( - RangeClose uint8 = 0x00 - RangeLOpen uint8 = 0x01 - RangeROpen uint8 = 0x10 - RangeOpen uint8 = 0x11 -) - -// min must less or equal than max -// -// range type: -// -// close: [min, max] -// open: (min, max) -// lopen: (min, max] -// ropen: [min, max) -// -type Range struct { - Min []byte - Max []byte - - Type uint8 -} - -type Limit struct { - Offset int - Count int -} - -type Iterator struct { - it driver.IIterator - st *Stat -} - -// Returns a copy of key. -func (it *Iterator) Key() []byte { - k := it.it.Key() - if k == nil { - return nil - } - - return append([]byte{}, k...) 
-} - -// Returns a copy of value. -func (it *Iterator) Value() []byte { - v := it.it.Value() - if v == nil { - return nil - } - - return append([]byte{}, v...) -} - -// Returns a reference of key. -// you must be careful that it will be changed after next iterate. -func (it *Iterator) RawKey() []byte { - return it.it.Key() -} - -// Returns a reference of value. -// you must be careful that it will be changed after next iterate. -func (it *Iterator) RawValue() []byte { - return it.it.Value() -} - -// Copy key to b, if b len is small or nil, returns a new one. -func (it *Iterator) BufKey(b []byte) []byte { - k := it.RawKey() - if k == nil { - return nil - } - if b == nil { - b = []byte{} - } - - b = b[0:0] - return append(b, k...) -} - -// Copy value to b, if b len is small or nil, returns a new one. -func (it *Iterator) BufValue(b []byte) []byte { - v := it.RawValue() - if v == nil { - return nil - } - - if b == nil { - b = []byte{} - } - - b = b[0:0] - return append(b, v...) -} - -func (it *Iterator) Close() { - if it.it != nil { - it.st.IterCloseNum.Add(1) - it.it.Close() - it.it = nil - } -} - -func (it *Iterator) Valid() bool { - return it.it.Valid() -} - -func (it *Iterator) Next() { - it.st.IterSeekNum.Add(1) - it.it.Next() -} - -func (it *Iterator) Prev() { - it.st.IterSeekNum.Add(1) - it.it.Prev() -} - -func (it *Iterator) SeekToFirst() { - it.st.IterSeekNum.Add(1) - it.it.First() -} - -func (it *Iterator) SeekToLast() { - it.st.IterSeekNum.Add(1) - it.it.Last() -} - -func (it *Iterator) Seek(key []byte) { - it.st.IterSeekNum.Add(1) - it.it.Seek(key) -} - -// Finds by key, if not found, nil returns. -func (it *Iterator) Find(key []byte) []byte { - it.Seek(key) - if it.Valid() { - k := it.RawKey() - if k == nil { - return nil - } else if bytes.Equal(k, key) { - return it.Value() - } - } - - return nil -} - -// Finds by key, if not found, nil returns, else a reference of value returns. -// you must be careful that it will be changed after next iterate. 
-func (it *Iterator) RawFind(key []byte) []byte { - it.Seek(key) - if it.Valid() { - k := it.RawKey() - if k == nil { - return nil - } else if bytes.Equal(k, key) { - return it.RawValue() - } - } - - return nil -} - -type RangeLimitIterator struct { - it *Iterator - - r *Range - l *Limit - - step int - - //0 for IteratorForward, 1 for IteratorBackward - direction uint8 -} - -func (it *RangeLimitIterator) Key() []byte { - return it.it.Key() -} - -func (it *RangeLimitIterator) Value() []byte { - return it.it.Value() -} - -func (it *RangeLimitIterator) RawKey() []byte { - return it.it.RawKey() -} - -func (it *RangeLimitIterator) RawValue() []byte { - return it.it.RawValue() -} - -func (it *RangeLimitIterator) BufKey(b []byte) []byte { - return it.it.BufKey(b) -} - -func (it *RangeLimitIterator) BufValue(b []byte) []byte { - return it.it.BufValue(b) -} - -func (it *RangeLimitIterator) Valid() bool { - if it.l.Offset < 0 { - return false - } else if !it.it.Valid() { - return false - } else if it.l.Count >= 0 && it.step >= it.l.Count { - return false - } - - if it.direction == IteratorForward { - if it.r.Max != nil { - r := bytes.Compare(it.it.RawKey(), it.r.Max) - if it.r.Type&RangeROpen > 0 { - return !(r >= 0) - } else { - return !(r > 0) - } - } - } else { - if it.r.Min != nil { - r := bytes.Compare(it.it.RawKey(), it.r.Min) - if it.r.Type&RangeLOpen > 0 { - return !(r <= 0) - } else { - return !(r < 0) - } - } - } - - return true -} - -func (it *RangeLimitIterator) Next() { - it.step++ - - if it.direction == IteratorForward { - it.it.Next() - } else { - it.it.Prev() - } -} - -func (it *RangeLimitIterator) Close() { - it.it.Close() -} - -func NewRangeLimitIterator(i *Iterator, r *Range, l *Limit) *RangeLimitIterator { - return rangeLimitIterator(i, r, l, IteratorForward) -} - -func NewRevRangeLimitIterator(i *Iterator, r *Range, l *Limit) *RangeLimitIterator { - return rangeLimitIterator(i, r, l, IteratorBackward) -} - -func NewRangeIterator(i *Iterator, r *Range) *RangeLimitIterator { - return rangeLimitIterator(i, r, &Limit{0, -1}, IteratorForward) -} - -func NewRevRangeIterator(i *Iterator, r *Range) *RangeLimitIterator { - return rangeLimitIterator(i, r, &Limit{0, -1}, IteratorBackward) -} - -func rangeLimitIterator(i *Iterator, r *Range, l *Limit, direction uint8) *RangeLimitIterator { - it := new(RangeLimitIterator) - - it.it = i - - it.r = r - it.l = l - it.direction = direction - - it.step = 0 - - if l.Offset < 0 { - return it - } - - if direction == IteratorForward { - if r.Min == nil { - it.it.SeekToFirst() - } else { - it.it.Seek(r.Min) - - if r.Type&RangeLOpen > 0 { - if it.it.Valid() && bytes.Equal(it.it.RawKey(), r.Min) { - it.it.Next() - } - } - } - } else { - if r.Max == nil { - it.it.SeekToLast() - } else { - it.it.Seek(r.Max) - - if !it.it.Valid() { - it.it.SeekToLast() - } else { - if !bytes.Equal(it.it.RawKey(), r.Max) { - it.it.Prev() - } - } - - if r.Type&RangeROpen > 0 { - if it.it.Valid() && bytes.Equal(it.it.RawKey(), r.Max) { - it.it.Prev() - } - } - } - } - - for i := 0; i < l.Offset; i++ { - if it.it.Valid() { - if it.direction == IteratorForward { - it.it.Next() - } else { - it.it.Prev() - } - } - } - - return it -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/batch.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/batch.go deleted file mode 100644 index 027aa39cdb95..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/batch.go +++ /dev/null @@ -1,104 +0,0 @@ -// +build leveldb - -package leveldb - -// #cgo LDFLAGS: 
-lleveldb -// #include "leveldb/c.h" -// #include "leveldb_ext.h" -import "C" - -import ( - "github.com/syndtr/goleveldb/leveldb" - "unsafe" -) - -type WriteBatch struct { - db *DB - wbatch *C.leveldb_writebatch_t - - gbatch *leveldb.Batch -} - -func newWriteBatch(db *DB) *WriteBatch { - w := new(WriteBatch) - w.db = db - w.wbatch = C.leveldb_writebatch_create() - w.gbatch = new(leveldb.Batch) - - return w -} - -func (w *WriteBatch) Close() { - if w.wbatch != nil { - C.leveldb_writebatch_destroy(w.wbatch) - w.wbatch = nil - } - - w.gbatch = nil -} - -func (w *WriteBatch) Put(key, value []byte) { - var k, v *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - if len(value) != 0 { - v = (*C.char)(unsafe.Pointer(&value[0])) - } - - lenk := len(key) - lenv := len(value) - - C.leveldb_writebatch_put(w.wbatch, k, C.size_t(lenk), v, C.size_t(lenv)) -} - -func (w *WriteBatch) Delete(key []byte) { - C.leveldb_writebatch_delete(w.wbatch, - (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))) -} - -func (w *WriteBatch) Commit() error { - return w.commit(w.db.writeOpts) -} - -func (w *WriteBatch) SyncCommit() error { - return w.commit(w.db.syncOpts) -} - -func (w *WriteBatch) Rollback() error { - C.leveldb_writebatch_clear(w.wbatch) - - return nil -} - -func (w *WriteBatch) commit(wb *WriteOptions) error { - var errStr *C.char - C.leveldb_write(w.db.db, wb.Opt, w.wbatch, &errStr) - if errStr != nil { - return saveError(errStr) - } - return nil -} - -//export leveldb_writebatch_iterate_put -func leveldb_writebatch_iterate_put(p unsafe.Pointer, k *C.char, klen C.size_t, v *C.char, vlen C.size_t) { - b := (*leveldb.Batch)(p) - key := slice(unsafe.Pointer(k), int(klen)) - value := slice(unsafe.Pointer(v), int(vlen)) - b.Put(key, value) -} - -//export leveldb_writebatch_iterate_delete -func leveldb_writebatch_iterate_delete(p unsafe.Pointer, k *C.char, klen C.size_t) { - b := (*leveldb.Batch)(p) - key := slice(unsafe.Pointer(k), int(klen)) - b.Delete(key) -} - -func (w *WriteBatch) Data() []byte { - w.gbatch.Reset() - C.leveldb_writebatch_iterate_ext(w.wbatch, - unsafe.Pointer(w.gbatch)) - b := w.gbatch.Dump() - return b -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/cache.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/cache.go deleted file mode 100644 index e5587cbf89e2..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/cache.go +++ /dev/null @@ -1,20 +0,0 @@ -// +build leveldb - -package leveldb - -// #cgo LDFLAGS: -lleveldb -// #include -// #include "leveldb/c.h" -import "C" - -type Cache struct { - Cache *C.leveldb_cache_t -} - -func NewLRUCache(capacity int) *Cache { - return &Cache{C.leveldb_cache_create_lru(C.size_t(capacity))} -} - -func (c *Cache) Close() { - C.leveldb_cache_destroy(c.Cache) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/const.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/const.go deleted file mode 100644 index df5b3c7a83b5..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/const.go +++ /dev/null @@ -1,3 +0,0 @@ -package leveldb - -const DBName = "leveldb" diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/db.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/db.go deleted file mode 100644 index 64bbc2bfddf7..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/db.go +++ /dev/null @@ -1,315 +0,0 @@ -// +build leveldb - -// Package leveldb is a wrapper for c++ leveldb -package leveldb - -/* -#cgo LDFLAGS: -lleveldb -#include 
-#include "leveldb_ext.h" -*/ -import "C" - -import ( - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/store/driver" - "os" - "runtime" - "unsafe" -) - -const defaultFilterBits int = 10 - -type Store struct { -} - -func (s Store) String() string { - return DBName -} - -func (s Store) Open(path string, cfg *config.Config) (driver.IDB, error) { - if err := os.MkdirAll(path, 0755); err != nil { - return nil, err - } - - db := new(DB) - db.path = path - db.cfg = &cfg.LevelDB - - if err := db.open(); err != nil { - return nil, err - } - - return db, nil -} - -func (s Store) Repair(path string, cfg *config.Config) error { - db := new(DB) - db.cfg = &cfg.LevelDB - db.path = path - - err := db.open() - defer db.Close() - - //open ok, do not need repair - if err == nil { - return nil - } - - var errStr *C.char - ldbname := C.CString(path) - defer C.leveldb_free(unsafe.Pointer(ldbname)) - - C.leveldb_repair_db(db.opts.Opt, ldbname, &errStr) - if errStr != nil { - return saveError(errStr) - } - return nil -} - -type DB struct { - path string - - cfg *config.LevelDBConfig - - db *C.leveldb_t - - opts *Options - - //for default read and write options - readOpts *ReadOptions - writeOpts *WriteOptions - iteratorOpts *ReadOptions - - syncOpts *WriteOptions - - cache *Cache - - filter *FilterPolicy -} - -func (db *DB) open() error { - db.initOptions(db.cfg) - - var errStr *C.char - ldbname := C.CString(db.path) - defer C.leveldb_free(unsafe.Pointer(ldbname)) - - db.db = C.leveldb_open(db.opts.Opt, ldbname, &errStr) - if errStr != nil { - db.db = nil - return saveError(errStr) - } - return nil -} - -func (db *DB) initOptions(cfg *config.LevelDBConfig) { - opts := NewOptions() - - opts.SetCreateIfMissing(true) - - db.cache = NewLRUCache(cfg.CacheSize) - opts.SetCache(db.cache) - - //we must use bloomfilter - db.filter = NewBloomFilter(defaultFilterBits) - opts.SetFilterPolicy(db.filter) - - if !cfg.Compression { - opts.SetCompression(NoCompression) - } else { - opts.SetCompression(SnappyCompression) - } - - opts.SetBlockSize(cfg.BlockSize) - - opts.SetWriteBufferSize(cfg.WriteBufferSize) - - opts.SetMaxOpenFiles(cfg.MaxOpenFiles) - - db.opts = opts - - db.readOpts = NewReadOptions() - db.writeOpts = NewWriteOptions() - - db.syncOpts = NewWriteOptions() - db.syncOpts.SetSync(true) - - db.iteratorOpts = NewReadOptions() - db.iteratorOpts.SetFillCache(false) -} - -func (db *DB) Close() error { - if db.db != nil { - C.leveldb_close(db.db) - db.db = nil - } - - db.opts.Close() - - if db.cache != nil { - db.cache.Close() - } - - if db.filter != nil { - db.filter.Close() - } - - db.readOpts.Close() - db.writeOpts.Close() - db.iteratorOpts.Close() - - return nil -} - -func (db *DB) Put(key, value []byte) error { - return db.put(db.writeOpts, key, value) -} - -func (db *DB) Get(key []byte) ([]byte, error) { - return db.get(db.readOpts, key) -} - -func (db *DB) Delete(key []byte) error { - return db.delete(db.writeOpts, key) -} - -func (db *DB) SyncPut(key []byte, value []byte) error { - return db.put(db.syncOpts, key, value) -} - -func (db *DB) SyncDelete(key []byte) error { - return db.delete(db.syncOpts, key) -} - -func (db *DB) NewWriteBatch() driver.IWriteBatch { - wb := newWriteBatch(db) - - runtime.SetFinalizer(wb, func(w *WriteBatch) { - w.Close() - }) - - return wb -} - -func (db *DB) NewIterator() driver.IIterator { - it := new(Iterator) - - it.it = C.leveldb_create_iterator(db.db, db.iteratorOpts.Opt) - - return it -} - -func (db *DB) NewSnapshot() (driver.ISnapshot, error) { - snap 
:= &Snapshot{ - db: db, - snap: C.leveldb_create_snapshot(db.db), - readOpts: NewReadOptions(), - iteratorOpts: NewReadOptions(), - } - snap.readOpts.SetSnapshot(snap) - snap.iteratorOpts.SetSnapshot(snap) - snap.iteratorOpts.SetFillCache(false) - - return snap, nil -} - -func (db *DB) put(wo *WriteOptions, key, value []byte) error { - var errStr *C.char - var k, v *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - if len(value) != 0 { - v = (*C.char)(unsafe.Pointer(&value[0])) - } - - lenk := len(key) - lenv := len(value) - C.leveldb_put( - db.db, wo.Opt, k, C.size_t(lenk), v, C.size_t(lenv), &errStr) - - if errStr != nil { - return saveError(errStr) - } - return nil -} - -func (db *DB) get(ro *ReadOptions, key []byte) ([]byte, error) { - var errStr *C.char - var vallen C.size_t - var k *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - - value := C.leveldb_get( - db.db, ro.Opt, k, C.size_t(len(key)), &vallen, &errStr) - - if errStr != nil { - return nil, saveError(errStr) - } - - if value == nil { - return nil, nil - } - - defer C.leveldb_free(unsafe.Pointer(value)) - - return C.GoBytes(unsafe.Pointer(value), C.int(vallen)), nil -} - -func (db *DB) getSlice(ro *ReadOptions, key []byte) (driver.ISlice, error) { - var errStr *C.char - var vallen C.size_t - var k *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - - value := C.leveldb_get( - db.db, ro.Opt, k, C.size_t(len(key)), &vallen, &errStr) - - if errStr != nil { - return nil, saveError(errStr) - } - - if value == nil { - return nil, nil - } - - return NewCSlice(unsafe.Pointer(value), int(vallen)), nil -} - -func (db *DB) delete(wo *WriteOptions, key []byte) error { - var errStr *C.char - var k *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - - C.leveldb_delete( - db.db, wo.Opt, k, C.size_t(len(key)), &errStr) - - if errStr != nil { - return saveError(errStr) - } - return nil -} - -func (db *DB) Begin() (driver.Tx, error) { - return nil, driver.ErrTxSupport -} - -func (db *DB) Compact() error { - C.leveldb_compact_range(db.db, nil, 0, nil, 0) - return nil -} - -func (db *DB) GetSlice(key []byte) (driver.ISlice, error) { - return db.getSlice(db.readOpts, key) -} - -func init() { - driver.Register(Store{}) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/filterpolicy.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/filterpolicy.go deleted file mode 100644 index 640139fb8b8e..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/filterpolicy.go +++ /dev/null @@ -1,21 +0,0 @@ -// +build leveldb - -package leveldb - -// #cgo LDFLAGS: -lleveldb -// #include -// #include "leveldb/c.h" -import "C" - -type FilterPolicy struct { - Policy *C.leveldb_filterpolicy_t -} - -func NewBloomFilter(bitsPerKey int) *FilterPolicy { - policy := C.leveldb_filterpolicy_create_bloom(C.int(bitsPerKey)) - return &FilterPolicy{policy} -} - -func (fp *FilterPolicy) Close() { - C.leveldb_filterpolicy_destroy(fp.Policy) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/iterator.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/iterator.go deleted file mode 100644 index 49cfd7db18ea..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/iterator.go +++ /dev/null @@ -1,70 +0,0 @@ -// +build leveldb - -package leveldb - -// #cgo LDFLAGS: -lleveldb -// #include -// #include "leveldb/c.h" -// #include "leveldb_ext.h" -import "C" - -import ( - "unsafe" -) - -type Iterator struct { - it 
*C.leveldb_iterator_t - isValid C.uchar -} - -func (it *Iterator) Key() []byte { - var klen C.size_t - kdata := C.leveldb_iter_key(it.it, &klen) - if kdata == nil { - return nil - } - - return slice(unsafe.Pointer(kdata), int(C.int(klen))) -} - -func (it *Iterator) Value() []byte { - var vlen C.size_t - vdata := C.leveldb_iter_value(it.it, &vlen) - if vdata == nil { - return nil - } - - return slice(unsafe.Pointer(vdata), int(C.int(vlen))) -} - -func (it *Iterator) Close() error { - if it.it != nil { - C.leveldb_iter_destroy(it.it) - it.it = nil - } - return nil -} - -func (it *Iterator) Valid() bool { - return ucharToBool(it.isValid) -} - -func (it *Iterator) Next() { - it.isValid = C.leveldb_iter_next_ext(it.it) -} - -func (it *Iterator) Prev() { - it.isValid = C.leveldb_iter_prev_ext(it.it) -} - -func (it *Iterator) First() { - it.isValid = C.leveldb_iter_seek_to_first_ext(it.it) -} - -func (it *Iterator) Last() { - it.isValid = C.leveldb_iter_seek_to_last_ext(it.it) -} - -func (it *Iterator) Seek(key []byte) { - it.isValid = C.leveldb_iter_seek_ext(it.it, (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.cc b/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.cc deleted file mode 100644 index 540b7397b01b..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.cc +++ /dev/null @@ -1,95 +0,0 @@ -// +build leveldb - -#include "leveldb_ext.h" - -#include -//#include - -//#include "leveldb/db.h" - -//using namespace leveldb; - -extern "C" { - -// static bool SaveError(char** errptr, const Status& s) { -// assert(errptr != NULL); -// if (s.ok()) { -// return false; -// } else if (*errptr == NULL) { -// *errptr = strdup(s.ToString().c_str()); -// } else { -// free(*errptr); -// *errptr = strdup(s.ToString().c_str()); -// } -// return true; -// } - -// void* leveldb_get_ext( -// leveldb_t* db, -// const leveldb_readoptions_t* options, -// const char* key, size_t keylen, -// char** valptr, -// size_t* vallen, -// char** errptr) { - -// std::string *tmp = new(std::string); - -// //very tricky, maybe changed with c++ leveldb upgrade -// Status s = (*(DB**)db)->Get(*(ReadOptions*)options, Slice(key, keylen), tmp); - -// if (s.ok()) { -// *valptr = (char*)tmp->data(); -// *vallen = tmp->size(); -// } else { -// delete(tmp); -// tmp = NULL; -// *valptr = NULL; -// *vallen = 0; -// if (!s.IsNotFound()) { -// SaveError(errptr, s); -// } -// } -// return tmp; -// } - -// void leveldb_get_free_ext(void* context) { -// std::string* s = (std::string*)context; - -// delete(s); -// } - - -unsigned char leveldb_iter_seek_to_first_ext(leveldb_iterator_t* iter) { - leveldb_iter_seek_to_first(iter); - return leveldb_iter_valid(iter); -} - -unsigned char leveldb_iter_seek_to_last_ext(leveldb_iterator_t* iter) { - leveldb_iter_seek_to_last(iter); - return leveldb_iter_valid(iter); -} - -unsigned char leveldb_iter_seek_ext(leveldb_iterator_t* iter, const char* k, size_t klen) { - leveldb_iter_seek(iter, k, klen); - return leveldb_iter_valid(iter); -} - -unsigned char leveldb_iter_next_ext(leveldb_iterator_t* iter) { - leveldb_iter_next(iter); - return leveldb_iter_valid(iter); -} - -unsigned char leveldb_iter_prev_ext(leveldb_iterator_t* iter) { - leveldb_iter_prev(iter); - return leveldb_iter_valid(iter); -} - -extern void leveldb_writebatch_iterate_put(void*, const char* k, size_t klen, const char* v, size_t vlen); -extern void leveldb_writebatch_iterate_delete(void*, const char* k, size_t 
klen); - -void leveldb_writebatch_iterate_ext(leveldb_writebatch_t* w, void *p) { - leveldb_writebatch_iterate(w, p, - leveldb_writebatch_iterate_put, leveldb_writebatch_iterate_delete); -} - -} \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.h b/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.h deleted file mode 100644 index 3eed41bdf9b0..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/leveldb_ext.h +++ /dev/null @@ -1,41 +0,0 @@ -// +build leveldb - -#ifndef LEVELDB_EXT_H -#define LEVELDB_EXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include "leveldb/c.h" - - -// /* Returns NULL if not found. Otherwise stores the value in **valptr. -// Stores the length of the value in *vallen. -// Returns a context must be later to free*/ -// extern void* leveldb_get_ext( -// leveldb_t* db, -// const leveldb_readoptions_t* options, -// const char* key, size_t keylen, -// char** valptr, -// size_t* vallen, -// char** errptr); - -// // Free context returns by leveldb_get_ext -// extern void leveldb_get_free_ext(void* context); - - -// Below iterator functions like leveldb iterator but returns valid status for iterator -extern unsigned char leveldb_iter_seek_to_first_ext(leveldb_iterator_t*); -extern unsigned char leveldb_iter_seek_to_last_ext(leveldb_iterator_t*); -extern unsigned char leveldb_iter_seek_ext(leveldb_iterator_t*, const char* k, size_t klen); -extern unsigned char leveldb_iter_next_ext(leveldb_iterator_t*); -extern unsigned char leveldb_iter_prev_ext(leveldb_iterator_t*); - -extern void leveldb_writebatch_iterate_ext(leveldb_writebatch_t*, void* p); - -#ifdef __cplusplus -} -#endif - -#endif \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/options.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/options.go deleted file mode 100644 index bd3ad1694b41..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/options.go +++ /dev/null @@ -1,122 +0,0 @@ -// +build leveldb - -package leveldb - -// #cgo LDFLAGS: -lleveldb -// #include "leveldb/c.h" -import "C" - -type CompressionOpt int - -const ( - NoCompression = CompressionOpt(0) - SnappyCompression = CompressionOpt(1) -) - -type Options struct { - Opt *C.leveldb_options_t -} - -type ReadOptions struct { - Opt *C.leveldb_readoptions_t -} - -type WriteOptions struct { - Opt *C.leveldb_writeoptions_t -} - -func NewOptions() *Options { - opt := C.leveldb_options_create() - return &Options{opt} -} - -func NewReadOptions() *ReadOptions { - opt := C.leveldb_readoptions_create() - return &ReadOptions{opt} -} - -func NewWriteOptions() *WriteOptions { - opt := C.leveldb_writeoptions_create() - return &WriteOptions{opt} -} - -func (o *Options) Close() { - C.leveldb_options_destroy(o.Opt) -} - -func (o *Options) SetComparator(cmp *C.leveldb_comparator_t) { - C.leveldb_options_set_comparator(o.Opt, cmp) -} - -func (o *Options) SetErrorIfExists(error_if_exists bool) { - eie := boolToUchar(error_if_exists) - C.leveldb_options_set_error_if_exists(o.Opt, eie) -} - -func (o *Options) SetCache(cache *Cache) { - C.leveldb_options_set_cache(o.Opt, cache.Cache) -} - -func (o *Options) SetWriteBufferSize(s int) { - C.leveldb_options_set_write_buffer_size(o.Opt, C.size_t(s)) -} - -func (o *Options) SetParanoidChecks(pc bool) { - C.leveldb_options_set_paranoid_checks(o.Opt, boolToUchar(pc)) -} - -func (o *Options) SetMaxOpenFiles(n int) { - C.leveldb_options_set_max_open_files(o.Opt, C.int(n)) -} - -func (o 
*Options) SetBlockSize(s int) { - C.leveldb_options_set_block_size(o.Opt, C.size_t(s)) -} - -func (o *Options) SetBlockRestartInterval(n int) { - C.leveldb_options_set_block_restart_interval(o.Opt, C.int(n)) -} - -func (o *Options) SetCompression(t CompressionOpt) { - C.leveldb_options_set_compression(o.Opt, C.int(t)) -} - -func (o *Options) SetCreateIfMissing(b bool) { - C.leveldb_options_set_create_if_missing(o.Opt, boolToUchar(b)) -} - -func (o *Options) SetFilterPolicy(fp *FilterPolicy) { - var policy *C.leveldb_filterpolicy_t - if fp != nil { - policy = fp.Policy - } - C.leveldb_options_set_filter_policy(o.Opt, policy) -} - -func (ro *ReadOptions) Close() { - C.leveldb_readoptions_destroy(ro.Opt) -} - -func (ro *ReadOptions) SetVerifyChecksums(b bool) { - C.leveldb_readoptions_set_verify_checksums(ro.Opt, boolToUchar(b)) -} - -func (ro *ReadOptions) SetFillCache(b bool) { - C.leveldb_readoptions_set_fill_cache(ro.Opt, boolToUchar(b)) -} - -func (ro *ReadOptions) SetSnapshot(snap *Snapshot) { - var s *C.leveldb_snapshot_t - if snap != nil { - s = snap.snap - } - C.leveldb_readoptions_set_snapshot(ro.Opt, s) -} - -func (wo *WriteOptions) Close() { - C.leveldb_writeoptions_destroy(wo.Opt) -} - -func (wo *WriteOptions) SetSync(b bool) { - C.leveldb_writeoptions_set_sync(wo.Opt, boolToUchar(b)) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/slice.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/slice.go deleted file mode 100644 index 83ebf55c0369..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/slice.go +++ /dev/null @@ -1,40 +0,0 @@ -// +build leveldb - -package leveldb - -// #cgo LDFLAGS: -lleveldb -// #include "leveldb/c.h" -import "C" - -import ( - "reflect" - "unsafe" -) - -type CSlice struct { - data unsafe.Pointer - size int -} - -func NewCSlice(p unsafe.Pointer, n int) *CSlice { - return &CSlice{p, n} -} - -func (s *CSlice) Data() []byte { - var value []byte - - sH := (*reflect.SliceHeader)(unsafe.Pointer(&value)) - sH.Cap = int(s.size) - sH.Len = int(s.size) - sH.Data = uintptr(s.data) - - return value -} - -func (s *CSlice) Size() int { - return int(s.size) -} - -func (s *CSlice) Free() { - C.leveldb_free(s.data) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/snapshot.go deleted file mode 100644 index bdc8d514da8f..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/snapshot.go +++ /dev/null @@ -1,39 +0,0 @@ -// +build leveldb - -package leveldb - -// #cgo LDFLAGS: -lleveldb -// #include "leveldb/c.h" -import "C" - -import ( - "github.com/siddontang/ledisdb/store/driver" -) - -type Snapshot struct { - db *DB - snap *C.leveldb_snapshot_t - readOpts *ReadOptions - iteratorOpts *ReadOptions -} - -func (s *Snapshot) Get(key []byte) ([]byte, error) { - return s.db.get(s.readOpts, key) -} - -func (s *Snapshot) GetSlice(key []byte) (driver.ISlice, error) { - return s.db.getSlice(s.readOpts, key) -} - -func (s *Snapshot) NewIterator() driver.IIterator { - it := new(Iterator) - it.it = C.leveldb_create_iterator(s.db.db, s.db.iteratorOpts.Opt) - return it - -} - -func (s *Snapshot) Close() { - C.leveldb_release_snapshot(s.db.db, s.snap) - s.iteratorOpts.Close() - s.readOpts.Close() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/leveldb/util.go b/vendor/github.com/siddontang/ledisdb/store/leveldb/util.go deleted file mode 100644 index 6efe33b661d6..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/leveldb/util.go +++ 
/dev/null @@ -1,45 +0,0 @@ -// +build leveldb - -package leveldb - -// #include "leveldb/c.h" -import "C" - -import ( - "fmt" - "reflect" - "unsafe" -) - -func boolToUchar(b bool) C.uchar { - uc := C.uchar(0) - if b { - uc = C.uchar(1) - } - return uc -} - -func ucharToBool(uc C.uchar) bool { - if uc == C.uchar(0) { - return false - } - return true -} - -func saveError(errStr *C.char) error { - if errStr != nil { - gs := C.GoString(errStr) - C.leveldb_free(unsafe.Pointer(errStr)) - return fmt.Errorf(gs) - } - return nil -} - -func slice(p unsafe.Pointer, n int) []byte { - var b []byte - pbyte := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - pbyte.Data = uintptr(p) - pbyte.Len = n - pbyte.Cap = n - return b -} diff --git a/vendor/github.com/siddontang/ledisdb/store/mdb/const.go b/vendor/github.com/siddontang/ledisdb/store/mdb/const.go deleted file mode 100644 index cdc70e05fd25..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/mdb/const.go +++ /dev/null @@ -1,3 +0,0 @@ -package mdb - -const DBName = "lmdb" diff --git a/vendor/github.com/siddontang/ledisdb/store/mdb/mdb.go b/vendor/github.com/siddontang/ledisdb/store/mdb/mdb.go deleted file mode 100644 index 6bf26f64308b..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/mdb/mdb.go +++ /dev/null @@ -1,316 +0,0 @@ -// +build !windows - -package mdb - -import ( - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/store/driver" - mdb "github.com/szferi/gomdb" - "os" -) - -type Store struct { -} - -func (s Store) String() string { - return DBName -} - -type MDB struct { - env *mdb.Env - db mdb.DBI - path string - cfg *config.Config -} - -func (s Store) Open(path string, c *config.Config) (driver.IDB, error) { - mapSize := c.LMDB.MapSize - noSync := c.LMDB.NoSync - - if mapSize <= 0 { - mapSize = 500 * 1024 * 1024 - } - - env, err := mdb.NewEnv() - if err != nil { - return MDB{}, err - } - - // TODO: max dbs should be configurable - if err := env.SetMaxDBs(1); err != nil { - return MDB{}, err - } - - if err := env.SetMapSize(uint64(mapSize)); err != nil { - return MDB{}, err - } - - if _, err := os.Stat(path); err != nil { - err = os.MkdirAll(path, 0755) - if err != nil { - return MDB{}, err - } - } - - var flags uint = mdb.CREATE - if noSync { - flags |= mdb.NOSYNC | mdb.NOMETASYNC | mdb.WRITEMAP | mdb.MAPASYNC - } - - err = env.Open(path, flags, 0755) - if err != nil { - return MDB{}, err - } - - tx, err := env.BeginTxn(nil, 0) - if err != nil { - return MDB{}, err - } - - dbi, err := tx.DBIOpen(nil, mdb.CREATE) - if err != nil { - return MDB{}, err - } - - if err := tx.Commit(); err != nil { - return MDB{}, err - } - - db := MDB{ - env: env, - db: dbi, - path: path, - } - - return db, nil -} - -func (s Store) Repair(path string, c *config.Config) error { - println("llmd not supports repair") - return nil -} - -func (db MDB) Put(key, value []byte) error { - itr := db.iterator(false) - defer itr.Close() - - itr.err = itr.c.Put(key, value, 0) - itr.setState() - return itr.err -} - -func (db MDB) BatchPut(writes []driver.Write) error { - itr := db.iterator(false) - defer itr.Close() - - for _, w := range writes { - if w.Value == nil { - itr.key, itr.value, itr.err = itr.c.Get(w.Key, nil, mdb.SET) - if itr.err == nil { - itr.err = itr.c.Del(0) - } - } else { - itr.err = itr.c.Put(w.Key, w.Value, 0) - } - - if itr.err != nil && itr.err != mdb.NotFound { - break - } - } - itr.setState() - - return itr.err -} - -func (db MDB) SyncBatchPut(writes []driver.Write) error { - if err := db.BatchPut(writes); err 
!= nil { - return err - } - - return db.env.Sync(1) -} - -func (db MDB) Get(key []byte) ([]byte, error) { - tx, err := db.env.BeginTxn(nil, mdb.RDONLY) - if err != nil { - return nil, err - } - defer tx.Commit() - - v, err := tx.Get(db.db, key) - if err == mdb.NotFound { - return nil, nil - } - return v, err -} - -func (db MDB) Delete(key []byte) error { - itr := db.iterator(false) - defer itr.Close() - - itr.key, itr.value, itr.err = itr.c.Get(key, nil, mdb.SET) - if itr.err == nil { - itr.err = itr.c.Del(0) - } - itr.setState() - return itr.Error() -} - -func (db MDB) SyncPut(key []byte, value []byte) error { - if err := db.Put(key, value); err != nil { - return err - } - - return db.env.Sync(1) -} - -func (db MDB) SyncDelete(key []byte) error { - if err := db.Delete(key); err != nil { - return err - } - - return db.env.Sync(1) -} - -type MDBIterator struct { - key []byte - value []byte - c *mdb.Cursor - tx *mdb.Txn - valid bool - err error - - closeAutoCommit bool -} - -func (itr *MDBIterator) Key() []byte { - return itr.key -} - -func (itr *MDBIterator) Value() []byte { - return itr.value -} - -func (itr *MDBIterator) Valid() bool { - return itr.valid -} - -func (itr *MDBIterator) Error() error { - return itr.err -} - -func (itr *MDBIterator) getCurrent() { - itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.GET_CURRENT) - itr.setState() -} - -func (itr *MDBIterator) Seek(key []byte) { - itr.key, itr.value, itr.err = itr.c.Get(key, nil, mdb.SET_RANGE) - itr.setState() -} -func (itr *MDBIterator) Next() { - itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.NEXT) - itr.setState() -} - -func (itr *MDBIterator) Prev() { - itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.PREV) - itr.setState() -} - -func (itr *MDBIterator) First() { - itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.FIRST) - itr.setState() -} - -func (itr *MDBIterator) Last() { - itr.key, itr.value, itr.err = itr.c.Get(nil, nil, mdb.LAST) - itr.setState() -} - -func (itr *MDBIterator) setState() { - if itr.err != nil { - if itr.err == mdb.NotFound { - itr.err = nil - } - itr.valid = false - } else { - itr.valid = true - } -} - -func (itr *MDBIterator) Close() error { - if err := itr.c.Close(); err != nil { - itr.tx.Abort() - return err - } - - if !itr.closeAutoCommit { - return itr.err - } - - if itr.err != nil { - itr.tx.Abort() - return itr.err - } - return itr.tx.Commit() -} - -func (_ MDB) Name() string { - return "lmdb" -} - -func (db MDB) Path() string { - return db.path -} - -func (db MDB) iterator(rdonly bool) *MDBIterator { - flags := uint(0) - if rdonly { - flags = mdb.RDONLY - } - tx, err := db.env.BeginTxn(nil, flags) - if err != nil { - return &MDBIterator{nil, nil, nil, nil, false, err, true} - } - - c, err := tx.CursorOpen(db.db) - if err != nil { - tx.Abort() - return &MDBIterator{nil, nil, nil, nil, false, err, true} - } - - return &MDBIterator{nil, nil, c, tx, true, nil, true} -} - -func (db MDB) Close() error { - db.env.DBIClose(db.db) - if err := db.env.Close(); err != nil { - panic(err) - } - return nil -} - -func (db MDB) NewIterator() driver.IIterator { - return db.iterator(true) -} - -func (db MDB) NewWriteBatch() driver.IWriteBatch { - return driver.NewWriteBatch(db) -} - -func (db MDB) Begin() (driver.Tx, error) { - return newTx(db) -} - -func (db MDB) NewSnapshot() (driver.ISnapshot, error) { - return newSnapshot(db) -} - -func (db MDB) Compact() error { - return nil -} - -func init() { - driver.Register(Store{}) -} diff --git 
a/vendor/github.com/siddontang/ledisdb/store/mdb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/mdb/snapshot.go deleted file mode 100644 index 270907d7117c..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/mdb/snapshot.go +++ /dev/null @@ -1,43 +0,0 @@ -// +build !windows - -package mdb - -import ( - "github.com/siddontang/ledisdb/store/driver" - mdb "github.com/szferi/gomdb" -) - -type Snapshot struct { - db mdb.DBI - tx *mdb.Txn -} - -func newSnapshot(db MDB) (*Snapshot, error) { - tx, err := db.env.BeginTxn(nil, mdb.RDONLY) - if err != nil { - return nil, err - } - - return &Snapshot{db.db, tx}, nil -} - -func (s *Snapshot) Get(key []byte) ([]byte, error) { - v, err := s.tx.Get(s.db, key) - if err == mdb.NotFound { - return nil, nil - } - return v, err -} - -func (s *Snapshot) NewIterator() driver.IIterator { - c, err := s.tx.CursorOpen(s.db) - if err != nil { - return &MDBIterator{nil, nil, nil, nil, false, err, false} - } - - return &MDBIterator{nil, nil, c, s.tx, true, nil, false} -} - -func (s *Snapshot) Close() { - s.tx.Commit() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/mdb/tx.go b/vendor/github.com/siddontang/ledisdb/store/mdb/tx.go deleted file mode 100644 index e2e3be0171be..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/mdb/tx.go +++ /dev/null @@ -1,90 +0,0 @@ -// +build !windows - -package mdb - -import ( - "github.com/siddontang/ledisdb/store/driver" - mdb "github.com/szferi/gomdb" -) - -type Tx struct { - db mdb.DBI - tx *mdb.Txn -} - -func newTx(db MDB) (*Tx, error) { - tx, err := db.env.BeginTxn(nil, uint(0)) - if err != nil { - return nil, err - } - - return &Tx{db.db, tx}, nil -} - -func (t *Tx) Get(key []byte) ([]byte, error) { - v, err := t.tx.Get(t.db, key) - if err == mdb.NotFound { - return nil, nil - } - return v, err -} - -func (t *Tx) Put(key []byte, value []byte) error { - return t.tx.Put(t.db, key, value, mdb.NODUPDATA) -} - -func (t *Tx) Delete(key []byte) error { - return t.tx.Del(t.db, key, nil) -} - -func (t *Tx) NewIterator() driver.IIterator { - return t.newIterator() -} - -func (t *Tx) newIterator() *MDBIterator { - c, err := t.tx.CursorOpen(t.db) - if err != nil { - return &MDBIterator{nil, nil, nil, nil, false, err, false} - } - - return &MDBIterator{nil, nil, c, t.tx, true, nil, false} -} - -func (t *Tx) NewWriteBatch() driver.IWriteBatch { - return driver.NewWriteBatch(t) -} - -func (t *Tx) BatchPut(writes []driver.Write) error { - itr := t.newIterator() - - for _, w := range writes { - if w.Value == nil { - itr.key, itr.value, itr.err = itr.c.Get(w.Key, nil, mdb.SET) - if itr.err == nil { - itr.err = itr.c.Del(0) - } - } else { - itr.err = itr.c.Put(w.Key, w.Value, 0) - } - - if itr.err != nil && itr.err != mdb.NotFound { - break - } - } - itr.setState() - - return itr.Close() -} - -func (t *Tx) SyncBatchPut(writes []driver.Write) error { - return t.BatchPut(writes) -} - -func (t *Tx) Rollback() error { - t.tx.Abort() - return nil -} - -func (t *Tx) Commit() error { - return t.tx.Commit() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/batch.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/batch.go deleted file mode 100644 index bb727e7015ef..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/batch.go +++ /dev/null @@ -1,83 +0,0 @@ -// +build rocksdb - -package rocksdb - -// #cgo LDFLAGS: -lrocksdb -// #include "rocksdb/c.h" -// #include "rocksdb_ext.h" -import "C" - -import ( - "unsafe" -) - -type WriteBatch struct { - db *DB - wbatch 
*C.rocksdb_writebatch_t - commitOk bool -} - -func (w *WriteBatch) Close() { - if w.wbatch != nil { - C.rocksdb_writebatch_destroy(w.wbatch) - w.wbatch = nil - } -} - -func (w *WriteBatch) Put(key, value []byte) { - w.commitOk = false - - var k, v *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - if len(value) != 0 { - v = (*C.char)(unsafe.Pointer(&value[0])) - } - - lenk := len(key) - lenv := len(value) - - C.rocksdb_writebatch_put(w.wbatch, k, C.size_t(lenk), v, C.size_t(lenv)) -} - -func (w *WriteBatch) Delete(key []byte) { - w.commitOk = false - - C.rocksdb_writebatch_delete(w.wbatch, - (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))) -} - -func (w *WriteBatch) Commit() error { - return w.commit(w.db.writeOpts) -} - -func (w *WriteBatch) SyncCommit() error { - return w.commit(w.db.syncOpts) -} - -func (w *WriteBatch) Rollback() error { - if !w.commitOk { - C.rocksdb_writebatch_clear(w.wbatch) - } - return nil -} - -func (w *WriteBatch) commit(wb *WriteOptions) error { - w.commitOk = true - - var errStr *C.char - C.rocksdb_write_ext(w.db.db, wb.Opt, w.wbatch, &errStr) - if errStr != nil { - w.commitOk = false - return saveError(errStr) - } - return nil -} - -func (w *WriteBatch) Data() []byte { - var vallen C.size_t - value := C.rocksdb_writebatch_data(w.wbatch, &vallen) - - return slice(unsafe.Pointer(value), int(vallen)) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/cache.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/cache.go deleted file mode 100644 index 931998ba4af4..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/cache.go +++ /dev/null @@ -1,20 +0,0 @@ -// +build rocksdb - -package rocksdb - -// #cgo LDFLAGS: -lrocksdb -// #include -// #include "rocksdb/c.h" -import "C" - -type Cache struct { - Cache *C.rocksdb_cache_t -} - -func NewLRUCache(capacity int) *Cache { - return &Cache{C.rocksdb_cache_create_lru(C.size_t(capacity))} -} - -func (c *Cache) Close() { - C.rocksdb_cache_destroy(c.Cache) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/const.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/const.go deleted file mode 100644 index f4155bbe2018..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/const.go +++ /dev/null @@ -1,3 +0,0 @@ -package rocksdb - -const DBName = "rocksdb" diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/db.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/db.go deleted file mode 100644 index 952121b8b365..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/db.go +++ /dev/null @@ -1,346 +0,0 @@ -// +build rocksdb - -// Package rocksdb is a wrapper for c++ rocksdb -package rocksdb - -/* -#cgo LDFLAGS: -lrocksdb -#include -#include -#include "rocksdb_ext.h" -*/ -import "C" - -import ( - "github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/store/driver" - "os" - "runtime" - "unsafe" -) - -const defaultFilterBits int = 10 - -type Store struct { -} - -func (s Store) String() string { - return DBName -} - -func (s Store) Open(path string, cfg *config.Config) (driver.IDB, error) { - if err := os.MkdirAll(path, 0755); err != nil { - return nil, err - } - - db := new(DB) - db.path = path - db.cfg = &cfg.RocksDB - - if err := db.open(); err != nil { - return nil, err - } - - return db, nil -} - -func (s Store) Repair(path string, cfg *config.Config) error { - db := new(DB) - db.path = path - db.cfg = &cfg.RocksDB - - err := db.open() - defer db.Close() - - //open ok, do not need 
repair - if err == nil { - return nil - } - - var errStr *C.char - ldbname := C.CString(path) - defer C.free(unsafe.Pointer(ldbname)) - - C.rocksdb_repair_db(db.opts.Opt, ldbname, &errStr) - if errStr != nil { - return saveError(errStr) - } - return nil -} - -type DB struct { - path string - - cfg *config.RocksDBConfig - - db *C.rocksdb_t - - env *Env - - opts *Options - blockOpts *BlockBasedTableOptions - - //for default read and write options - readOpts *ReadOptions - writeOpts *WriteOptions - iteratorOpts *ReadOptions - - syncOpts *WriteOptions - - cache *Cache - - filter *FilterPolicy -} - -func (db *DB) open() error { - db.initOptions(db.cfg) - - var errStr *C.char - ldbname := C.CString(db.path) - defer C.free(unsafe.Pointer(ldbname)) - - db.db = C.rocksdb_open(db.opts.Opt, ldbname, &errStr) - if errStr != nil { - db.db = nil - return saveError(errStr) - } - return nil -} - -func (db *DB) initOptions(cfg *config.RocksDBConfig) { - opts := NewOptions() - blockOpts := NewBlockBasedTableOptions() - - opts.SetCreateIfMissing(true) - - db.env = NewDefaultEnv() - db.env.SetBackgroundThreads(cfg.BackgroundThreads) - db.env.SetHighPriorityBackgroundThreads(cfg.HighPriorityBackgroundThreads) - opts.SetEnv(db.env) - - db.cache = NewLRUCache(cfg.CacheSize) - blockOpts.SetCache(db.cache) - - //we must use bloomfilter - db.filter = NewBloomFilter(defaultFilterBits) - blockOpts.SetFilterPolicy(db.filter) - blockOpts.SetBlockSize(cfg.BlockSize) - opts.SetBlockBasedTableFactory(blockOpts) - - opts.SetCompression(CompressionOpt(cfg.Compression)) - opts.SetWriteBufferSize(cfg.WriteBufferSize) - opts.SetMaxOpenFiles(cfg.MaxOpenFiles) - opts.SetMaxBackgroundCompactions(cfg.MaxBackgroundCompactions) - opts.SetMaxBackgroundFlushes(cfg.MaxBackgroundFlushes) - opts.SetLevel0FileNumCompactionTrigger(cfg.Level0FileNumCompactionTrigger) - opts.SetLevel0SlowdownWritesTrigger(cfg.Level0SlowdownWritesTrigger) - opts.SetLevel0StopWritesTrigger(cfg.Level0StopWritesTrigger) - opts.SetTargetFileSizeBase(cfg.TargetFileSizeBase) - opts.SetTargetFileSizeMultiplier(cfg.TargetFileSizeMultiplier) - opts.SetMaxBytesForLevelBase(cfg.MaxBytesForLevelBase) - opts.SetMaxBytesForLevelMultiplier(cfg.MaxBytesForLevelMultiplier) - opts.DisableDataSync(cfg.DisableDataSync) - opts.SetMinWriteBufferNumberToMerge(cfg.MinWriteBufferNumberToMerge) - opts.DisableAutoCompactions(cfg.DisableAutoCompactions) - opts.EnableStatistics(cfg.EnableStatistics) - opts.UseFsync(cfg.UseFsync) - opts.AllowOsBuffer(cfg.AllowOsBuffer) - opts.SetStatsDumpPeriodSec(cfg.StatsDumpPeriodSec) - - db.opts = opts - db.blockOpts = blockOpts - - db.readOpts = NewReadOptions() - db.writeOpts = NewWriteOptions() - db.writeOpts.DisableWAL(cfg.DisableWAL) - - db.syncOpts = NewWriteOptions() - db.syncOpts.SetSync(true) - db.syncOpts.DisableWAL(cfg.DisableWAL) - - db.iteratorOpts = NewReadOptions() - db.iteratorOpts.SetFillCache(false) -} - -func (db *DB) Close() error { - if db.db != nil { - C.rocksdb_close(db.db) - db.db = nil - } - - if db.filter != nil { - db.filter.Close() - } - - if db.cache != nil { - db.cache.Close() - } - - if db.env != nil { - db.env.Close() - } - - //db.blockOpts.Close() - - db.opts.Close() - - db.readOpts.Close() - db.writeOpts.Close() - db.iteratorOpts.Close() - - return nil -} - -func (db *DB) Put(key, value []byte) error { - return db.put(db.writeOpts, key, value) -} - -func (db *DB) Get(key []byte) ([]byte, error) { - return db.get(db.readOpts, key) -} - -func (db *DB) Delete(key []byte) error { - return db.delete(db.writeOpts, key) -} - 
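// Editor's note, not part of the deleted vendor file: the goleveldb, leveldb,
// lmdb and rocksdb backends removed in these hunks all expose the same method
// set to ledisdb's store layer. The interfaces below are a sketch reconstructed
// only from the methods visible in this patch, as an orientation aid; the
// authoritative definitions live in ledisdb/store/driver and include further
// members (write batches, snapshots, transactions) omitted here.
package driversketch

// IIterator mirrors the iterator methods implemented by each backend above.
type IIterator interface {
	Key() []byte
	Value() []byte
	Close() error
	Valid() bool
	Next()
	Prev()
	First()
	Last()
	Seek(key []byte)
}

// IDB is the common key-value surface each backend's DB type implements.
type IDB interface {
	Close() error
	Get(key []byte) ([]byte, error)
	Put(key []byte, value []byte) error
	Delete(key []byte) error
	SyncPut(key []byte, value []byte) error
	SyncDelete(key []byte) error
	NewIterator() IIterator
	Compact() error
}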
-func (db *DB) SyncPut(key []byte, value []byte) error { - return db.put(db.syncOpts, key, value) -} - -func (db *DB) SyncDelete(key []byte) error { - return db.delete(db.syncOpts, key) -} - -func (db *DB) NewWriteBatch() driver.IWriteBatch { - wb := &WriteBatch{ - db: db, - wbatch: C.rocksdb_writebatch_create(), - } - - runtime.SetFinalizer(wb, func(w *WriteBatch) { - w.Close() - }) - - return wb -} - -func (db *DB) NewIterator() driver.IIterator { - it := new(Iterator) - - it.it = C.rocksdb_create_iterator(db.db, db.iteratorOpts.Opt) - - return it -} - -func (db *DB) NewSnapshot() (driver.ISnapshot, error) { - snap := &Snapshot{ - db: db, - snap: C.rocksdb_create_snapshot(db.db), - readOpts: NewReadOptions(), - iteratorOpts: NewReadOptions(), - } - snap.readOpts.SetSnapshot(snap) - snap.iteratorOpts.SetSnapshot(snap) - snap.iteratorOpts.SetFillCache(false) - - return snap, nil -} - -func (db *DB) put(wo *WriteOptions, key, value []byte) error { - var errStr *C.char - var k, v *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - if len(value) != 0 { - v = (*C.char)(unsafe.Pointer(&value[0])) - } - - lenk := len(key) - lenv := len(value) - C.rocksdb_put( - db.db, wo.Opt, k, C.size_t(lenk), v, C.size_t(lenv), &errStr) - - if errStr != nil { - return saveError(errStr) - } - return nil -} - -func (db *DB) get(ro *ReadOptions, key []byte) ([]byte, error) { - var errStr *C.char - var vallen C.size_t - var k *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - - value := C.rocksdb_get( - db.db, ro.Opt, k, C.size_t(len(key)), &vallen, &errStr) - - if errStr != nil { - return nil, saveError(errStr) - } - - if value == nil { - return nil, nil - } - - defer C.free(unsafe.Pointer(value)) - return C.GoBytes(unsafe.Pointer(value), C.int(vallen)), nil -} - -func (db *DB) getSlice(ro *ReadOptions, key []byte) (driver.ISlice, error) { - var errStr *C.char - var vallen C.size_t - var k *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - - value := C.rocksdb_get( - db.db, ro.Opt, k, C.size_t(len(key)), &vallen, &errStr) - - if errStr != nil { - return nil, saveError(errStr) - } - - if value == nil { - return nil, nil - } - - return NewCSlice(unsafe.Pointer(value), int(vallen)), nil -} - -func (db *DB) delete(wo *WriteOptions, key []byte) error { - var errStr *C.char - var k *C.char - if len(key) != 0 { - k = (*C.char)(unsafe.Pointer(&key[0])) - } - - C.rocksdb_delete( - db.db, wo.Opt, k, C.size_t(len(key)), &errStr) - - if errStr != nil { - return saveError(errStr) - } - return nil -} - -func (db *DB) Begin() (driver.Tx, error) { - return nil, driver.ErrTxSupport -} - -func (db *DB) Compact() error { - C.rocksdb_compact_range(db.db, nil, 0, nil, 0) - return nil -} - -func (db *DB) GetSlice(key []byte) (driver.ISlice, error) { - return db.getSlice(db.readOpts, key) -} - -func init() { - driver.Register(Store{}) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/env.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/env.go deleted file mode 100644 index e239c1b6c0e8..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/env.go +++ /dev/null @@ -1,27 +0,0 @@ -// +build rocksdb - -package rocksdb - -// #cgo LDFLAGS: -lrocksdb -// #include "rocksdb/c.h" -import "C" - -type Env struct { - Env *C.rocksdb_env_t -} - -func NewDefaultEnv() *Env { - return &Env{C.rocksdb_create_default_env()} -} - -func (env *Env) SetHighPriorityBackgroundThreads(n int) { - 
C.rocksdb_env_set_high_priority_background_threads(env.Env, C.int(n)) -} - -func (env *Env) SetBackgroundThreads(n int) { - C.rocksdb_env_set_background_threads(env.Env, C.int(n)) -} - -func (env *Env) Close() { - C.rocksdb_env_destroy(env.Env) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/filterpolicy.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/filterpolicy.go deleted file mode 100644 index 3be4ef6acac7..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/filterpolicy.go +++ /dev/null @@ -1,21 +0,0 @@ -// +build rocksdb - -package rocksdb - -// #cgo LDFLAGS: -lrocksdb -// #include -// #include "rocksdb/c.h" -import "C" - -type FilterPolicy struct { - Policy *C.rocksdb_filterpolicy_t -} - -func NewBloomFilter(bitsPerKey int) *FilterPolicy { - policy := C.rocksdb_filterpolicy_create_bloom(C.int(bitsPerKey)) - return &FilterPolicy{policy} -} - -func (fp *FilterPolicy) Close() { - C.rocksdb_filterpolicy_destroy(fp.Policy) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/iterator.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/iterator.go deleted file mode 100644 index 046c5e9dfaa4..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/iterator.go +++ /dev/null @@ -1,70 +0,0 @@ -// +build rocksdb - -package rocksdb - -// #cgo LDFLAGS: -lrocksdb -// #include -// #include "rocksdb/c.h" -// #include "rocksdb_ext.h" -import "C" - -import ( - "unsafe" -) - -type Iterator struct { - it *C.rocksdb_iterator_t - isValid C.uchar -} - -func (it *Iterator) Key() []byte { - var klen C.size_t - kdata := C.rocksdb_iter_key(it.it, &klen) - if kdata == nil { - return nil - } - - return slice(unsafe.Pointer(kdata), int(C.int(klen))) -} - -func (it *Iterator) Value() []byte { - var vlen C.size_t - vdata := C.rocksdb_iter_value(it.it, &vlen) - if vdata == nil { - return nil - } - - return slice(unsafe.Pointer(vdata), int(C.int(vlen))) -} - -func (it *Iterator) Close() error { - if it.it != nil { - C.rocksdb_iter_destroy(it.it) - it.it = nil - } - return nil -} - -func (it *Iterator) Valid() bool { - return ucharToBool(it.isValid) -} - -func (it *Iterator) Next() { - it.isValid = C.rocksdb_iter_next_ext(it.it) -} - -func (it *Iterator) Prev() { - it.isValid = C.rocksdb_iter_prev_ext(it.it) -} - -func (it *Iterator) First() { - it.isValid = C.rocksdb_iter_seek_to_first_ext(it.it) -} - -func (it *Iterator) Last() { - it.isValid = C.rocksdb_iter_seek_to_last_ext(it.it) -} - -func (it *Iterator) Seek(key []byte) { - it.isValid = C.rocksdb_iter_seek_ext(it.it, (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/options.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/options.go deleted file mode 100644 index 27836790fda4..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/options.go +++ /dev/null @@ -1,233 +0,0 @@ -// +build rocksdb - -package rocksdb - -// #cgo LDFLAGS: -lrocksdb -// #include "rocksdb/c.h" -import "C" - -type CompressionOpt int - -const ( - NoCompression = CompressionOpt(0) - SnappyCompression = CompressionOpt(1) - ZlibCompression = CompressionOpt(2) - Bz2Compression = CompressionOpt(3) - Lz4Compression = CompressionOpt(4) - Lz4hcCompression = CompressionOpt(5) -) - -type Options struct { - Opt *C.rocksdb_options_t -} - -type ReadOptions struct { - Opt *C.rocksdb_readoptions_t -} - -type WriteOptions struct { - Opt *C.rocksdb_writeoptions_t -} - -type BlockBasedTableOptions struct { - Opt 
*C.rocksdb_block_based_table_options_t -} - -func NewOptions() *Options { - opt := C.rocksdb_options_create() - return &Options{opt} -} - -func NewReadOptions() *ReadOptions { - opt := C.rocksdb_readoptions_create() - return &ReadOptions{opt} -} - -func NewWriteOptions() *WriteOptions { - opt := C.rocksdb_writeoptions_create() - return &WriteOptions{opt} -} - -func NewBlockBasedTableOptions() *BlockBasedTableOptions { - opt := C.rocksdb_block_based_options_create() - return &BlockBasedTableOptions{opt} -} - -func (o *Options) Close() { - C.rocksdb_options_destroy(o.Opt) -} - -func (o *Options) IncreaseParallelism(n int) { - C.rocksdb_options_increase_parallelism(o.Opt, C.int(n)) -} - -func (o *Options) OptimizeLevelStyleCompaction(n int) { - C.rocksdb_options_optimize_level_style_compaction(o.Opt, C.uint64_t(n)) -} - -func (o *Options) SetComparator(cmp *C.rocksdb_comparator_t) { - C.rocksdb_options_set_comparator(o.Opt, cmp) -} - -func (o *Options) SetErrorIfExists(error_if_exists bool) { - eie := boolToUchar(error_if_exists) - C.rocksdb_options_set_error_if_exists(o.Opt, eie) -} - -func (o *Options) SetEnv(env *Env) { - C.rocksdb_options_set_env(o.Opt, env.Env) -} - -func (o *Options) SetWriteBufferSize(s int) { - C.rocksdb_options_set_write_buffer_size(o.Opt, C.size_t(s)) -} - -func (o *Options) SetParanoidChecks(pc bool) { - C.rocksdb_options_set_paranoid_checks(o.Opt, boolToUchar(pc)) -} - -func (o *Options) SetMaxOpenFiles(n int) { - C.rocksdb_options_set_max_open_files(o.Opt, C.int(n)) -} - -func (o *Options) SetCompression(t CompressionOpt) { - C.rocksdb_options_set_compression(o.Opt, C.int(t)) -} - -func (o *Options) SetCreateIfMissing(b bool) { - C.rocksdb_options_set_create_if_missing(o.Opt, boolToUchar(b)) -} - -func (o *Options) SetMaxWriteBufferNumber(n int) { - C.rocksdb_options_set_max_write_buffer_number(o.Opt, C.int(n)) -} - -func (o *Options) SetMaxBackgroundCompactions(n int) { - C.rocksdb_options_set_max_background_compactions(o.Opt, C.int(n)) -} - -func (o *Options) SetMaxBackgroundFlushes(n int) { - C.rocksdb_options_set_max_background_flushes(o.Opt, C.int(n)) -} - -func (o *Options) SetNumLevels(n int) { - C.rocksdb_options_set_num_levels(o.Opt, C.int(n)) -} - -func (o *Options) SetLevel0FileNumCompactionTrigger(n int) { - C.rocksdb_options_set_level0_file_num_compaction_trigger(o.Opt, C.int(n)) -} - -func (o *Options) SetLevel0SlowdownWritesTrigger(n int) { - C.rocksdb_options_set_level0_slowdown_writes_trigger(o.Opt, C.int(n)) -} - -func (o *Options) SetLevel0StopWritesTrigger(n int) { - C.rocksdb_options_set_level0_stop_writes_trigger(o.Opt, C.int(n)) -} - -func (o *Options) SetTargetFileSizeBase(n int) { - C.rocksdb_options_set_target_file_size_base(o.Opt, C.uint64_t(uint64(n))) -} - -func (o *Options) SetTargetFileSizeMultiplier(n int) { - C.rocksdb_options_set_target_file_size_multiplier(o.Opt, C.int(n)) -} - -func (o *Options) SetMaxBytesForLevelBase(n int) { - C.rocksdb_options_set_max_bytes_for_level_base(o.Opt, C.uint64_t(uint64(n))) -} - -func (o *Options) SetMaxBytesForLevelMultiplier(n int) { - C.rocksdb_options_set_max_bytes_for_level_multiplier(o.Opt, C.int(n)) -} - -func (o *Options) SetBlockBasedTableFactory(opt *BlockBasedTableOptions) { - C.rocksdb_options_set_block_based_table_factory(o.Opt, opt.Opt) -} - -func (o *Options) SetMinWriteBufferNumberToMerge(n int) { - C.rocksdb_options_set_min_write_buffer_number_to_merge(o.Opt, C.int(n)) -} - -func (o *Options) DisableDataSync(b bool) { - C.rocksdb_options_set_disable_data_sync(o.Opt, boolToInt(b)) 
-} - -func (o *Options) DisableAutoCompactions(b bool) { - C.rocksdb_options_set_disable_auto_compactions(o.Opt, boolToInt(b)) -} - -func (o *Options) UseFsync(b bool) { - C.rocksdb_options_set_use_fsync(o.Opt, boolToInt(b)) -} - -func (o *Options) AllowOsBuffer(b bool) { - C.rocksdb_options_set_allow_os_buffer(o.Opt, boolToUchar(b)) -} - -func (o *Options) EnableStatistics(b bool) { - if b { - C.rocksdb_options_enable_statistics(o.Opt) - } -} - -func (o *Options) SetStatsDumpPeriodSec(n int) { - C.rocksdb_options_set_stats_dump_period_sec(o.Opt, C.uint(n)) -} - -func (o *BlockBasedTableOptions) Close() { - C.rocksdb_block_based_options_destroy(o.Opt) -} - -func (o *BlockBasedTableOptions) SetFilterPolicy(fp *FilterPolicy) { - var policy *C.rocksdb_filterpolicy_t - if fp != nil { - policy = fp.Policy - } - C.rocksdb_block_based_options_set_filter_policy(o.Opt, policy) -} - -func (o *BlockBasedTableOptions) SetBlockSize(s int) { - C.rocksdb_block_based_options_set_block_size(o.Opt, C.size_t(s)) -} - -func (o *BlockBasedTableOptions) SetBlockRestartInterval(n int) { - C.rocksdb_block_based_options_set_block_restart_interval(o.Opt, C.int(n)) -} - -func (o *BlockBasedTableOptions) SetCache(cache *Cache) { - C.rocksdb_block_based_options_set_block_cache(o.Opt, cache.Cache) -} - -func (ro *ReadOptions) Close() { - C.rocksdb_readoptions_destroy(ro.Opt) -} - -func (ro *ReadOptions) SetVerifyChecksums(b bool) { - C.rocksdb_readoptions_set_verify_checksums(ro.Opt, boolToUchar(b)) -} - -func (ro *ReadOptions) SetFillCache(b bool) { - C.rocksdb_readoptions_set_fill_cache(ro.Opt, boolToUchar(b)) -} - -func (ro *ReadOptions) SetSnapshot(snap *Snapshot) { - var s *C.rocksdb_snapshot_t - if snap != nil { - s = snap.snap - } - C.rocksdb_readoptions_set_snapshot(ro.Opt, s) -} - -func (wo *WriteOptions) Close() { - C.rocksdb_writeoptions_destroy(wo.Opt) -} - -func (wo *WriteOptions) SetSync(b bool) { - C.rocksdb_writeoptions_set_sync(wo.Opt, boolToUchar(b)) -} - -func (wo *WriteOptions) DisableWAL(b bool) { - C.rocksdb_writeoptions_disable_WAL(wo.Opt, boolToInt(b)) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.cc b/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.cc deleted file mode 100644 index 39036ab96cc2..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.cc +++ /dev/null @@ -1,44 +0,0 @@ -// +build rocksdb - -#include "rocksdb_ext.h" - -#include -#include - -extern "C" { - -unsigned char rocksdb_iter_seek_to_first_ext(rocksdb_iterator_t* iter) { - rocksdb_iter_seek_to_first(iter); - return rocksdb_iter_valid(iter); -} - -unsigned char rocksdb_iter_seek_to_last_ext(rocksdb_iterator_t* iter) { - rocksdb_iter_seek_to_last(iter); - return rocksdb_iter_valid(iter); -} - -unsigned char rocksdb_iter_seek_ext(rocksdb_iterator_t* iter, const char* k, size_t klen) { - rocksdb_iter_seek(iter, k, klen); - return rocksdb_iter_valid(iter); -} - -unsigned char rocksdb_iter_next_ext(rocksdb_iterator_t* iter) { - rocksdb_iter_next(iter); - return rocksdb_iter_valid(iter); -} - -unsigned char rocksdb_iter_prev_ext(rocksdb_iterator_t* iter) { - rocksdb_iter_prev(iter); - return rocksdb_iter_valid(iter); -} - -void rocksdb_write_ext(rocksdb_t* db, - const rocksdb_writeoptions_t* options, - rocksdb_writebatch_t* batch, char** errptr) { - rocksdb_write(db, options, batch, errptr); - if(*errptr == NULL) { - rocksdb_writebatch_clear(batch); - } -} - -} \ No newline at end of file diff --git 
a/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.h b/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.h deleted file mode 100644 index 11cb65304105..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/rocksdb_ext.h +++ /dev/null @@ -1,24 +0,0 @@ -// +build rocksdb - -#ifndef ROCKSDB_EXT_H -#define ROCKSDB_EXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include "rocksdb/c.h" - -// Below iterator functions like rocksdb iterator but returns valid status for iterator -extern unsigned char rocksdb_iter_seek_to_first_ext(rocksdb_iterator_t*); -extern unsigned char rocksdb_iter_seek_to_last_ext(rocksdb_iterator_t*); -extern unsigned char rocksdb_iter_seek_ext(rocksdb_iterator_t*, const char* k, size_t klen); -extern unsigned char rocksdb_iter_next_ext(rocksdb_iterator_t*); -extern unsigned char rocksdb_iter_prev_ext(rocksdb_iterator_t*); -extern void rocksdb_write_ext(rocksdb_t* db, const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, char** errptr); - -#ifdef __cplusplus -} -#endif - -#endif \ No newline at end of file diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/slice.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/slice.go deleted file mode 100644 index bbaa65bd7652..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/slice.go +++ /dev/null @@ -1,41 +0,0 @@ -//+build rocksdb - -package rocksdb - -// #cgo LDFLAGS: -lrocksdb -// #include -// #include -import "C" - -import ( - "reflect" - "unsafe" -) - -type CSlice struct { - data unsafe.Pointer - size int -} - -func NewCSlice(p unsafe.Pointer, n int) *CSlice { - return &CSlice{p, n} -} - -func (s *CSlice) Data() []byte { - var value []byte - - sH := (*reflect.SliceHeader)(unsafe.Pointer(&value)) - sH.Cap = int(s.size) - sH.Len = int(s.size) - sH.Data = uintptr(s.data) - - return value -} - -func (s *CSlice) Size() int { - return int(s.size) -} - -func (s *CSlice) Free() { - C.free(s.data) -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/snapshot.go deleted file mode 100644 index 1ced60020027..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/snapshot.go +++ /dev/null @@ -1,39 +0,0 @@ -// +build rocksdb - -package rocksdb - -// #cgo LDFLAGS: -lrocksdb -// #include "rocksdb/c.h" -import "C" - -import ( - "github.com/siddontang/ledisdb/store/driver" -) - -type Snapshot struct { - db *DB - snap *C.rocksdb_snapshot_t - readOpts *ReadOptions - iteratorOpts *ReadOptions -} - -func (s *Snapshot) Get(key []byte) ([]byte, error) { - return s.db.get(s.readOpts, key) -} - -func (s *Snapshot) GetSlice(key []byte) (driver.ISlice, error) { - return s.db.getSlice(s.readOpts, key) -} - -func (s *Snapshot) NewIterator() driver.IIterator { - it := new(Iterator) - it.it = C.rocksdb_create_iterator(s.db.db, s.db.iteratorOpts.Opt) - return it - -} - -func (s *Snapshot) Close() { - C.rocksdb_release_snapshot(s.db.db, s.snap) - s.iteratorOpts.Close() - s.readOpts.Close() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/rocksdb/util.go b/vendor/github.com/siddontang/ledisdb/store/rocksdb/util.go deleted file mode 100644 index 22b73baf4ad4..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/rocksdb/util.go +++ /dev/null @@ -1,54 +0,0 @@ -// +build rocksdb - -package rocksdb - -// #include -// #include "rocksdb/c.h" -import "C" - -import ( - "fmt" - "reflect" - "unsafe" -) - -func boolToUchar(b bool) C.uchar { - uc := C.uchar(0) - if b { 
- uc = C.uchar(1) - } - return uc -} - -func ucharToBool(uc C.uchar) bool { - if uc == C.uchar(0) { - return false - } - return true -} - -func boolToInt(b bool) C.int { - uc := C.int(0) - if b { - uc = C.int(1) - } - return uc -} - -func saveError(errStr *C.char) error { - if errStr != nil { - gs := C.GoString(errStr) - C.free(unsafe.Pointer(errStr)) - return fmt.Errorf(gs) - } - return nil -} - -func slice(p unsafe.Pointer, n int) []byte { - var b []byte - pbyte := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - pbyte.Data = uintptr(p) - pbyte.Len = n - pbyte.Cap = n - return b -} diff --git a/vendor/github.com/siddontang/ledisdb/store/slice.go b/vendor/github.com/siddontang/ledisdb/store/slice.go deleted file mode 100644 index b027f4f28b8e..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/slice.go +++ /dev/null @@ -1,9 +0,0 @@ -package store - -import ( - "github.com/siddontang/ledisdb/store/driver" -) - -type Slice interface { - driver.ISlice -} diff --git a/vendor/github.com/siddontang/ledisdb/store/snapshot.go b/vendor/github.com/siddontang/ledisdb/store/snapshot.go deleted file mode 100644 index a1c9de9944e6..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/snapshot.go +++ /dev/null @@ -1,48 +0,0 @@ -package store - -import ( - "github.com/siddontang/ledisdb/store/driver" -) - -type Snapshot struct { - driver.ISnapshot - st *Stat -} - -func (s *Snapshot) NewIterator() *Iterator { - it := new(Iterator) - it.it = s.ISnapshot.NewIterator() - it.st = s.st - - s.st.IterNum.Add(1) - - return it -} - -func (s *Snapshot) Get(key []byte) ([]byte, error) { - v, err := s.ISnapshot.Get(key) - s.st.statGet(v, err) - return v, err -} - -func (s *Snapshot) GetSlice(key []byte) (Slice, error) { - if d, ok := s.ISnapshot.(driver.ISliceGeter); ok { - v, err := d.GetSlice(key) - s.st.statGet(v, err) - return v, err - } else { - v, err := s.Get(key) - if err != nil { - return nil, err - } else if v == nil { - return nil, nil - } else { - return driver.GoSlice(v), nil - } - } -} - -func (s *Snapshot) Close() { - s.st.SnapshotCloseNum.Add(1) - s.ISnapshot.Close() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/stat.go b/vendor/github.com/siddontang/ledisdb/store/stat.go deleted file mode 100644 index e0a035ab8e18..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/stat.go +++ /dev/null @@ -1,37 +0,0 @@ -package store - -import ( - "github.com/siddontang/go/sync2" -) - -type Stat struct { - GetNum sync2.AtomicInt64 - GetMissingNum sync2.AtomicInt64 - GetTotalTime sync2.AtomicDuration - PutNum sync2.AtomicInt64 - DeleteNum sync2.AtomicInt64 - IterNum sync2.AtomicInt64 - IterSeekNum sync2.AtomicInt64 - IterCloseNum sync2.AtomicInt64 - SnapshotNum sync2.AtomicInt64 - SnapshotCloseNum sync2.AtomicInt64 - BatchNum sync2.AtomicInt64 - BatchCommitNum sync2.AtomicInt64 - BatchCommitTotalTime sync2.AtomicDuration - TxNum sync2.AtomicInt64 - TxCommitNum sync2.AtomicInt64 - TxCloseNum sync2.AtomicInt64 - CompactNum sync2.AtomicInt64 - CompactTotalTime sync2.AtomicDuration -} - -func (st *Stat) statGet(v interface{}, err error) { - st.GetNum.Add(1) - if v == nil && err == nil { - st.GetMissingNum.Add(1) - } -} - -func (st *Stat) Reset() { - *st = Stat{} -} diff --git a/vendor/github.com/siddontang/ledisdb/store/store.go b/vendor/github.com/siddontang/ledisdb/store/store.go deleted file mode 100644 index d1be9b012c4d..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/store.go +++ /dev/null @@ -1,63 +0,0 @@ -package store - -import ( - "fmt" - 
"github.com/siddontang/ledisdb/config" - "github.com/siddontang/ledisdb/store/driver" - "os" - "path" - - _ "github.com/siddontang/ledisdb/store/boltdb" - _ "github.com/siddontang/ledisdb/store/goleveldb" - _ "github.com/siddontang/ledisdb/store/leveldb" - _ "github.com/siddontang/ledisdb/store/mdb" - _ "github.com/siddontang/ledisdb/store/rocksdb" -) - -func getStorePath(cfg *config.Config) string { - if len(cfg.DBPath) > 0 { - return cfg.DBPath - } else { - return path.Join(cfg.DataDir, fmt.Sprintf("%s_data", cfg.DBName)) - } -} - -func Open(cfg *config.Config) (*DB, error) { - s, err := driver.GetStore(cfg) - if err != nil { - return nil, err - } - - path := getStorePath(cfg) - - if err := os.MkdirAll(path, 0755); err != nil { - return nil, err - } - - idb, err := s.Open(path, cfg) - if err != nil { - return nil, err - } - - db := new(DB) - db.db = idb - db.name = s.String() - db.st = &Stat{} - db.cfg = cfg - - return db, nil -} - -func Repair(cfg *config.Config) error { - s, err := driver.GetStore(cfg) - if err != nil { - return err - } - - path := getStorePath(cfg) - - return s.Repair(path, cfg) -} - -func init() { -} diff --git a/vendor/github.com/siddontang/ledisdb/store/tx.go b/vendor/github.com/siddontang/ledisdb/store/tx.go deleted file mode 100644 index 4dbf311fd843..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/tx.go +++ /dev/null @@ -1,86 +0,0 @@ -package store - -import ( - "github.com/siddontang/ledisdb/store/driver" -) - -type Tx struct { - tx driver.Tx - st *Stat -} - -func (tx *Tx) NewIterator() *Iterator { - it := new(Iterator) - it.it = tx.tx.NewIterator() - it.st = tx.st - - tx.st.IterNum.Add(1) - - return it -} - -func (tx *Tx) NewWriteBatch() *WriteBatch { - tx.st.BatchNum.Add(1) - - wb := new(WriteBatch) - wb.wb = tx.tx.NewWriteBatch() - wb.st = tx.st - return wb -} - -func (tx *Tx) RangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { - return NewRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) -} - -func (tx *Tx) RevRangeIterator(min []byte, max []byte, rangeType uint8) *RangeLimitIterator { - return NewRevRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{0, -1}) -} - -//count < 0, unlimit. -// -//offset must >= 0, if < 0, will get nothing. -func (tx *Tx) RangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { - return NewRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) -} - -//count < 0, unlimit. -// -//offset must >= 0, if < 0, will get nothing. 
-func (tx *Tx) RevRangeLimitIterator(min []byte, max []byte, rangeType uint8, offset int, count int) *RangeLimitIterator { - return NewRevRangeLimitIterator(tx.NewIterator(), &Range{min, max, rangeType}, &Limit{offset, count}) -} - -func (tx *Tx) Get(key []byte) ([]byte, error) { - v, err := tx.tx.Get(key) - tx.st.statGet(v, err) - return v, err -} - -func (tx *Tx) GetSlice(key []byte) (Slice, error) { - if v, err := tx.Get(key); err != nil { - return nil, err - } else if v == nil { - return nil, nil - } else { - return driver.GoSlice(v), nil - } -} - -func (tx *Tx) Put(key []byte, value []byte) error { - tx.st.PutNum.Add(1) - return tx.tx.Put(key, value) -} - -func (tx *Tx) Delete(key []byte) error { - tx.st.DeleteNum.Add(1) - return tx.tx.Delete(key) -} - -func (tx *Tx) Commit() error { - tx.st.TxCommitNum.Add(1) - return tx.tx.Commit() -} - -func (tx *Tx) Rollback() error { - return tx.tx.Rollback() -} diff --git a/vendor/github.com/siddontang/ledisdb/store/writebatch.go b/vendor/github.com/siddontang/ledisdb/store/writebatch.go deleted file mode 100644 index c193ae08dbaa..000000000000 --- a/vendor/github.com/siddontang/ledisdb/store/writebatch.go +++ /dev/null @@ -1,150 +0,0 @@ -package store - -import ( - "encoding/binary" - "github.com/siddontang/ledisdb/store/driver" - "github.com/syndtr/goleveldb/leveldb" - "time" -) - -type WriteBatch struct { - wb driver.IWriteBatch - st *Stat - - putNum int64 - deleteNum int64 - db *DB - - data *BatchData -} - -func (wb *WriteBatch) Close() { - wb.wb.Close() -} - -func (wb *WriteBatch) Put(key []byte, value []byte) { - wb.putNum++ - wb.wb.Put(key, value) -} - -func (wb *WriteBatch) Delete(key []byte) { - wb.deleteNum++ - wb.wb.Delete(key) -} - -func (wb *WriteBatch) Commit() error { - wb.st.BatchCommitNum.Add(1) - wb.st.PutNum.Add(wb.putNum) - wb.st.DeleteNum.Add(wb.deleteNum) - wb.putNum = 0 - wb.deleteNum = 0 - - var err error - t := time.Now() - if wb.db == nil || !wb.db.needSyncCommit() { - err = wb.wb.Commit() - } else { - err = wb.wb.SyncCommit() - } - - wb.st.BatchCommitTotalTime.Add(time.Now().Sub(t)) - - return err -} - -func (wb *WriteBatch) Rollback() error { - wb.putNum = 0 - wb.deleteNum = 0 - - return wb.wb.Rollback() -} - -// the data will be undefined after commit or rollback -func (wb *WriteBatch) BatchData() *BatchData { - data := wb.wb.Data() - if wb.data == nil { - wb.data = new(BatchData) - } - - wb.data.Load(data) - return wb.data -} - -func (wb *WriteBatch) Data() []byte { - b := wb.BatchData() - return b.Data() -} - -const BatchDataHeadLen = 12 - -/* - see leveldb batch data format for more information -*/ - -type BatchData struct { - leveldb.Batch -} - -func NewBatchData(data []byte) (*BatchData, error) { - b := new(BatchData) - - if err := b.Load(data); err != nil { - return nil, err - } - - return b, nil -} - -func (d *BatchData) Append(do *BatchData) error { - d1 := d.Dump() - d2 := do.Dump() - - n := d.Len() + do.Len() - - d1 = append(d1, d2[BatchDataHeadLen:]...) 
- binary.LittleEndian.PutUint32(d1[8:], uint32(n)) - - return d.Load(d1) -} - -func (d *BatchData) Data() []byte { - return d.Dump() -} - -func (d *BatchData) Reset() { - d.Batch.Reset() -} - -type BatchDataReplay interface { - Put(key, value []byte) - Delete(key []byte) -} - -type BatchItem struct { - Key []byte - Value []byte -} - -type batchItems []BatchItem - -func (bs *batchItems) Put(key, value []byte) { - *bs = append(*bs, BatchItem{key, value}) -} - -func (bs *batchItems) Delete(key []byte) { - *bs = append(*bs, BatchItem{key, nil}) -} - -func (d *BatchData) Replay(r BatchDataReplay) error { - return d.Batch.Replay(r) -} - -func (d *BatchData) Items() ([]BatchItem, error) { - is := make(batchItems, 0, d.Len()) - - if err := d.Replay(&is); err != nil { - return nil, err - } - - return []BatchItem(is), nil -} diff --git a/vendor/github.com/szferi/gomdb/LICENSE b/vendor/github.com/szferi/gomdb/LICENSE deleted file mode 100644 index 96fd7a269744..000000000000 --- a/vendor/github.com/szferi/gomdb/LICENSE +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2013, Ferenc Szalai -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
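For context on the store wrapper removed above: the deleted store/writebatch.go defers its bookkeeping until Commit, counting puts and deletes locally, folding them into the shared Stat counters, and accumulating commit latency; Rollback simply zeroes the local counters. Below is a minimal standalone Go sketch of that counter-wrapper pattern. The KV interface, memBatch, StatBatch and Stat names are illustrative stand-ins, not code from this patch or from ledisdb.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// KV is an illustrative stand-in for the driver-level batch interface
// (driver.IWriteBatch in the deleted code).
type KV interface {
	Put(key, value []byte)
	Delete(key []byte)
	Commit() error
}

// memBatch is a toy in-memory KV used only to make the sketch runnable.
type memBatch struct{ pending int }

func (b *memBatch) Put(key, value []byte) { b.pending++ }
func (b *memBatch) Delete(key []byte)     { b.pending++ }
func (b *memBatch) Commit() error         { b.pending = 0; return nil }

// Stat mirrors the shape of the deleted store.Stat: counters shared by all
// wrappers over one store, updated atomically.
type Stat struct {
	PutNum, DeleteNum, BatchCommitNum int64
	BatchCommitTotalNs                int64
}

// StatBatch wraps a KV batch and defers its bookkeeping until Commit,
// the same pattern as the removed store.WriteBatch.
type StatBatch struct {
	inner          KV
	st             *Stat
	putNum, delNum int64
}

func (wb *StatBatch) Put(k, v []byte) { wb.putNum++; wb.inner.Put(k, v) }
func (wb *StatBatch) Delete(k []byte) { wb.delNum++; wb.inner.Delete(k) }

func (wb *StatBatch) Commit() error {
	atomic.AddInt64(&wb.st.BatchCommitNum, 1)
	atomic.AddInt64(&wb.st.PutNum, wb.putNum)
	atomic.AddInt64(&wb.st.DeleteNum, wb.delNum)
	wb.putNum, wb.delNum = 0, 0

	t := time.Now()
	err := wb.inner.Commit()
	atomic.AddInt64(&wb.st.BatchCommitTotalNs, int64(time.Since(t)))
	return err
}

// Rollback drops the local counters without touching the shared Stat,
// matching the behaviour of the removed WriteBatch.Rollback.
func (wb *StatBatch) Rollback() {
	wb.putNum, wb.delNum = 0, 0
}

func main() {
	st := &Stat{}
	wb := &StatBatch{inner: &memBatch{}, st: st}
	wb.Put([]byte("k"), []byte("v"))
	wb.Delete([]byte("old"))
	if err := wb.Commit(); err != nil {
		panic(err)
	}
	fmt.Println(st.PutNum, st.DeleteNum, st.BatchCommitNum) // 1 1 1
}

Deferring the counter updates to Commit keeps the per-operation path cheap and lets a rollback discard the pending counts without touching the shared statistics, which is the trade-off the removed WriteBatch makes.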
diff --git a/vendor/github.com/szferi/gomdb/cursor.go b/vendor/github.com/szferi/gomdb/cursor.go deleted file mode 100644 index 75e0d1487147..000000000000 --- a/vendor/github.com/szferi/gomdb/cursor.go +++ /dev/null @@ -1,103 +0,0 @@ -package mdb - -/* -#cgo CFLAGS: -pthread -W -Wall -Wno-unused-parameter -Wbad-function-cast -O2 -g -#cgo freebsd CFLAGS: -DMDB_DSYNC=O_SYNC -#cgo openbsd CFLAGS: -DMDB_DSYNC=O_SYNC -#cgo netbsd CFLAGS: -DMDB_DSYNC=O_SYNC -#include -#include -#include "lmdb.h" -*/ -import "C" - -import ( - "errors" -) - -// MDB_cursor_op -const ( - FIRST = iota - FIRST_DUP - GET_BOTH - GET_RANGE - GET_CURRENT - GET_MULTIPLE - LAST - LAST_DUP - NEXT - NEXT_DUP - NEXT_MULTIPLE - NEXT_NODUP - PREV - PREV_DUP - PREV_NODUP - SET - SET_KEY - SET_RANGE -) - -func (cursor *Cursor) Close() error { - if cursor._cursor == nil { - return errors.New("Cursor already closed") - } - C.mdb_cursor_close(cursor._cursor) - cursor._cursor = nil - return nil -} - -func (cursor *Cursor) Txn() *Txn { - var _txn *C.MDB_txn - _txn = C.mdb_cursor_txn(cursor._cursor) - if _txn != nil { - return &Txn{_txn} - } - return nil -} - -func (cursor *Cursor) DBI() DBI { - var _dbi C.MDB_dbi - _dbi = C.mdb_cursor_dbi(cursor._cursor) - return DBI(_dbi) -} - -// Retrieves the low-level MDB cursor. -func (cursor *Cursor) MdbCursor() *C.MDB_cursor { - return cursor._cursor -} - -func (cursor *Cursor) Get(set_key, sval []byte, op uint) (key, val []byte, err error) { - k, v, err := cursor.GetVal(set_key, sval, op) - if err != nil { - return nil, nil, err - } - return k.Bytes(), v.Bytes(), nil -} - -func (cursor *Cursor) GetVal(key, val []byte, op uint) (Val, Val, error) { - ckey := Wrap(key) - cval := Wrap(val) - ret := C.mdb_cursor_get(cursor._cursor, (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval), C.MDB_cursor_op(op)) - return ckey, cval, errno(ret) -} - -func (cursor *Cursor) Put(key, val []byte, flags uint) error { - ckey := Wrap(key) - cval := Wrap(val) - ret := C.mdb_cursor_put(cursor._cursor, (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval), C.uint(flags)) - return errno(ret) -} - -func (cursor *Cursor) Del(flags uint) error { - ret := C.mdb_cursor_del(cursor._cursor, C.uint(flags)) - return errno(ret) -} - -func (cursor *Cursor) Count() (uint64, error) { - var _size C.size_t - ret := C.mdb_cursor_count(cursor._cursor, &_size) - if ret != SUCCESS { - return 0, errno(ret) - } - return uint64(_size), nil -} diff --git a/vendor/github.com/szferi/gomdb/env.go b/vendor/github.com/szferi/gomdb/env.go deleted file mode 100644 index 08b4d0a32900..000000000000 --- a/vendor/github.com/szferi/gomdb/env.go +++ /dev/null @@ -1,227 +0,0 @@ -package mdb - -/* -#cgo CFLAGS: -pthread -W -Wall -Wno-unused-parameter -Wbad-function-cast -O2 -g -#cgo freebsd CFLAGS: -DMDB_DSYNC=O_SYNC -#cgo openbsd CFLAGS: -DMDB_DSYNC=O_SYNC -#cgo netbsd CFLAGS: -DMDB_DSYNC=O_SYNC -#include -#include -#include "lmdb.h" -*/ -import "C" - -import ( - "errors" - "fmt" - "syscall" - "unsafe" -) - -const SUCCESS = C.MDB_SUCCESS - -// mdb_env Environment Flags -const ( - FIXEDMAP = C.MDB_FIXEDMAP // mmap at a fixed address (experimental) - NOSUBDIR = C.MDB_NOSUBDIR // no environment directory - NOSYNC = C.MDB_NOSYNC // don't fsync after commit - RDONLY = C.MDB_RDONLY // read only - NOMETASYNC = C.MDB_NOMETASYNC // don't fsync metapage after commit - WRITEMAP = C.MDB_WRITEMAP // use writable mmap - MAPASYNC = C.MDB_MAPASYNC // use asynchronous msync when MDB_WRITEMAP is use - NOTLS = C.MDB_NOTLS // tie reader locktable slots to Txn objects instead of threads -) - -type 
DBI uint - -type Errno C.int - -// minimum and maximum values produced for the Errno type. syscall.Errnos of -// other values may still be produced. -const minErrno, maxErrno C.int = C.MDB_KEYEXIST, C.MDB_LAST_ERRCODE - -func (e Errno) Error() string { - s := C.GoString(C.mdb_strerror(C.int(e))) - if s == "" { - return fmt.Sprint("mdb errno:", int(e)) - } - return s -} - -// for tests that can't import C -func _errno(ret int) error { - return errno(C.int(ret)) -} - -func errno(ret C.int) error { - if ret == C.MDB_SUCCESS { - return nil - } - if minErrno <= ret && ret <= maxErrno { - return Errno(ret) - } - return syscall.Errno(ret) -} - -// error codes -const ( - KeyExist = Errno(C.MDB_KEYEXIST) - NotFound = Errno(C.MDB_NOTFOUND) - PageNotFound = Errno(C.MDB_PAGE_NOTFOUND) - Corrupted = Errno(C.MDB_CORRUPTED) - Panic = Errno(C.MDB_PANIC) - VersionMismatch = Errno(C.MDB_VERSION_MISMATCH) - Invalid = Errno(C.MDB_INVALID) - MapFull = Errno(C.MDB_MAP_FULL) - DbsFull = Errno(C.MDB_DBS_FULL) - ReadersFull = Errno(C.MDB_READERS_FULL) - TlsFull = Errno(C.MDB_TLS_FULL) - TxnFull = Errno(C.MDB_TXN_FULL) - CursorFull = Errno(C.MDB_CURSOR_FULL) - PageFull = Errno(C.MDB_PAGE_FULL) - MapResized = Errno(C.MDB_MAP_RESIZED) - Incompatibile = Errno(C.MDB_INCOMPATIBLE) -) - -func Version() string { - var major, minor, patch *C.int - ver_str := C.mdb_version(major, minor, patch) - return C.GoString(ver_str) -} - -// Env is opaque structure for a database environment. -// A DB environment supports multiple databases, all residing in the -// same shared-memory map. -type Env struct { - _env *C.MDB_env -} - -// Create an MDB environment handle. -func NewEnv() (*Env, error) { - var _env *C.MDB_env - ret := C.mdb_env_create(&_env) - if ret != SUCCESS { - return nil, errno(ret) - } - return &Env{_env}, nil -} - -// Open an environment handle. If this function fails Close() must be called to discard the Env handle. -func (env *Env) Open(path string, flags uint, mode uint) error { - cpath := C.CString(path) - defer C.free(unsafe.Pointer(cpath)) - ret := C.mdb_env_open(env._env, cpath, C.uint(NOTLS|flags), C.mdb_mode_t(mode)) - return errno(ret) -} - -func (env *Env) Close() error { - if env._env == nil { - return errors.New("Environment already closed") - } - C.mdb_env_close(env._env) - env._env = nil - return nil -} - -func (env *Env) Copy(path string) error { - cpath := C.CString(path) - defer C.free(unsafe.Pointer(cpath)) - ret := C.mdb_env_copy(env._env, cpath) - return errno(ret) -} - -// Statistics for a database in the environment -type Stat struct { - PSize uint // Size of a database page. This is currently the same for all databases. 
- Depth uint // Depth (height) of the B-tree - BranchPages uint64 // Number of internal (non-leaf) pages - LeafPages uint64 // Number of leaf pages - OverflowPages uint64 // Number of overflow pages - Entries uint64 // Number of data items -} - -func (env *Env) Stat() (*Stat, error) { - var _stat C.MDB_stat - ret := C.mdb_env_stat(env._env, &_stat) - if ret != SUCCESS { - return nil, errno(ret) - } - stat := Stat{PSize: uint(_stat.ms_psize), - Depth: uint(_stat.ms_depth), - BranchPages: uint64(_stat.ms_branch_pages), - LeafPages: uint64(_stat.ms_leaf_pages), - OverflowPages: uint64(_stat.ms_overflow_pages), - Entries: uint64(_stat.ms_entries)} - return &stat, nil -} - -type Info struct { - MapSize uint64 // Size of the data memory map - LastPNO uint64 // ID of the last used page - LastTxnID uint64 // ID of the last committed transaction - MaxReaders uint // maximum number of threads for the environment - NumReaders uint // maximum number of threads used in the environment -} - -func (env *Env) Info() (*Info, error) { - var _info C.MDB_envinfo - ret := C.mdb_env_info(env._env, &_info) - if ret != SUCCESS { - return nil, errno(ret) - } - info := Info{MapSize: uint64(_info.me_mapsize), - LastPNO: uint64(_info.me_last_pgno), - LastTxnID: uint64(_info.me_last_txnid), - MaxReaders: uint(_info.me_maxreaders), - NumReaders: uint(_info.me_numreaders)} - return &info, nil -} - -func (env *Env) Sync(force int) error { - ret := C.mdb_env_sync(env._env, C.int(force)) - return errno(ret) -} - -func (env *Env) SetFlags(flags uint, onoff int) error { - ret := C.mdb_env_set_flags(env._env, C.uint(flags), C.int(onoff)) - return errno(ret) -} - -func (env *Env) Flags() (uint, error) { - var _flags C.uint - ret := C.mdb_env_get_flags(env._env, &_flags) - if ret != SUCCESS { - return 0, errno(ret) - } - return uint(_flags), nil -} - -func (env *Env) Path() (string, error) { - var path string - cpath := C.CString(path) - defer C.free(unsafe.Pointer(cpath)) - ret := C.mdb_env_get_path(env._env, &cpath) - if ret != SUCCESS { - return "", errno(ret) - } - return C.GoString(cpath), nil -} - -func (env *Env) SetMapSize(size uint64) error { - ret := C.mdb_env_set_mapsize(env._env, C.size_t(size)) - return errno(ret) -} - -func (env *Env) SetMaxReaders(size uint) error { - ret := C.mdb_env_set_maxreaders(env._env, C.uint(size)) - return errno(ret) -} - -func (env *Env) SetMaxDBs(size DBI) error { - ret := C.mdb_env_set_maxdbs(env._env, C.MDB_dbi(size)) - return errno(ret) -} - -func (env *Env) DBIClose(dbi DBI) { - C.mdb_dbi_close(env._env, C.MDB_dbi(dbi)) -} diff --git a/vendor/github.com/szferi/gomdb/lmdb.h b/vendor/github.com/szferi/gomdb/lmdb.h deleted file mode 100644 index 11b46bfa4866..000000000000 --- a/vendor/github.com/szferi/gomdb/lmdb.h +++ /dev/null @@ -1,1553 +0,0 @@ -/** @file lmdb.h - * @brief Lightning memory-mapped database library - * - * @mainpage Lightning Memory-Mapped Database Manager (LMDB) - * - * @section intro_sec Introduction - * LMDB is a Btree-based database management library modeled loosely on the - * BerkeleyDB API, but much simplified. The entire database is exposed - * in a memory map, and all data fetches return data directly - * from the mapped memory, so no malloc's or memcpy's occur during - * data fetches. As such, the library is extremely simple because it - * requires no page caching layer of its own, and it is extremely high - * performance and memory-efficient. 
It is also fully transactional with - * full ACID semantics, and when the memory map is read-only, the - * database integrity cannot be corrupted by stray pointer writes from - * application code. - * - * The library is fully thread-aware and supports concurrent read/write - * access from multiple processes and threads. Data pages use a copy-on- - * write strategy so no active data pages are ever overwritten, which - * also provides resistance to corruption and eliminates the need of any - * special recovery procedures after a system crash. Writes are fully - * serialized; only one write transaction may be active at a time, which - * guarantees that writers can never deadlock. The database structure is - * multi-versioned so readers run with no locks; writers cannot block - * readers, and readers don't block writers. - * - * Unlike other well-known database mechanisms which use either write-ahead - * transaction logs or append-only data writes, LMDB requires no maintenance - * during operation. Both write-ahead loggers and append-only databases - * require periodic checkpointing and/or compaction of their log or database - * files otherwise they grow without bound. LMDB tracks free pages within - * the database and re-uses them for new write operations, so the database - * size does not grow without bound in normal use. - * - * The memory map can be used as a read-only or read-write map. It is - * read-only by default as this provides total immunity to corruption. - * Using read-write mode offers much higher write performance, but adds - * the possibility for stray application writes thru pointers to silently - * corrupt the database. Of course if your application code is known to - * be bug-free (...) then this is not an issue. - * - * @section caveats_sec Caveats - * Troubleshooting the lock file, plus semaphores on BSD systems: - * - * - A broken lockfile can cause sync issues. - * Stale reader transactions left behind by an aborted program - * cause further writes to grow the database quickly, and - * stale locks can block further operation. - * - * Fix: Check for stale readers periodically, using the - * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. Or just - * make all programs using the database close it; the lockfile - * is always reset on first open of the environment. - * - * - On BSD systems or others configured with MDB_USE_POSIX_SEM, - * startup can fail due to semaphores owned by another userid. - * - * Fix: Open and close the database as the user which owns the - * semaphores (likely last user) or as root, while no other - * process is using the database. - * - * Restrictions/caveats (in addition to those listed for some functions): - * - * - Only the database owner should normally use the database on - * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM. - * Multiple users can cause startup to fail later, as noted above. - * - * - There is normally no pure read-only mode, since readers need write - * access to locks and lock file. Exceptions: On read-only filesystems - * or with the #MDB_NOLOCK flag described under #mdb_env_open(). - * - * - By default, in versions before 0.9.10, unused portions of the data - * file might receive garbage data from memory freed by other code. - * (This does not happen when using the #MDB_WRITEMAP flag.) As of - * 0.9.10 the default behavior is to initialize such memory before - * writing to the data file. 
Since there may be a slight performance - * cost due to this initialization, applications may disable it using - * the #MDB_NOMEMINIT flag. Applications handling sensitive data - * which must not be written should not use this flag. This flag is - * irrelevant when using #MDB_WRITEMAP. - * - * - A thread can only use one transaction at a time, plus any child - * transactions. Each transaction belongs to one thread. See below. - * The #MDB_NOTLS flag changes this for read-only transactions. - * - * - Use an MDB_env* in the process which opened it, without fork()ing. - * - * - Do not have open an LMDB database twice in the same process at - * the same time. Not even from a plain open() call - close()ing it - * breaks flock() advisory locking. - * - * - Avoid long-lived transactions. Read transactions prevent - * reuse of pages freed by newer write transactions, thus the - * database can grow quickly. Write transactions prevent - * other write transactions, since writes are serialized. - * - * - Avoid suspending a process with active transactions. These - * would then be "long-lived" as above. Also read transactions - * suspended when writers commit could sometimes see wrong data. - * - * ...when several processes can use a database concurrently: - * - * - Avoid aborting a process with an active transaction. - * The transaction becomes "long-lived" as above until a check - * for stale readers is performed or the lockfile is reset, - * since the process may not remove it from the lockfile. - * - * - If you do that anyway, do a periodic check for stale readers. Or - * close the environment once in a while, so the lockfile can get reset. - * - * - Do not use LMDB databases on remote filesystems, even between - * processes on the same host. This breaks flock() on some OSes, - * possibly memory map sync, and certainly sync between programs - * on different hosts. - * - * - Opening a database can fail if another process is opening or - * closing it at exactly the same time. - * - * @author Howard Chu, Symas Corporation. - * - * @copyright Copyright 2011-2014 Howard Chu, Symas Corp. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - * - * @par Derived From: - * This code is derived from btree.c written by Martin Hedenfalk. - * - * Copyright (c) 2009, 2010 Martin Hedenfalk - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ -#ifndef _LMDB_H_ -#define _LMDB_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/** Unix permissions for creating files, or dummy definition for Windows */ -#ifdef _MSC_VER -typedef int mdb_mode_t; -#else -typedef mode_t mdb_mode_t; -#endif - -/** An abstraction for a file handle. - * On POSIX systems file handles are small integers. On Windows - * they're opaque pointers. - */ -#ifdef _WIN32 -typedef void *mdb_filehandle_t; -#else -typedef int mdb_filehandle_t; -#endif - -/** @defgroup mdb LMDB API - * @{ - * @brief OpenLDAP Lightning Memory-Mapped Database Manager - */ -/** @defgroup Version Version Macros - * @{ - */ -/** Library major version */ -#define MDB_VERSION_MAJOR 0 -/** Library minor version */ -#define MDB_VERSION_MINOR 9 -/** Library patch version */ -#define MDB_VERSION_PATCH 14 - -/** Combine args a,b,c into a single integer for easy version comparisons */ -#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) - -/** The full library version as a single integer */ -#define MDB_VERSION_FULL \ - MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) - -/** The release date of this library version */ -#define MDB_VERSION_DATE "July 24, 2014" - -/** A stringifier for the version info */ -#define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" - -/** A helper for the stringifier macro */ -#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) - -/** The full library version as a C string */ -#define MDB_VERSION_STRING \ - MDB_VERFOO(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH,MDB_VERSION_DATE) -/** @} */ - -/** @brief Opaque structure for a database environment. - * - * A DB environment supports multiple databases, all residing in the same - * shared-memory map. - */ -typedef struct MDB_env MDB_env; - -/** @brief Opaque structure for a transaction handle. - * - * All database operations require a transaction handle. Transactions may be - * read-only or read-write. - */ -typedef struct MDB_txn MDB_txn; - -/** @brief A handle for an individual database in the DB environment. */ -typedef unsigned int MDB_dbi; - -/** @brief Opaque structure for navigating through a database */ -typedef struct MDB_cursor MDB_cursor; - -/** @brief Generic structure used for passing keys and data in and out - * of the database. - * - * Values returned from the database are valid only until a subsequent - * update operation, or the end of the transaction. Do not modify or - * free them, they commonly point into the database itself. - * - * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive. - * The same applies to data sizes in databases with the #MDB_DUPSORT flag. - * Other data items can in theory be from 0 to 0xffffffff bytes long. - */ -typedef struct MDB_val { - size_t mv_size; /**< size of the data item */ - void *mv_data; /**< address of the data item */ -} MDB_val; - -/** @brief A callback function used to compare two keys in a database */ -typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); - -/** @brief A callback function used to relocate a position-dependent data item - * in a fixed-address database. - * - * The \b newptr gives the item's desired address in - * the memory map, and \b oldptr gives its previous address. The item's actual - * data resides at the address in \b item. This callback is expected to walk - * through the fields of the record in \b item and modify any - * values based at the \b oldptr address to be relative to the \b newptr address. - * @param[in,out] item The item that is to be relocated. 
- * @param[in] oldptr The previous address. - * @param[in] newptr The new address to relocate to. - * @param[in] relctx An application-provided context, set by #mdb_set_relctx(). - * @todo This feature is currently unimplemented. - */ -typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); - -/** @defgroup mdb_env Environment Flags - * @{ - */ - /** mmap at a fixed address (experimental) */ -#define MDB_FIXEDMAP 0x01 - /** no environment directory */ -#define MDB_NOSUBDIR 0x4000 - /** don't fsync after commit */ -#define MDB_NOSYNC 0x10000 - /** read only */ -#define MDB_RDONLY 0x20000 - /** don't fsync metapage after commit */ -#define MDB_NOMETASYNC 0x40000 - /** use writable mmap */ -#define MDB_WRITEMAP 0x80000 - /** use asynchronous msync when #MDB_WRITEMAP is used */ -#define MDB_MAPASYNC 0x100000 - /** tie reader locktable slots to #MDB_txn objects instead of to threads */ -#define MDB_NOTLS 0x200000 - /** don't do any locking, caller must manage their own locks */ -#define MDB_NOLOCK 0x400000 - /** don't do readahead (no effect on Windows) */ -#define MDB_NORDAHEAD 0x800000 - /** don't initialize malloc'd memory before writing to datafile */ -#define MDB_NOMEMINIT 0x1000000 -/** @} */ - -/** @defgroup mdb_dbi_open Database Flags - * @{ - */ - /** use reverse string keys */ -#define MDB_REVERSEKEY 0x02 - /** use sorted duplicates */ -#define MDB_DUPSORT 0x04 - /** numeric keys in native byte order. - * The keys must all be of the same size. */ -#define MDB_INTEGERKEY 0x08 - /** with #MDB_DUPSORT, sorted dup items have fixed size */ -#define MDB_DUPFIXED 0x10 - /** with #MDB_DUPSORT, dups are numeric in native byte order */ -#define MDB_INTEGERDUP 0x20 - /** with #MDB_DUPSORT, use reverse string dups */ -#define MDB_REVERSEDUP 0x40 - /** create DB if not already existing */ -#define MDB_CREATE 0x40000 -/** @} */ - -/** @defgroup mdb_put Write Flags - * @{ - */ -/** For put: Don't write if the key already exists. */ -#define MDB_NOOVERWRITE 0x10 -/** Only for #MDB_DUPSORT
- * For put: don't write if the key and data pair already exist.
- * For mdb_cursor_del: remove all duplicate data items. - */ -#define MDB_NODUPDATA 0x20 -/** For mdb_cursor_put: overwrite the current key/data pair */ -#define MDB_CURRENT 0x40 -/** For put: Just reserve space for data, don't copy it. Return a - * pointer to the reserved space. - */ -#define MDB_RESERVE 0x10000 -/** Data is being appended, don't split full pages. */ -#define MDB_APPEND 0x20000 -/** Duplicate data is being appended, don't split full pages. */ -#define MDB_APPENDDUP 0x40000 -/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ -#define MDB_MULTIPLE 0x80000 -/* @} */ - -/** @defgroup mdb_copy Copy Flags - * @{ - */ -/** Compacting copy: Omit free space from copy, and renumber all - * pages sequentially. - */ -#define MDB_CP_COMPACT 0x01 -/* @} */ - -/** @brief Cursor Get operations. - * - * This is the set of all operations for retrieving data - * using a cursor. - */ -typedef enum MDB_cursor_op { - MDB_FIRST, /**< Position at first key/data item */ - MDB_FIRST_DUP, /**< Position at first data item of current key. - Only for #MDB_DUPSORT */ - MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ - MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ - MDB_GET_CURRENT, /**< Return key/data at current cursor position */ - MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items - from current cursor position. Move cursor to prepare - for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ - MDB_LAST, /**< Position at last key/data item */ - MDB_LAST_DUP, /**< Position at last data item of current key. - Only for #MDB_DUPSORT */ - MDB_NEXT, /**< Position at next data item */ - MDB_NEXT_DUP, /**< Position at next data item of current key. - Only for #MDB_DUPSORT */ - MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items - from next cursor position. Move cursor to prepare - for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ - MDB_NEXT_NODUP, /**< Position at first data item of next key */ - MDB_PREV, /**< Position at previous data item */ - MDB_PREV_DUP, /**< Position at previous data item of current key. - Only for #MDB_DUPSORT */ - MDB_PREV_NODUP, /**< Position at last data item of previous key */ - MDB_SET, /**< Position at specified key */ - MDB_SET_KEY, /**< Position at specified key, return key + data */ - MDB_SET_RANGE /**< Position at first key greater than or equal to specified key. 
*/ -} MDB_cursor_op; - -/** @defgroup errors Return Codes - * - * BerkeleyDB uses -30800 to -30999, we'll go under them - * @{ - */ - /** Successful result */ -#define MDB_SUCCESS 0 - /** key/data pair already exists */ -#define MDB_KEYEXIST (-30799) - /** key/data pair not found (EOF) */ -#define MDB_NOTFOUND (-30798) - /** Requested page not found - this usually indicates corruption */ -#define MDB_PAGE_NOTFOUND (-30797) - /** Located page was wrong type */ -#define MDB_CORRUPTED (-30796) - /** Update of meta page failed, probably I/O error */ -#define MDB_PANIC (-30795) - /** Environment version mismatch */ -#define MDB_VERSION_MISMATCH (-30794) - /** File is not a valid LMDB file */ -#define MDB_INVALID (-30793) - /** Environment mapsize reached */ -#define MDB_MAP_FULL (-30792) - /** Environment maxdbs reached */ -#define MDB_DBS_FULL (-30791) - /** Environment maxreaders reached */ -#define MDB_READERS_FULL (-30790) - /** Too many TLS keys in use - Windows only */ -#define MDB_TLS_FULL (-30789) - /** Txn has too many dirty pages */ -#define MDB_TXN_FULL (-30788) - /** Cursor stack too deep - internal error */ -#define MDB_CURSOR_FULL (-30787) - /** Page has not enough space - internal error */ -#define MDB_PAGE_FULL (-30786) - /** Database contents grew beyond environment mapsize */ -#define MDB_MAP_RESIZED (-30785) - /** MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed */ -#define MDB_INCOMPATIBLE (-30784) - /** Invalid reuse of reader locktable slot */ -#define MDB_BAD_RSLOT (-30783) - /** Transaction cannot recover - it must be aborted */ -#define MDB_BAD_TXN (-30782) - /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ -#define MDB_BAD_VALSIZE (-30781) - /** The specified DBI was changed unexpectedly */ -#define MDB_BAD_DBI (-30780) - /** The last defined error code */ -#define MDB_LAST_ERRCODE MDB_BAD_DBI -/** @} */ - -/** @brief Statistics for a database in the environment */ -typedef struct MDB_stat { - unsigned int ms_psize; /**< Size of a database page. - This is currently the same for all databases. */ - unsigned int ms_depth; /**< Depth (height) of the B-tree */ - size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ - size_t ms_leaf_pages; /**< Number of leaf pages */ - size_t ms_overflow_pages; /**< Number of overflow pages */ - size_t ms_entries; /**< Number of data items */ -} MDB_stat; - -/** @brief Information about the environment */ -typedef struct MDB_envinfo { - void *me_mapaddr; /**< Address of map, if fixed */ - size_t me_mapsize; /**< Size of the data memory map */ - size_t me_last_pgno; /**< ID of the last used page */ - size_t me_last_txnid; /**< ID of the last committed transaction */ - unsigned int me_maxreaders; /**< max reader slots in the environment */ - unsigned int me_numreaders; /**< max reader slots used in the environment */ -} MDB_envinfo; - - /** @brief Return the LMDB library version information. - * - * @param[out] major if non-NULL, the library major version number is copied here - * @param[out] minor if non-NULL, the library minor version number is copied here - * @param[out] patch if non-NULL, the library patch version number is copied here - * @retval "version string" The library version as a string - */ -char *mdb_version(int *major, int *minor, int *patch); - - /** @brief Return a string describing a given error code. - * - * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) - * function. 
If the error code is greater than or equal to 0, then the string - * returned by the system function strerror(3) is returned. If the error code - * is less than 0, an error string corresponding to the LMDB library error is - * returned. See @ref errors for a list of LMDB-specific error codes. - * @param[in] err The error code - * @retval "error message" The description of the error - */ -char *mdb_strerror(int err); - - /** @brief Create an LMDB environment handle. - * - * This function allocates memory for a #MDB_env structure. To release - * the allocated memory and discard the handle, call #mdb_env_close(). - * Before the handle may be used, it must be opened using #mdb_env_open(). - * Various other options may also need to be set before opening the handle, - * e.g. #mdb_env_set_mapsize(), #mdb_env_set_maxreaders(), #mdb_env_set_maxdbs(), - * depending on usage requirements. - * @param[out] env The address where the new handle will be stored - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_create(MDB_env **env); - - /** @brief Open an environment handle. - * - * If this function fails, #mdb_env_close() must be called to discard the #MDB_env handle. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] path The directory in which the database files reside. This - * directory must already exist and be writable. - * @param[in] flags Special options for this environment. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - * Flags set by mdb_env_set_flags() are also used. - *
    - *
  • #MDB_FIXEDMAP - * use a fixed address for the mmap region. This flag must be specified - * when creating the environment, and is stored persistently in the environment. - * If successful, the memory map will always reside at the same virtual address - * and pointers used to reference data items in the database will be constant - * across multiple invocations. This option may not always work, depending on - * how the operating system has allocated memory to shared libraries and other uses. - * The feature is highly experimental. - *
  • #MDB_NOSUBDIR - * By default, LMDB creates its environment in a directory whose - * pathname is given in \b path, and creates its data and lock files - * under that directory. With this option, \b path is used as-is for - * the database main data file. The database lock file is the \b path - * with "-lock" appended. - *
  • #MDB_RDONLY - * Open the environment in read-only mode. No write operations will be - * allowed. LMDB will still modify the lock file - except on read-only - * filesystems, where LMDB does not use locks. - *
  • #MDB_WRITEMAP - * Use a writeable memory map unless MDB_RDONLY is set. This is faster - * and uses fewer mallocs, but loses protection from application bugs - * like wild pointer writes and other bad updates into the database. - * Incompatible with nested transactions. - * Processes with and without MDB_WRITEMAP on the same environment do - * not cooperate well. - *
  • #MDB_NOMETASYNC - * Flush system buffers to disk only once per transaction, omit the - * metadata flush. Defer that until the system flushes files to disk, - * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization - * maintains database integrity, but a system crash may undo the last - * committed transaction. I.e. it preserves the ACI (atomicity, - * consistency, isolation) but not D (durability) database property. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_NOSYNC - * Don't flush system buffers to disk when committing a transaction. - * This optimization means a system crash can corrupt the database or - * lose the last transactions if buffers are not yet flushed to disk. - * The risk is governed by how often the system flushes dirty buffers - * to disk and how often #mdb_env_sync() is called. However, if the - * filesystem preserves write order and the #MDB_WRITEMAP flag is not - * used, transactions exhibit ACI (atomicity, consistency, isolation) - * properties and only lose D (durability). I.e. database integrity - * is maintained, but a system crash may undo the final transactions. - * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no - * hint for when to write transactions to disk, unless #mdb_env_sync() - * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_MAPASYNC - * When using #MDB_WRITEMAP, use asynchronous flushes to disk. - * As with #MDB_NOSYNC, a system crash can then corrupt the - * database or lose the last transactions. Calling #mdb_env_sync() - * ensures on-disk database integrity until next commit. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_NOTLS - * Don't use Thread-Local Storage. Tie reader locktable slots to - * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps - * the slot reseved for the #MDB_txn object. A thread may use parallel - * read-only transactions. A read-only transaction may span threads if - * the user synchronizes its use. Applications that multiplex many - * user threads over individual OS threads need this option. Such an - * application must also serialize the write transactions in an OS - * thread, since LMDB's write locking is unaware of the user threads. - *
  • #MDB_NOLOCK - * Don't do any locking. If concurrent access is anticipated, the - * caller must manage all concurrency itself. For proper operation - * the caller must enforce single-writer semantics, and must ensure - * that no readers are using old transactions while a writer is - * active. The simplest approach is to use an exclusive lock so that - * no readers may be active at all when a writer begins. - *
  • #MDB_NORDAHEAD - * Turn off readahead. Most operating systems perform readahead on - * read requests by default. This option turns it off if the OS - * supports it. Turning it off may help random read performance - * when the DB is larger than RAM and system RAM is full. - * The option is not implemented on Windows. - *
  • #MDB_NOMEMINIT - * Don't initialize malloc'd memory before writing to unused spaces - * in the data file. By default, memory for pages written to the data - * file is obtained using malloc. While these pages may be reused in - * subsequent transactions, freshly malloc'd pages will be initialized - * to zeroes before use. This avoids persisting leftover data from other - * code (that used the heap and subsequently freed the memory) into the - * data file. Note that many other system libraries may allocate - * and free memory from the heap for arbitrary uses. E.g., stdio may - * use the heap for file I/O buffers. This initialization step has a - * modest performance cost so some applications may want to disable - * it using this flag. This option can be a problem for applications - * which handle sensitive data like passwords, and it makes memory - * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP, - * which writes directly to the mmap instead of using malloc for pages. The - * initialization is also skipped if #MDB_RESERVE is used; the - * caller is expected to overwrite all of the memory that was - * reserved in that case. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
- * @param[in] mode The UNIX permissions to set on created files. This parameter - * is ignored on Windows. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the - * version that created the database environment. - *
  • #MDB_INVALID - the environment file headers are corrupted. - *
  • ENOENT - the directory specified by the path parameter doesn't exist. - *
  • EACCES - the user didn't have permission to access the environment files. - *
  • EAGAIN - the environment was locked by another process. - *
- */ -int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode); - - /** @brief Copy an LMDB environment to the specified path. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] path The directory in which the copy will reside. This - * directory must already exist and be writable but must otherwise be - * empty. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copy(MDB_env *env, const char *path); - - /** @brief Copy an LMDB environment to the specified file descriptor. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] fd The filedescriptor to write the copy to. It must - * have already been opened for Write access. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); - - /** @brief Copy an LMDB environment to the specified path, with options. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] path The directory in which the copy will reside. This - * directory must already exist and be writable but must otherwise be - * empty. - * @param[in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free - * pages and sequentially renumber all pages in output. This option - * consumes more CPU and runs more slowly than the default. - *
- * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags); - - /** @brief Copy an LMDB environment to the specified file descriptor, - * with options. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. See - * #mdb_env_copy2() for further details. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] fd The filedescriptor to write the copy to. It must - * have already been opened for Write access. - * @param[in] flags Special options for this operation. - * See #mdb_env_copy2() for options. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags); - - /** @brief Return statistics about the LMDB environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] stat The address of an #MDB_stat structure - * where the statistics will be copied - */ -int mdb_env_stat(MDB_env *env, MDB_stat *stat); - - /** @brief Return information about the LMDB environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] stat The address of an #MDB_envinfo structure - * where the information will be copied - */ -int mdb_env_info(MDB_env *env, MDB_envinfo *stat); - - /** @brief Flush the data buffers to disk. - * - * Data is always written to disk when #mdb_txn_commit() is called, - * but the operating system may keep it buffered. LMDB always flushes - * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] force If non-zero, force a synchronous flush. Otherwise - * if the environment has the #MDB_NOSYNC flag set the flushes - * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
  • EIO - an error occurred during synchronization. - *
- */ -int mdb_env_sync(MDB_env *env, int force); - - /** @brief Close the environment and release the memory map. - * - * Only a single thread may call this function. All transactions, databases, - * and cursors must already be closed before calling this function. Attempts to - * use any such handles after calling this function will cause a SIGSEGV. - * The environment handle will be freed and must not be used again after this call. - * @param[in] env An environment handle returned by #mdb_env_create() - */ -void mdb_env_close(MDB_env *env); - - /** @brief Set environment flags. - * - * This may be used to set some flags in addition to those from - * #mdb_env_open(), or to unset these flags. If several threads - * change the flags at the same time, the result is undefined. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] flags The flags to change, bitwise OR'ed together - * @param[in] onoff A non-zero value sets the flags, zero clears them. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_set_flags(MDB_env *env, unsigned int flags, int onoff); - - /** @brief Get environment flags. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] flags The address of an integer to store the flags - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_flags(MDB_env *env, unsigned int *flags); - - /** @brief Return the path that was used in #mdb_env_open(). - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] path Address of a string pointer to contain the path. This - * is the actual string in the environment, not a copy. It should not be - * altered in any way. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_path(MDB_env *env, const char **path); - - /** @brief Return the filedescriptor for the given environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); - - /** @brief Set the size of the memory map to use for this environment. - * - * The size should be a multiple of the OS page size. The default is - * 10485760 bytes. The size of the memory map is also the maximum size - * of the database. The value should be chosen as large as possible, - * to accommodate future growth of the database. - * This function should be called after #mdb_env_create() and before #mdb_env_open(). - * It may be called at later times if no transactions are active in - * this process. Note that the library does not check for this condition, - * the caller must ensure it explicitly. - * - * The new size takes effect immediately for the current process but - * will not be persisted to any others until a write transaction has been - * committed by the current process. Also, only mapsize increases are - * persisted into the environment. - * - * If the mapsize is increased by another process, and data has grown - * beyond the range of the current mapsize, #mdb_txn_begin() will - * return #MDB_MAP_RESIZED. This function may be called with a size - * of zero to adopt the new size. - * - * Any attempt to set a size smaller than the space already consumed - * by the environment will be silently changed to the current size of the used space. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] size The size in bytes - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment has - * an active write transaction. - *
- */ -int mdb_env_set_mapsize(MDB_env *env, size_t size); - - /** @brief Set the maximum number of threads/reader slots for the environment. - * - * This defines the number of slots in the lock table that is used to track readers in the - * environment. The default is 126. - * Starting a read-only transaction normally ties a lock table slot to the - * current thread until the environment closes or the thread exits. If - * MDB_NOTLS is in use, #mdb_txn_begin() instead ties the slot to the - * MDB_txn object until it or the #MDB_env object is destroyed. - * This function may only be called after #mdb_env_create() and before #mdb_env_open(). - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] readers The maximum number of reader lock table slots - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
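A minimal sketch (not part of this header) of the #MDB_MAP_RESIZED recovery path described above: passing a size of zero adopts the mapsize another process already committed. env is assumed to come from #mdb_env_open(), and error handling is abbreviated.

MDB_txn *txn;
int rc = mdb_txn_begin(env, NULL, 0, &txn);
if (rc == MDB_MAP_RESIZED) {
    /* Another process grew the map beyond our current mapsize:
     * adopt the new size, then retry the transaction. */
    mdb_env_set_mapsize(env, 0);
    rc = mdb_txn_begin(env, NULL, 0, &txn);
}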
    - *
  • EINVAL - an invalid parameter was specified, or the environment is already open. - *
- */ -int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers); - - /** @brief Get the maximum number of threads/reader slots for the environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] readers Address of an integer to store the number of readers - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers); - - /** @brief Set the maximum number of named databases for the environment. - * - * This function is only needed if multiple databases will be used in the - * environment. Simpler applications that use the environment as a single - * unnamed database can ignore this option. - * This function may only be called after #mdb_env_create() and before #mdb_env_open(). - * - * Currently a moderate number of slots are cheap but a huge number gets - * expensive: 7-120 words per transaction, and every #mdb_dbi_open() - * does a linear search of the opened slots. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] dbs The maximum number of databases - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment is already open. - *
- */ -int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); - - /** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. - * - * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. - * See @ref MDB_val. - * @param[in] env An environment handle returned by #mdb_env_create() - * @return The maximum size of a key we can write - */ -int mdb_env_get_maxkeysize(MDB_env *env); - - /** @brief Set application information associated with the #MDB_env. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] ctx An arbitrary pointer for whatever the application needs. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_set_userctx(MDB_env *env, void *ctx); - - /** @brief Get the application information associated with the #MDB_env. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @return The pointer set by #mdb_env_set_userctx(). - */ -void *mdb_env_get_userctx(MDB_env *env); - - /** @brief A callback function for most LMDB assert() failures, - * called before printing the message and aborting. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] msg The assertion message, not including newline. - */ -typedef void MDB_assert_func(MDB_env *env, const char *msg); - - /** Set or reset the assert() callback of the environment. - * Disabled if liblmdb is buillt with NDEBUG. - * @note This hack should become obsolete as lmdb's error handling matures. - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] func An #MDB_assert_func function, or 0. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); - - /** @brief Create a transaction for use with the environment. - * - * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). - * @note A transaction and its cursors must only be used by a single - * thread, and a thread may only have a single transaction at a time. - * If #MDB_NOTLS is in use, this does not apply to read-only transactions. - * @note Cursors may not span transactions. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] parent If this parameter is non-NULL, the new transaction - * will be a nested transaction, with the transaction indicated by \b parent - * as its parent. Transactions may be nested to any level. A parent - * transaction and its cursors may not issue any other operations than - * mdb_txn_commit and mdb_txn_abort while it has active child transactions. - * @param[in] flags Special options for this transaction. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
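Taken together, the sizing calls above are only valid in the window between #mdb_env_create() and #mdb_env_open(). A minimal sketch of that sequence, assuming liblmdb is linked in; open_env is a hypothetical helper and the sizes are arbitrary:

#include <lmdb.h>

/* Hypothetical helper, not part of liblmdb: configure and open an environment. */
static int open_env(const char *path, MDB_env **out)
{
    MDB_env *env;
    int rc = mdb_env_create(&env);
    if (rc != 0)
        return rc;
    /* These calls must run after mdb_env_create() and before mdb_env_open(). */
    rc = mdb_env_set_mapsize(env, (size_t)1 << 30);  /* 1 GiB map; a multiple of the OS page size */
    if (rc == 0)
        rc = mdb_env_set_maxreaders(env, 126);       /* the documented default */
    if (rc == 0)
        rc = mdb_env_set_maxdbs(env, 4);             /* only needed when named databases are used */
    if (rc == 0)
        rc = mdb_env_open(env, path, 0, 0664);
    if (rc != 0) {
        mdb_env_close(env);
        return rc;
    }
    *out = env;
    return 0;
}

A real application would size the map from its expected data volume, since the map size also caps the database size.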
    - *
  • #MDB_RDONLY - * This transaction will not perform any write operations. - *
- * @param[out] txn Address where the new #MDB_txn handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - *
  • #MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's - * mapsize and this environment's map must be resized as well. - * See #mdb_env_set_mapsize(). - *
  • #MDB_READERS_FULL - a read-only transaction was requested and - * the reader lock table is full. See #mdb_env_set_maxreaders(). - *
  • ENOMEM - out of memory. - *
- */ -int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn); - - /** @brief Returns the transaction's #MDB_env - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -MDB_env *mdb_txn_env(MDB_txn *txn); - - /** @brief Commit all the operations of a transaction into the database. - * - * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with #mdb_cursor_renew(). - * @note Earlier documentation incorrectly said all cursors would be freed. - * Only write-transactions free cursors. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
  • ENOSPC - no more disk space. - *
  • EIO - a low-level I/O error occurred while writing. - *
  • ENOMEM - out of memory. - *
- */ -int mdb_txn_commit(MDB_txn *txn); - - /** @brief Abandon all the operations of the transaction instead of saving them. - * - * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with #mdb_cursor_renew(). - * @note Earlier documentation incorrectly said all cursors would be freed. - * Only write-transactions free cursors. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -void mdb_txn_abort(MDB_txn *txn); - - /** @brief Reset a read-only transaction. - * - * Abort the transaction like #mdb_txn_abort(), but keep the transaction - * handle. #mdb_txn_renew() may reuse the handle. This saves allocation - * overhead if the process will start a new read-only transaction soon, - * and also locking overhead if #MDB_NOTLS is in use. The reader table - * lock is released, but the table slot stays tied to its thread or - * #MDB_txn. Use mdb_txn_abort() to discard a reset handle, and to free - * its lock table slot if MDB_NOTLS is in use. - * Cursors opened within the transaction must not be used - * again after this call, except with #mdb_cursor_renew(). - * Reader locks generally don't interfere with writers, but they keep old - * versions of database pages allocated. Thus they prevent the old pages - * from being reused when writers commit new data, and so under heavy load - * the database size may grow much more rapidly than otherwise. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -void mdb_txn_reset(MDB_txn *txn); - - /** @brief Renew a read-only transaction. - * - * This acquires a new reader lock for a transaction handle that had been - * released by #mdb_txn_reset(). It must be called before a reset transaction - * may be used again. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_txn_renew(MDB_txn *txn); - -/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ -#define mdb_open(txn,name,flags,dbi) mdb_dbi_open(txn,name,flags,dbi) -/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ -#define mdb_close(env,dbi) mdb_dbi_close(env,dbi) - - /** @brief Open a database in the environment. - * - * A database handle denotes the name and parameters of a database, - * independently of whether such a database exists. - * The database handle may be discarded by calling #mdb_dbi_close(). - * The old database handle is returned if the database was already open. - * The handle may only be closed once. - * The database handle will be private to the current transaction until - * the transaction is successfully committed. If the transaction is - * aborted the handle will be closed automatically. - * After a successful commit the - * handle will reside in the shared environment, and may be used - * by other transactions. This function must not be called from - * multiple concurrent transactions. A transaction that uses this function - * must finish (either commit or abort) before any other transaction may - * use this function. - * - * To use named databases (with name != NULL), #mdb_env_set_maxdbs() - * must be called before opening the environment. Database names - * are kept as keys in the unnamed database. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] name The name of the database to open. If only a single - * database is needed in the environment, this value may be NULL. - * @param[in] flags Special options for this database. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
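A compact sketch of the transaction lifecycle documented above, assuming env comes from #mdb_env_open(); the read-only branch shows how #mdb_txn_reset() and #mdb_txn_renew() reuse one handle. Illustrative only, error handling abbreviated.

MDB_txn *wtxn, *rtxn;
int rc;

/* Write transaction: commit or abort exactly once, then the handle is gone. */
rc = mdb_txn_begin(env, NULL, 0, &wtxn);
if (rc == 0) {
    /* ... mdb_put()/mdb_del() calls go here ... */
    rc = mdb_txn_commit(wtxn);        /* or mdb_txn_abort(wtxn) to discard the changes */
}

/* Read-only transaction: reset keeps the handle and its reader slot. */
rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &rtxn);
if (rc == 0) {
    /* ... reads ... */
    mdb_txn_reset(rtxn);              /* release the reader lock, keep the handle */
    rc = mdb_txn_renew(rtxn);         /* later: make the handle usable again */
    /* ... more reads ... */
    mdb_txn_abort(rtxn);              /* finally discard the handle */
}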
    - *
  • #MDB_REVERSEKEY - * Keys are strings to be compared in reverse order, from the end - * of the strings to the beginning. By default, keys are treated as strings and - * compared from beginning to end. - *
  • #MDB_DUPSORT - * Duplicate keys may be used in the database. (Or, from another perspective, - * keys may have multiple data items, stored in sorted order.) By default - * keys must be unique and may have only a single data item. - *
  • #MDB_INTEGERKEY - * Keys are binary integers in native byte order. Setting this option - * requires all keys to be the same size, typically sizeof(int) - * or sizeof(size_t). - *
  • #MDB_DUPFIXED - * This flag may only be used in combination with #MDB_DUPSORT. This option - * tells the library that the data items for this database are all the same - * size, which allows further optimizations in storage and retrieval. When - * all data items are the same size, the #MDB_GET_MULTIPLE and #MDB_NEXT_MULTIPLE - * cursor operations may be used to retrieve multiple items at once. - *
  • #MDB_INTEGERDUP - * This option specifies that duplicate data items are also integers, and - * should be sorted as such. - *
  • #MDB_REVERSEDUP - * This option specifies that duplicate data items should be compared as - * strings in reverse order. - *
  • #MDB_CREATE - * Create the named database if it doesn't exist. This option is not - * allowed in a read-only transaction or a read-only environment. - *
- * @param[out] dbi Address where the new #MDB_dbi handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - the specified database doesn't exist in the environment - * and #MDB_CREATE was not specified. - *
  • #MDB_DBS_FULL - too many databases have been opened. See #mdb_env_set_maxdbs(). - *
- */ -int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi); - - /** @brief Retrieve statistics for a database. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] stat The address of an #MDB_stat structure - * where the statistics will be copied - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
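Because a newly opened handle stays private until its transaction commits, #mdb_dbi_open() is typically wrapped as in the sketch below. env is assumed to be open, #mdb_env_set_maxdbs() already called, and "widgets" is an arbitrary example name.

MDB_txn *txn;
MDB_dbi dbi;
int rc = mdb_txn_begin(env, NULL, 0, &txn);
if (rc == 0) {
    rc = mdb_dbi_open(txn, "widgets", MDB_CREATE, &dbi);
    if (rc == 0)
        rc = mdb_txn_commit(txn);   /* after commit the handle is shared by the environment */
    else
        mdb_txn_abort(txn);         /* aborting also closes the just-opened handle */
}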
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); - - /** @brief Retrieve the DB flags for a database handle. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] flags Address where the flags will be returned. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags); - - /** @brief Close a database handle. Normally unnecessary. Use with care: - * - * This call is not mutex protected. Handles should only be closed by - * a single thread, and only if no other threads are going to reference - * the database handle or one of its cursors any further. Do not close - * a handle if an existing transaction has modified its database. - * Doing so can cause misbehavior from database corruption to errors - * like MDB_BAD_VALSIZE (since the DB name is gone). - * - * Closing a database handle is not necessary, but lets #mdb_dbi_open() - * reuse the handle value. Usually it's better to set a bigger - * #mdb_env_set_maxdbs(), unless that value would be large. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - */ -void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); - - /** @brief Empty or delete+close a database. - * - * See #mdb_dbi_close() for restrictions about closing the DB handle. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] del 0 to empty the DB, 1 to delete it from the - * environment and close the DB handle. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); - - /** @brief Set a custom key comparison function for a database. - * - * The comparison function is called whenever it is necessary to compare a - * key specified by the application with a key currently stored in the database. - * If no comparison function is specified, and no special key flags were specified - * with #mdb_dbi_open(), the keys are compared lexically, with shorter keys collating - * before longer keys. - * @warning This function must be called before any data access functions are used, - * otherwise data corruption may occur. The same comparison function must be used by every - * program accessing the database, every time the database is used. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] cmp A #MDB_cmp_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); - - /** @brief Set a custom data comparison function for a #MDB_DUPSORT database. - * - * This comparison function is called whenever it is necessary to compare a data - * item specified by the application with a data item currently stored in the database. - * This function only takes effect if the database was opened with the #MDB_DUPSORT - * flag. - * If no comparison function is specified, and no special key flags were specified - * with #mdb_dbi_open(), the data items are compared lexically, with shorter items collating - * before longer items. - * @warning This function must be called before any data access functions are used, - * otherwise data corruption may occur. The same comparison function must be used by every - * program accessing the database, every time the database is used. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] cmp A #MDB_cmp_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
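The warning above is easiest to honor by registering the comparator immediately after opening the handle and before any data access. A sketch of one possible #MDB_cmp_func and its registration, assuming <string.h> for memcpy, that every key in this database is exactly sizeof(unsigned int) bytes, and that txn/dbi already exist; purely illustrative.

static int cmp_uint(const MDB_val *a, const MDB_val *b)
{
    unsigned int ua, ub;
    memcpy(&ua, a->mv_data, sizeof(ua));   /* copy out to avoid unaligned access */
    memcpy(&ub, b->mv_data, sizeof(ub));
    return (ua > ub) - (ua < ub);
}

/* Right after mdb_dbi_open(), before any data access in this database: */
rc = mdb_set_compare(txn, dbi, cmp_uint);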
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); - - /** @brief Set a relocation function for a #MDB_FIXEDMAP database. - * - * @todo The relocation function is called whenever it is necessary to move the data - * of an item to a different position in the database (e.g. through tree - * balancing operations, shifts as a result of adds or deletes, etc.). It is - * intended to allow address/position-dependent data items to be stored in - * a database in an environment opened with the #MDB_FIXEDMAP option. - * Currently the relocation feature is unimplemented and setting - * this function has no effect. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] rel A #MDB_rel_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); - - /** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation function. - * - * See #mdb_set_relfunc and #MDB_rel_func for more details. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] ctx An arbitrary pointer for whatever the application needs. - * It will be passed to the callback function set by #mdb_set_relfunc - * as its \b relctx parameter whenever the callback is invoked. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); - - /** @brief Get items from a database. - * - * This function retrieves key/data pairs from the database. The address - * and length of the data associated with the specified \b key are returned - * in the structure to which \b data refers. - * If the database supports duplicate keys (#MDB_DUPSORT) then the - * first data item for the key will be returned. Retrieval of other - * items requires the use of #mdb_cursor_get(). - * - * @note The memory pointed to by the returned values is owned by the - * database. The caller need not dispose of the memory, and may not - * modify it in any way. For values returned in a read-only transaction - * any modification attempts will cause a SIGSEGV. - * @note Values returned from the database are valid only until a - * subsequent update operation, or the end of the transaction. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to search for in the database - * @param[out] data The data corresponding to the key - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - the key was not in the database. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); - - /** @brief Store items into a database. - * - * This function stores key/data pairs in the database. The default behavior - * is to enter the new key/data pair, replacing any previously existing key - * if duplicates are disallowed, or adding a duplicate data item if - * duplicates are allowed (#MDB_DUPSORT). - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to store in the database - * @param[in,out] data The data to store - * @param[in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
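A lookup sketch for #mdb_get() inside a read-only transaction; env and dbi are assumed to exist and the key bytes are arbitrary. As noted above, the returned memory belongs to the database and is valid only until the next update or the end of the transaction.

MDB_txn *txn;
MDB_val key, data;
int rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
if (rc == 0) {
    key.mv_size = 5;
    key.mv_data = "alpha";
    rc = mdb_get(txn, dbi, &key, &data);
    if (rc == 0) {
        /* data.mv_data / data.mv_size now describe the stored value */
    } else if (rc == MDB_NOTFOUND) {
        /* key absent */
    }
    mdb_txn_abort(txn);   /* read-only: nothing to commit */
}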
    - *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be specified - * if the database was opened with #MDB_DUPSORT. The function will - * return #MDB_KEYEXIST if the key/data pair already appears in the - * database. - *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * #MDB_KEYEXIST if the key already appears in the database, even if - * the database supports duplicates (#MDB_DUPSORT). The \b data - * parameter will be set to point to the existing item. - *
  • #MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. - * LMDB does nothing else with this memory, the caller is expected - * to modify all of the space requested. - *
  • #MDB_APPEND - append the given key/data pair to the end of the - * database. No key comparisons are performed. This option allows - * fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * data corruption. - *
  • #MDB_APPENDDUP - as above, but for sorted dup data. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). - *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, - unsigned int flags); - - /** @brief Delete items from a database. - * - * This function removes key/data pairs from the database. - * If the database does not support sorted duplicate data items - * (#MDB_DUPSORT) the data parameter is ignored. - * If the database supports sorted duplicates and the data parameter - * is NULL, all of the duplicate data items for the key will be - * deleted. Otherwise, if the data parameter is non-NULL - * only the matching data item will be deleted. - * This function will return #MDB_NOTFOUND if the specified key/data - * pair is not in the database. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to delete from the database - * @param[in] data The data to delete - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
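An insert-if-absent sketch using the #MDB_NOOVERWRITE flag described above, inside an existing write transaction; key and value bytes are arbitrary and txn/dbi are assumed to exist.

MDB_val key, val;
key.mv_size = 5;  key.mv_data = "hello";
val.mv_size = 5;  val.mv_data = "world";
rc = mdb_put(txn, dbi, &key, &val, MDB_NOOVERWRITE);
if (rc == MDB_KEYEXIST) {
    /* val was updated to point at the item already stored under this key */
} else if (rc != 0) {
    /* e.g. MDB_MAP_FULL, MDB_TXN_FULL, EACCES, EINVAL */
}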
    - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); - - /** @brief Create a cursor handle. - * - * A cursor is associated with a specific transaction and database. - * A cursor cannot be used when its database handle is closed. Nor - * when its transaction has ended, except with #mdb_cursor_renew(). - * It can be discarded with #mdb_cursor_close(). - * A cursor in a write-transaction can be closed before its transaction - * ends, and will otherwise be closed when its transaction ends. - * A cursor in a read-only transaction must be closed explicitly, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * @note Earlier documentation said that cursors in every transaction - * were closed when the transaction committed or aborted. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] cursor Address where the new #MDB_cursor handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
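A short sketch of the two #mdb_del() forms described above for an #MDB_DUPSORT database; txn, dbi, key and val are assumed to exist.

/* Delete every duplicate data item stored under key: */
rc = mdb_del(txn, dbi, &key, NULL);

/* Delete only the one matching key/data pair: */
rc = mdb_del(txn, dbi, &key, &val);
/* rc is MDB_NOTFOUND if the pair was not present. */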
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); - - /** @brief Close a cursor handle. - * - * The cursor handle will be freed and must not be used again after this call. - * Its transaction must still be live if it is a write-transaction. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -void mdb_cursor_close(MDB_cursor *cursor); - - /** @brief Renew a cursor handle. - * - * A cursor is associated with a specific transaction and database. - * Cursors that are only used in read-only - * transactions may be re-used, to avoid unnecessary malloc/free overhead. - * The cursor may be associated with a new read-only transaction, and - * referencing the same database handle as it was created with. - * This may be done whether the previous transaction is live or dead. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); - - /** @brief Return the cursor's transaction handle. - * - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -MDB_txn *mdb_cursor_txn(MDB_cursor *cursor); - - /** @brief Return the cursor's database handle. - * - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -MDB_dbi mdb_cursor_dbi(MDB_cursor *cursor); - - /** @brief Retrieve by cursor. - * - * This function retrieves key/data pairs from the database. The address and length - * of the key are returned in the object to which \b key refers (except for the - * case of the #MDB_SET option, in which the \b key object is unchanged), and - * the address and length of the data are returned in the object to which \b data - * refers. - * See #mdb_get() for restrictions on using the output values. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in,out] key The key for a retrieved item - * @param[in,out] data The data of a retrieved item - * @param[in] op A cursor operation #MDB_cursor_op - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - no matching key found. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - MDB_cursor_op op); - - /** @brief Store by cursor. - * - * This function stores key/data pairs into the database. - * The cursor is positioned at the new item, or on failure usually near it. - * @note Earlier documentation incorrectly said errors would leave the - * state of the cursor unchanged. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in] key The key operated on. - * @param[in] data The data operated on. - * @param[in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - *
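A full-scan sketch with the cursor functions above; MDB_FIRST and MDB_NEXT are #MDB_cursor_op values declared earlier in this header, and txn/dbi are assumed to exist.

MDB_cursor *cur;
MDB_val key, data;
int rc = mdb_cursor_open(txn, dbi, &cur);
if (rc == 0) {
    for (rc = mdb_cursor_get(cur, &key, &data, MDB_FIRST);
         rc == 0;
         rc = mdb_cursor_get(cur, &key, &data, MDB_NEXT)) {
        /* key and data are valid until the next update or the end of txn */
    }
    /* rc is MDB_NOTFOUND once the scan is exhausted */
    mdb_cursor_close(cur);
}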
    - *
  • #MDB_CURRENT - replace the item at the current cursor position. - * The \b key parameter must still be provided, and must match it. - * If using sorted duplicates (#MDB_DUPSORT) the data item must still - * sort into the same place. This is intended to be used when the - * new data is the same size as the old. Otherwise it will simply - * perform a delete of the old record followed by an insert. - *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be specified - * if the database was opened with #MDB_DUPSORT. The function will - * return #MDB_KEYEXIST if the key/data pair already appears in the - * database. - *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * #MDB_KEYEXIST if the key already appears in the database, even if - * the database supports duplicates (#MDB_DUPSORT). - *
  • #MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later. This saves - * an extra memcpy if the data is being generated later. - *
  • #MDB_APPEND - append the given key/data pair to the end of the - * database. No key comparisons are performed. This option allows - * fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * data corruption. - *
  • #MDB_APPENDDUP - as above, but for sorted dup data. - *
  • #MDB_MULTIPLE - store multiple contiguous data elements in a - * single request. This flag may only be specified if the database - * was opened with #MDB_DUPFIXED. The \b data argument must be an - * array of two MDB_vals. The mv_size of the first MDB_val must be - * the size of a single data element. The mv_data of the first MDB_val - * must point to the beginning of the array of contiguous data elements. - * The mv_size of the second MDB_val must be the count of the number - * of data elements to store. On return this field will be set to - * the count of the number of elements actually written. The mv_data - * of the second MDB_val is unused. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). - *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. - *
  • EACCES - an attempt was made to modify a read-only database. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - unsigned int flags); - - /** @brief Delete current key/data pair - * - * This function deletes the key/data pair to which the cursor refers. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - *
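A bulk-load sketch using the #MDB_APPEND flag with #mdb_cursor_put(); bulk_load, keys, vals and n are hypothetical names, and the caller must guarantee the keys are already in sorted order.

/* Hypothetical bulk loader: keys[] must already be sorted in key order. */
static int bulk_load(MDB_cursor *cur, MDB_val *keys, MDB_val *vals, size_t n)
{
    size_t i;
    int rc = 0;
    for (i = 0; i < n && rc == 0; i++)
        rc = mdb_cursor_put(cur, &keys[i], &vals[i], MDB_APPEND);
    return rc;   /* unsorted input would corrupt the database, per the flag notes above */
}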
    - *
  • #MDB_NODUPDATA - delete all of the data items for the current key. - * This flag may only be specified if the database was opened with #MDB_DUPSORT. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EACCES - an attempt was made to modify a read-only database. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags); - - /** @brief Return count of duplicates for current key. - * - * This call is only valid on databases that support sorted duplicate - * data items #MDB_DUPSORT. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[out] countp Address where the count will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - cursor is not initialized, or an invalid parameter was specified. - *
- */ -int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); - - /** @brief Compare two data items according to a particular database. - * - * This returns a comparison as if the two data items were keys in the - * specified database. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] a The first item to compare - * @param[in] b The second item to compare - * @return < 0 if a < b, 0 if a == b, > 0 if a > b - */ -int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); - - /** @brief Compare two data items according to a particular database. - * - * This returns a comparison as if the two items were data items of - * the specified database. The database must have the #MDB_DUPSORT flag. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] a The first item to compare - * @param[in] b The second item to compare - * @return < 0 if a < b, 0 if a == b, > 0 if a > b - */ -int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); - - /** @brief A callback function used to print a message from the library. - * - * @param[in] msg The string to be printed. - * @param[in] ctx An arbitrary context pointer for the callback. - * @return < 0 on failure, >= 0 on success. - */ -typedef int (MDB_msg_func)(const char *msg, void *ctx); - - /** @brief Dump the entries in the reader lock table. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] func A #MDB_msg_func function - * @param[in] ctx Anything the message function needs - * @return < 0 on failure, >= 0 on success. - */ -int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); - - /** @brief Check for stale entries in the reader lock table. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] dead Number of stale slots that were cleared - * @return 0 on success, non-zero on failure. - */ -int mdb_reader_check(MDB_env *env, int *dead); -/** @} */ - -#ifdef __cplusplus -} -#endif -/** @page tools LMDB Command Line Tools - The following describes the command line tools that are available for LMDB. - \li \ref mdb_copy_1 - \li \ref mdb_dump_1 - \li \ref mdb_load_1 - \li \ref mdb_stat_1 -*/ - -#endif /* _LMDB_H_ */ diff --git a/vendor/github.com/szferi/gomdb/mdb.c b/vendor/github.com/szferi/gomdb/mdb.c deleted file mode 100644 index c87886da074d..000000000000 --- a/vendor/github.com/szferi/gomdb/mdb.c +++ /dev/null @@ -1,9364 +0,0 @@ -/** @file mdb.c - * @brief Lightning memory-mapped database library - * - * A Btree-based database management library modeled loosely on the - * BerkeleyDB API, but much simplified. - */ -/* - * Copyright 2011-2014 Howard Chu, Symas Corp. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - * - * This code is derived from btree.c written by Martin Hedenfalk. - * - * Copyright (c) 2009, 2010 Martin Hedenfalk - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif -#ifdef _WIN32 -#include -#include -/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it - * as int64 which is wrong. MSVC doesn't define it at all, so just - * don't use it. - */ -#define MDB_PID_T int -#define MDB_THR_T DWORD -#include -#include -#ifdef __GNUC__ -# include -#else -# define LITTLE_ENDIAN 1234 -# define BIG_ENDIAN 4321 -# define BYTE_ORDER LITTLE_ENDIAN -# ifndef SSIZE_MAX -# define SSIZE_MAX INT_MAX -# endif -#endif -#else -#include -#include -#define MDB_PID_T pid_t -#define MDB_THR_T pthread_t -#include -#include -#include -#ifdef HAVE_SYS_FILE_H -#include -#endif -#include -#endif - -#if defined(__mips) && defined(__linux) -/* MIPS has cache coherency issues, requires explicit cache control */ -#include -extern int cacheflush(char *addr, int nbytes, int cache); -#define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) -#else -#define CACHEFLUSH(addr, bytes, cache) -#endif - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__sun) -/* Most platforms have posix_memalign, older may only have memalign */ -#define HAVE_MEMALIGN 1 -#include -#endif - -#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) -#include -#include /* defines BYTE_ORDER on HPUX and Solaris */ -#endif - -#if defined(__APPLE__) || defined (BSD) -# define MDB_USE_POSIX_SEM 1 -# define MDB_FDATASYNC fsync -#elif defined(ANDROID) -# define MDB_FDATASYNC fsync -#endif - -#ifndef _WIN32 -#include -#ifdef MDB_USE_POSIX_SEM -# define MDB_USE_HASH 1 -#include -#endif -#endif - -#ifdef USE_VALGRIND -#include -#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) -#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) -#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) -#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) -#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) -#else -#define VGMEMP_CREATE(h,r,z) -#define VGMEMP_ALLOC(h,a,s) -#define VGMEMP_FREE(h,a) -#define VGMEMP_DESTROY(h) -#define VGMEMP_DEFINED(a,s) -#endif - -#ifndef BYTE_ORDER -# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) -/* Solaris just defines one or the other */ -# define LITTLE_ENDIAN 1234 -# define BIG_ENDIAN 4321 -# ifdef _LITTLE_ENDIAN -# define BYTE_ORDER LITTLE_ENDIAN -# else -# define BYTE_ORDER BIG_ENDIAN -# endif -# else -# define BYTE_ORDER __BYTE_ORDER -# endif -#endif - -#ifndef LITTLE_ENDIAN -#define LITTLE_ENDIAN __LITTLE_ENDIAN -#endif -#ifndef BIG_ENDIAN -#define BIG_ENDIAN __BIG_ENDIAN -#endif - -#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -#define MISALIGNED_OK 1 -#endif - -#include "lmdb.h" -#include "midl.h" - -#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) -# error "Unknown or unsupported endianness (BYTE_ORDER)" -#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF -# error "Two's complement, reasonably sized integer types, 
please" -#endif - -#ifdef __GNUC__ -/** Put infrequently used env functions in separate section */ -# ifdef __APPLE__ -# define ESECT __attribute__ ((section("__TEXT,text_env"))) -# else -# define ESECT __attribute__ ((section("text_env"))) -# endif -#else -#define ESECT -#endif - -/** @defgroup internal LMDB Internals - * @{ - */ -/** @defgroup compat Compatibility Macros - * A bunch of macros to minimize the amount of platform-specific ifdefs - * needed throughout the rest of the code. When the features this library - * needs are similar enough to POSIX to be hidden in a one-or-two line - * replacement, this macro approach is used. - * @{ - */ - - /** Features under development */ -#ifndef MDB_DEVEL -#define MDB_DEVEL 0 -#endif - - /** Wrapper around __func__, which is a C99 feature */ -#if __STDC_VERSION__ >= 199901L -# define mdb_func_ __func__ -#elif __GNUC__ >= 2 || _MSC_VER >= 1300 -# define mdb_func_ __FUNCTION__ -#else -/* If a debug message says (), update the #if statements above */ -# define mdb_func_ "" -#endif - -#ifdef _WIN32 -#define MDB_USE_HASH 1 -#define MDB_PIDLOCK 0 -#define THREAD_RET DWORD -#define pthread_t HANDLE -#define pthread_mutex_t HANDLE -#define pthread_cond_t HANDLE -#define pthread_key_t DWORD -#define pthread_self() GetCurrentThreadId() -#define pthread_key_create(x,y) \ - ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) -#define pthread_key_delete(x) TlsFree(x) -#define pthread_getspecific(x) TlsGetValue(x) -#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) -#define pthread_mutex_unlock(x) ReleaseMutex(*x) -#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) -#define pthread_cond_signal(x) SetEvent(*x) -#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) -#define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL) -#define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE) -#define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_rmutex) -#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_rmutex) -#define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_wmutex) -#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_wmutex) -#define getpid() GetCurrentProcessId() -#define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) -#define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) -#define ErrCode() GetLastError() -#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} -#define close(fd) (CloseHandle(fd) ? 
0 : -1) -#define munmap(ptr,len) UnmapViewOfFile(ptr) -#ifdef PROCESS_QUERY_LIMITED_INFORMATION -#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION -#else -#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 -#endif -#define Z "I" -#else -#define THREAD_RET void * -#define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) -#define THREAD_FINISH(thr) pthread_join(thr,NULL) -#define Z "z" /**< printf format modifier for size_t */ - - /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ -#define MDB_PIDLOCK 1 - -#ifdef MDB_USE_POSIX_SEM - -#define LOCK_MUTEX_R(env) mdb_sem_wait((env)->me_rmutex) -#define UNLOCK_MUTEX_R(env) sem_post((env)->me_rmutex) -#define LOCK_MUTEX_W(env) mdb_sem_wait((env)->me_wmutex) -#define UNLOCK_MUTEX_W(env) sem_post((env)->me_wmutex) - -static int -mdb_sem_wait(sem_t *sem) -{ - int rc; - while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; - return rc; -} - -#else - /** Lock the reader mutex. - */ -#define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_txns->mti_mutex) - /** Unlock the reader mutex. - */ -#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_txns->mti_mutex) - - /** Lock the writer mutex. - * Only a single write transaction is allowed at a time. Other writers - * will block waiting for this mutex. - */ -#define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_txns->mti_wmutex) - /** Unlock the writer mutex. - */ -#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_txns->mti_wmutex) -#endif /* MDB_USE_POSIX_SEM */ - - /** Get the error code for the last failed system function. - */ -#define ErrCode() errno - - /** An abstraction for a file handle. - * On POSIX systems file handles are small integers. On Windows - * they're opaque pointers. - */ -#define HANDLE int - - /** A value for an invalid file handle. - * Mainly used to initialize file variables and signify that they are - * unused. - */ -#define INVALID_HANDLE_VALUE (-1) - - /** Get the size of a memory page for the system. - * This is the basic size that the platform's memory manager uses, and is - * fundamental to the use of memory-mapped files. - */ -#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) -#endif - -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) -#define MNAME_LEN 32 -#else -#define MNAME_LEN (sizeof(pthread_mutex_t)) -#endif - -/** @} */ - -#ifndef _WIN32 -/** A flag for opening a file and requesting synchronous data writes. - * This is only used when writing a meta page. It's not strictly needed; - * we could just do a normal write and then immediately perform a flush. - * But if this flag is available it saves us an extra system call. - * - * @note If O_DSYNC is undefined but exists in /usr/include, - * preferably set some compiler flag to get the definition. - * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC. - */ -#ifndef MDB_DSYNC -# define MDB_DSYNC O_DSYNC -#endif -#endif - -/** Function for flushing the data of a file. Define this to fsync - * if fdatasync() is not supported. - */ -#ifndef MDB_FDATASYNC -# define MDB_FDATASYNC fdatasync -#endif - -#ifndef MDB_MSYNC -# define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) -#endif - -#ifndef MS_SYNC -#define MS_SYNC 1 -#endif - -#ifndef MS_ASYNC -#define MS_ASYNC 0 -#endif - - /** A page number in the database. 
- * Note that 64 bit page numbers are overkill, since pages themselves - * already represent 12-13 bits of addressable memory, and the OS will - * always limit applications to a maximum of 63 bits of address space. - * - * @note In the #MDB_node structure, we only store 48 bits of this value, - * which thus limits us to only 60 bits of addressable data. - */ -typedef MDB_ID pgno_t; - - /** A transaction ID. - * See struct MDB_txn.mt_txnid for details. - */ -typedef MDB_ID txnid_t; - -/** @defgroup debug Debug Macros - * @{ - */ -#ifndef MDB_DEBUG - /** Enable debug output. Needs variable argument macros (a C99 feature). - * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs - * read from and written to the database (used for free space management). - */ -#define MDB_DEBUG 0 -#endif - -#if MDB_DEBUG -static int mdb_debug; -static txnid_t mdb_debug_start; - - /** Print a debug message with printf formatting. - * Requires double parenthesis around 2 or more args. - */ -# define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args)) -# define DPRINTF0(fmt, ...) \ - fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) -#else -# define DPRINTF(args) ((void) 0) -#endif - /** Print a debug string. - * The string is printed literally, with no format processing. - */ -#define DPUTS(arg) DPRINTF(("%s", arg)) - /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ -#define DDBI(mc) \ - (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) -/** @} */ - - /** @brief The maximum size of a database page. - * - * It is 32k or 64k, since value-PAGEBASE must fit in - * #MDB_page.%mp_upper. - * - * LMDB will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. - * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. - */ -#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) - - /** The minimum number of keys required in a database page. - * Setting this to a larger value will place a smaller bound on the - * maximum size of a data item. Data items larger than this size will - * be pushed into overflow pages instead of being stored directly in - * the B-tree node. This value used to default to 4. With a page size - * of 4096 bytes that meant that any item larger than 1024 bytes would - * go into an overflow page. That also meant that on average 2-3KB of - * each overflow page was wasted space. The value cannot be lower than - * 2 because then there would no longer be a tree structure. With this - * value, items larger than 2KB will go into overflow pages, and on - * average only 1KB will be wasted. - */ -#define MDB_MINKEYS 2 - - /** A stamp that identifies a file as an LMDB file. - * There's nothing special about this value other than that it is easily - * recognizable, and it will reflect any byte order mismatches. - */ -#define MDB_MAGIC 0xBEEFC0DE - - /** The version number for a database's datafile format. */ -#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) - /** The version number for a database's lockfile format. */ -#define MDB_LOCK_VERSION 1 - - /** @brief The max size of a key we can write, or 0 for dynamic max. 
- * - * Define this as 0 to compute the max from the page size. 511 - * is default for backwards compat: liblmdb <= 0.9.10 can break - * when modifying a DB with keys/dupsort data bigger than its max. - * #MDB_DEVEL sets the default to 0. - * - * Data items in an #MDB_DUPSORT database are also limited to - * this size, since they're actually keys of a sub-DB. Keys and - * #MDB_DUPSORT data items must fit on a node in a regular page. - */ -#ifndef MDB_MAXKEYSIZE -#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) -#endif - - /** The maximum size of a key we can write to the environment. */ -#if MDB_MAXKEYSIZE -#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) -#else -#define ENV_MAXKEY(env) ((env)->me_maxkey) -#endif - - /** @brief The maximum size of a data item. - * - * We only store a 32 bit value for node sizes. - */ -#define MAXDATASIZE 0xffffffffUL - -#if MDB_DEBUG - /** Key size which fits in a #DKBUF. - * @ingroup debug - */ -#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) - /** A key buffer. - * @ingroup debug - * This is used for printing a hex dump of a key's contents. - */ -#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] - /** Display a key in hex. - * @ingroup debug - * Invoke a function to display a key in hex. - */ -#define DKEY(x) mdb_dkey(x, kbuf) -#else -#define DKBUF -#define DKEY(x) 0 -#endif - - /** An invalid page number. - * Mainly used to denote an empty tree. - */ -#define P_INVALID (~(pgno_t)0) - - /** Test if the flags \b f are set in a flag word \b w. */ -#define F_ISSET(w, f) (((w) & (f)) == (f)) - - /** Round \b n up to an even number. */ -#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ - - /** Used for offsets within a single page. - * Since memory pages are typically 4 or 8KB in size, 12-13 bits, - * this is plenty. - */ -typedef uint16_t indx_t; - - /** Default size of memory map. - * This is certainly too small for any actual applications. Apps should always set - * the size explicitly using #mdb_env_set_mapsize(). - */ -#define DEFAULT_MAPSIZE 1048576 - -/** @defgroup readers Reader Lock Table - * Readers don't acquire any locks for their data access. Instead, they - * simply record their transaction ID in the reader table. The reader - * mutex is needed just to find an empty slot in the reader table. The - * slot's address is saved in thread-specific data so that subsequent read - * transactions started by the same thread need no further locking to proceed. - * - * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. - * - * No reader table is used if the database is on a read-only filesystem, or - * if #MDB_NOLOCK is set. - * - * Since the database uses multi-version concurrency control, readers don't - * actually need any locking. This table is used to keep track of which - * readers are using data from which old transactions, so that we'll know - * when a particular old transaction is no longer in use. Old transactions - * that have discarded any data pages can then have those pages reclaimed - * for use by a later write transaction. - * - * The lock table is constructed such that reader slots are aligned with the - * processor's cache line size. Any slot is only ever used by one thread. - * This alignment guarantees that there will be no contention or cache - * thrashing as threads update their own slot info, and also eliminates - * any need for locking when accessing a slot. - * - * A writer thread will scan every slot in the table to determine the oldest - * outstanding reader transaction. 
Any freed pages older than this will be - * reclaimed by the writer. The writer doesn't use any locks when scanning - * this table. This means that there's no guarantee that the writer will - * see the most up-to-date reader info, but that's not required for correct - * operation - all we need is to know the upper bound on the oldest reader, - * we don't care at all about the newest reader. So the only consequence of - * reading stale information here is that old pages might hang around a - * while longer before being reclaimed. That's actually good anyway, because - * the longer we delay reclaiming old pages, the more likely it is that a - * string of contiguous pages can be found after coalescing old pages from - * many old transactions together. - * @{ - */ - /** Number of slots in the reader table. - * This value was chosen somewhat arbitrarily. 126 readers plus a - * couple mutexes fit exactly into 8KB on my development machine. - * Applications should set the table size using #mdb_env_set_maxreaders(). - */ -#define DEFAULT_READERS 126 - - /** The size of a CPU cache line in bytes. We want our lock structures - * aligned to this size to avoid false cache line sharing in the - * lock table. - * This value works for most CPUs. For Itanium this should be 128. - */ -#ifndef CACHELINE -#define CACHELINE 64 -#endif - - /** The information we store in a single slot of the reader table. - * In addition to a transaction ID, we also record the process and - * thread ID that owns a slot, so that we can detect stale information, - * e.g. threads or processes that went away without cleaning up. - * @note We currently don't check for stale records. We simply re-init - * the table when we know that we're the only process opening the - * lock file. - */ -typedef struct MDB_rxbody { - /** Current Transaction ID when this transaction began, or (txnid_t)-1. - * Multiple readers that start at the same time will probably have the - * same ID here. Again, it's not important to exclude them from - * anything; all we need to know is which version of the DB they - * started from so we can avoid overwriting any data used in that - * particular version. - */ - txnid_t mrb_txnid; - /** The process ID of the process owning this reader txn. */ - MDB_PID_T mrb_pid; - /** The thread ID of the thread owning this txn. */ - MDB_THR_T mrb_tid; -} MDB_rxbody; - - /** The actual reader record, with cacheline padding. */ -typedef struct MDB_reader { - union { - MDB_rxbody mrx; - /** shorthand for mrb_txnid */ -#define mr_txnid mru.mrx.mrb_txnid -#define mr_pid mru.mrx.mrb_pid -#define mr_tid mru.mrx.mrb_tid - /** cache line alignment */ - char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; - } mru; -} MDB_reader; - - /** The header for the reader table. - * The table resides in a memory-mapped file. (This is a different file - * than is used for the main database.) - * - * For POSIX the actual mutexes reside in the shared memory of this - * mapped file. On Windows, mutexes are named objects allocated by the - * kernel; we store the mutex names in this mapped file so that other - * processes can grab them. This same approach is also used on - * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support - * process-shared POSIX mutexes. For these cases where a named object - * is used, the object name is derived from a 64 bit FNV hash of the - * environment pathname. As such, naming collisions are extremely - * unlikely. If a collision occurs, the results are unpredictable. 
- */ -typedef struct MDB_txbody { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ - uint32_t mtb_magic; - /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ - uint32_t mtb_format; -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) - char mtb_rmname[MNAME_LEN]; -#else - /** Mutex protecting access to this table. - * This is the reader lock that #LOCK_MUTEX_R acquires. - */ - pthread_mutex_t mtb_mutex; -#endif - /** The ID of the last transaction committed to the database. - * This is recorded here only for convenience; the value can always - * be determined by reading the main database meta pages. - */ - txnid_t mtb_txnid; - /** The number of slots that have been used in the reader table. - * This always records the maximum count, it is not decremented - * when readers release their slots. - */ - unsigned mtb_numreaders; -} MDB_txbody; - - /** The actual reader table definition. */ -typedef struct MDB_txninfo { - union { - MDB_txbody mtb; -#define mti_magic mt1.mtb.mtb_magic -#define mti_format mt1.mtb.mtb_format -#define mti_mutex mt1.mtb.mtb_mutex -#define mti_rmname mt1.mtb.mtb_rmname -#define mti_txnid mt1.mtb.mtb_txnid -#define mti_numreaders mt1.mtb.mtb_numreaders - char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; - } mt1; - union { -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) - char mt2_wmname[MNAME_LEN]; -#define mti_wmname mt2.mt2_wmname -#else - pthread_mutex_t mt2_wmutex; -#define mti_wmutex mt2.mt2_wmutex -#endif - char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; - } mt2; - MDB_reader mti_readers[1]; -} MDB_txninfo; - - /** Lockfile format signature: version, features and field layout */ -#define MDB_LOCK_FORMAT \ - ((uint32_t) \ - ((MDB_LOCK_VERSION) \ - /* Flags which describe functionality */ \ - + (((MDB_PIDLOCK) != 0) << 16))) -/** @} */ - -/** Common header for all page types. - * Overflow records occupy a number of contiguous pages with no - * headers on any page after the first. - */ -typedef struct MDB_page { -#define mp_pgno mp_p.p_pgno -#define mp_next mp_p.p_next - union { - pgno_t p_pgno; /**< page number */ - struct MDB_page *p_next; /**< for in-memory list of freed pages */ - } mp_p; - uint16_t mp_pad; -/** @defgroup mdb_page Page Flags - * @ingroup internal - * Flags for the page headers. 
- * @{ - */ -#define P_BRANCH 0x01 /**< branch page */ -#define P_LEAF 0x02 /**< leaf page */ -#define P_OVERFLOW 0x04 /**< overflow page */ -#define P_META 0x08 /**< meta page */ -#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ -#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ -#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ -#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ -#define P_KEEP 0x8000 /**< leave this page alone during spill */ -/** @} */ - uint16_t mp_flags; /**< @ref mdb_page */ -#define mp_lower mp_pb.pb.pb_lower -#define mp_upper mp_pb.pb.pb_upper -#define mp_pages mp_pb.pb_pages - union { - struct { - indx_t pb_lower; /**< lower bound of free space */ - indx_t pb_upper; /**< upper bound of free space */ - } pb; - uint32_t pb_pages; /**< number of overflow pages */ - } mp_pb; - indx_t mp_ptrs[1]; /**< dynamic size */ -} MDB_page; - - /** Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) - - /** Address of first usable data byte in a page, after the header */ -#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) - - /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) - - /** Number of nodes on a page */ -#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1) - - /** The amount of space remaining in the page */ -#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) - - /** The percentage of space used in the page, in tenths of a percent. */ -#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ - ((env)->me_psize - PAGEHDRSZ)) - /** The minimum page fill factor, in tenths of a percent. - * Pages emptier than this are candidates for merging. - */ -#define FILL_THRESHOLD 250 - - /** Test if a page is a leaf page */ -#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) - /** Test if a page is a LEAF2 page */ -#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) - /** Test if a page is a branch page */ -#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) - /** Test if a page is an overflow page */ -#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) - /** Test if a page is a sub page */ -#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) - - /** The number of overflow pages needed to store the given size. */ -#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) - - /** Link in #MDB_txn.%mt_loose_pgs list */ -#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) - - /** Header for a single key/data pair within a page. - * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. - * We guarantee 2-byte alignment for 'MDB_node's. - */ -typedef struct MDB_node { - /** lo and hi are used for data size on leaf nodes and for - * child pgno on branch nodes. On 64 bit platforms, flags - * is also used for pgno. (Branch nodes have no flags). - * They are in host byte order in case that lets some - * accesses be optimized into a 32-bit word access. - */ -#if BYTE_ORDER == LITTLE_ENDIAN - unsigned short mn_lo, mn_hi; /**< part of data size or pgno */ -#else - unsigned short mn_hi, mn_lo; -#endif -/** @defgroup mdb_node Node Flags - * @ingroup internal - * Flags for node headers. 
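Two of the size macros above are easy to check by hand: OVPAGES() is a ceiling division counting how many pages an overflow value occupies including its header page, and PAGEFILL() reports usage in tenths of a percent against the FILL_THRESHOLD of 250. A standalone sketch with an assumed 4096-byte page and an illustrative 16-byte header (the real PAGEHDRSZ comes from offsetof(MDB_page, mp_ptrs)):

#include <stdio.h>

#define PSIZE     4096u   /* assumed page size for the example                 */
#define PAGEHDRSZ   16u   /* illustrative; really offsetof(MDB_page, mp_ptrs)  */

/* Same formula as OVPAGES(size, psize) above. */
static unsigned ovpages(unsigned size, unsigned psize)
{
	return (PAGEHDRSZ - 1 + size) / psize + 1;
}

int main(void)
{
	/* A 10000-byte value needs 3 pages of 4096 bytes once the header is counted. */
	printf("OVPAGES(10000) = %u\n", ovpages(10000, PSIZE));

	/* PAGEFILL: used space as tenths of a percent of usable space. */
	unsigned lower = 200, upper = 3000;     /* pretend mp_lower / mp_upper */
	unsigned sizeleft = upper - lower;      /* SIZELEFT()                  */
	unsigned fill = 1000u * (PSIZE - PAGEHDRSZ - sizeleft) / (PSIZE - PAGEHDRSZ);
	printf("page fill = %u tenths of a percent (merge threshold 250)\n", fill);
	return 0;
}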
- * @{ - */ -#define F_BIGDATA 0x01 /**< data put on overflow page */ -#define F_SUBDATA 0x02 /**< data is a sub-database */ -#define F_DUPDATA 0x04 /**< data has duplicates */ - -/** valid flags for #mdb_node_add() */ -#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) - -/** @} */ - unsigned short mn_flags; /**< @ref mdb_node */ - unsigned short mn_ksize; /**< key size */ - char mn_data[1]; /**< key and data are appended here */ -} MDB_node; - - /** Size of the node header, excluding dynamic data at the end */ -#define NODESIZE offsetof(MDB_node, mn_data) - - /** Bit position of top word in page number, for shifting mn_flags */ -#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) - - /** Size of a node in a branch page with a given key. - * This is just the node header plus the key, there is no data. - */ -#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) - - /** Size of a node in a leaf page with a given key and data. - * This is node header plus key plus data size. - */ -#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) - - /** Address of node \b i in page \b p */ -#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) - - /** Address of the key for the node */ -#define NODEKEY(node) (void *)((node)->mn_data) - - /** Address of the data for a node */ -#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) - - /** Get the page number pointed to by a branch node */ -#define NODEPGNO(node) \ - ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ - (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) - /** Set the page number in a branch node */ -#define SETPGNO(node,pgno) do { \ - (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ - if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) - - /** Get the size of the data in a leaf node */ -#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) - /** Set the size of the data for a leaf node */ -#define SETDSZ(node,size) do { \ - (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) - /** The size of a key in a node */ -#define NODEKSZ(node) ((node)->mn_ksize) - - /** Copy a page number from src to dst */ -#ifdef MISALIGNED_OK -#define COPY_PGNO(dst,src) dst = src -#else -#if SIZE_MAX > 4294967295UL -#define COPY_PGNO(dst,src) do { \ - unsigned short *s, *d; \ - s = (unsigned short *)&(src); \ - d = (unsigned short *)&(dst); \ - *d++ = *s++; \ - *d++ = *s++; \ - *d++ = *s++; \ - *d = *s; \ -} while (0) -#else -#define COPY_PGNO(dst,src) do { \ - unsigned short *s, *d; \ - s = (unsigned short *)&(src); \ - d = (unsigned short *)&(dst); \ - *d++ = *s++; \ - *d = *s; \ -} while (0) -#endif -#endif - /** The address of a key in a LEAF2 page. - * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. - * There are no node headers, keys are stored contiguously. - */ -#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) - - /** Set the \b node's key into \b keyptr, if requested. */ -#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ - (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } - - /** Set the \b node's key into \b key. */ -#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } - - /** Information about a single database in the environment. 
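NODEPGNO()/SETPGNO() above split a branch node's child page number across the 16-bit mn_lo and mn_hi fields, plus mn_flags for the top word when pgno_t is wider than 32 bits. A standalone round-trip of that packing, with plain variables standing in for the node fields:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned topword = 32;              /* PGNO_TOPWORD on a 64-bit pgno_t */
	uint64_t pgno = 0x1234567890ULL;          /* arbitrary example page number   */

	/* SETPGNO: low 16 bits, next 16 bits, then everything above bit 32. */
	uint16_t lo    = (uint16_t)(pgno & 0xffff);
	uint16_t hi    = (uint16_t)((pgno >> 16) & 0xffff);
	uint16_t flags = (uint16_t)(pgno >> topword);

	/* NODEPGNO: reassemble the three pieces. */
	uint64_t back = lo | ((uint64_t)hi << 16) | ((uint64_t)flags << topword);

	printf("0x%llx -> lo=0x%04x hi=0x%04x flags=0x%04x -> 0x%llx\n",
	       (unsigned long long)pgno, (unsigned)lo, (unsigned)hi, (unsigned)flags,
	       (unsigned long long)back);
	return 0;
}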
*/ -typedef struct MDB_db { - uint32_t md_pad; /**< also ksize for LEAF2 pages */ - uint16_t md_flags; /**< @ref mdb_dbi_open */ - uint16_t md_depth; /**< depth of this tree */ - pgno_t md_branch_pages; /**< number of internal pages */ - pgno_t md_leaf_pages; /**< number of leaf pages */ - pgno_t md_overflow_pages; /**< number of overflow pages */ - size_t md_entries; /**< number of data items */ - pgno_t md_root; /**< the root page of this tree */ -} MDB_db; - - /** mdb_dbi_open flags */ -#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ -#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) -#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ - MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) - - /** Handle for the DB used to track free pages. */ -#define FREE_DBI 0 - /** Handle for the default DB. */ -#define MAIN_DBI 1 - - /** Meta page content. - * A meta page is the start point for accessing a database snapshot. - * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). - */ -typedef struct MDB_meta { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ - uint32_t mm_magic; - /** Version number of this lock file. Must be set to #MDB_DATA_VERSION. */ - uint32_t mm_version; - void *mm_address; /**< address for fixed mapping */ - size_t mm_mapsize; /**< size of mmap region */ - MDB_db mm_dbs[2]; /**< first is free space, 2nd is main db */ - /** The size of pages used in this DB */ -#define mm_psize mm_dbs[0].md_pad - /** Any persistent environment flags. @ref mdb_env */ -#define mm_flags mm_dbs[0].md_flags - pgno_t mm_last_pg; /**< last used page in file */ - txnid_t mm_txnid; /**< txnid that committed this page */ -} MDB_meta; - - /** Buffer for a stack-allocated meta page. - * The members define size and alignment, and silence type - * aliasing warnings. They are not used directly; that could - * mean incorrectly using several union members in parallel. - */ -typedef union MDB_metabuf { - MDB_page mb_page; - struct { - char mm_pad[PAGEHDRSZ]; - MDB_meta mm_meta; - } mb_metabuf; -} MDB_metabuf; - - /** Auxiliary DB info. - * The information here is mostly static/read-only. There is - * only a single copy of this record in the environment. - */ -typedef struct MDB_dbx { - MDB_val md_name; /**< name of the database */ - MDB_cmp_func *md_cmp; /**< function for comparing keys */ - MDB_cmp_func *md_dcmp; /**< function for comparing data items */ - MDB_rel_func *md_rel; /**< user relocate function */ - void *md_relctx; /**< user-provided context for md_rel */ -} MDB_dbx; - - /** A database transaction. - * Every operation requires a transaction handle. - */ -struct MDB_txn { - MDB_txn *mt_parent; /**< parent of a nested txn */ - MDB_txn *mt_child; /**< nested txn under this txn */ - pgno_t mt_next_pgno; /**< next unallocated page */ - /** The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. - */ - txnid_t mt_txnid; - MDB_env *mt_env; /**< the DB environment */ - /** The list of pages that became unused during this transaction. - */ - MDB_IDL mt_free_pgs; - /** The list of loose pages that became unused and may be reused - * in this transaction, linked through #NEXT_LOOSE_PAGE(page). - */ - MDB_page *mt_loose_pgs; - /* #Number of loose pages (#mt_loose_pgs) */ - int mt_loose_count; - /** The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. 
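The meta-page comment above is central to crash safety: commits alternate between the two meta pages, so the slot a transaction writes is its txnid modulo 2 and the other slot always still holds the previous intact snapshot. A tiny sketch of that alternation:

#include <stdio.h>

int main(void)
{
	unsigned long txnid;

	/* Transaction N writes meta page N % 2; readers pick the newer valid one. */
	for (txnid = 1; txnid <= 6; txnid++)
		printf("txn %lu commits into meta page %lu\n", txnid, txnid & 1);
	return 0;
}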
page numbers in here are - * shifted left by 1, deleted slots have the LSB set. - */ - MDB_IDL mt_spill_pgs; - union { - /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ - MDB_ID2L dirty_list; - /** For read txns: This thread/txn's reader table slot, or NULL. */ - MDB_reader *reader; - } mt_u; - /** Array of records for each DB known in the environment. */ - MDB_dbx *mt_dbxs; - /** Array of MDB_db records for each known DB */ - MDB_db *mt_dbs; - /** Array of sequence numbers for each DB handle */ - unsigned int *mt_dbiseqs; -/** @defgroup mt_dbflag Transaction DB Flags - * @ingroup internal - * @{ - */ -#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ -#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ -#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ -#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ -/** @} */ - /** In write txns, array of cursors for each DB */ - MDB_cursor **mt_cursors; - /** Array of flags for each DB */ - unsigned char *mt_dbflags; - /** Number of DB records in use. This number only ever increments; - * we don't decrement it when individual DB handles are closed. - */ - MDB_dbi mt_numdbs; - -/** @defgroup mdb_txn Transaction Flags - * @ingroup internal - * @{ - */ -#define MDB_TXN_RDONLY 0x01 /**< read-only transaction */ -#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ -#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ -#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ -/** @} */ - unsigned int mt_flags; /**< @ref mdb_txn */ - /** #dirty_list room: Array size - \#dirty pages visible to this txn. - * Includes ancestor txns' dirty pages not hidden by other txns' - * dirty/spilled pages. Thus commit(nested txn) has room to merge - * dirty_list into mt_parent after freeing hidden mt_parent pages. - */ - unsigned int mt_dirty_room; -}; - -/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. - */ -#define CURSOR_STACK 32 - -struct MDB_xcursor; - - /** Cursors are used for all DB operations. - * A cursor holds a path of (page pointer, key index) from the DB - * root to a position in the DB, plus other state. #MDB_DUPSORT - * cursors include an xcursor to the current data item. Write txns - * track their cursors and keep them up to date when data moves. - * Exception: An xcursor's pointer to a #P_SUBP page can be stale. - * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). - */ -struct MDB_cursor { - /** Next cursor on this DB in this txn */ - MDB_cursor *mc_next; - /** Backup of the original cursor if this cursor is a shadow */ - MDB_cursor *mc_backup; - /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ - struct MDB_xcursor *mc_xcursor; - /** The transaction that owns this cursor */ - MDB_txn *mc_txn; - /** The database handle this cursor operates on */ - MDB_dbi mc_dbi; - /** The database record for this cursor */ - MDB_db *mc_db; - /** The database auxiliary record for this cursor */ - MDB_dbx *mc_dbx; - /** The @ref mt_dbflag for this database */ - unsigned char *mc_dbflag; - unsigned short mc_snum; /**< number of pushed pages */ - unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ -/** @defgroup mdb_cursor Cursor Flags - * @ingroup internal - * Cursor state flags. 
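The cursor structure above backs the public cursor calls; a hedged usage sketch of walking a database with them, assuming the usual lmdb.h entry points and an environment opened elsewhere (error handling trimmed):

#include <stdio.h>
#include <lmdb.h>

static int dump_all(MDB_env *env)
{
	MDB_txn *txn;
	MDB_dbi dbi;
	MDB_cursor *cur;
	MDB_val key, data;
	int rc;

	if ((rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)) != 0)
		return rc;
	if ((rc = mdb_dbi_open(txn, NULL, 0, &dbi)) == 0 &&
	    (rc = mdb_cursor_open(txn, dbi, &cur)) == 0) {
		/* MDB_NEXT from a fresh cursor starts at the first key. */
		while ((rc = mdb_cursor_get(cur, &key, &data, MDB_NEXT)) == 0)
			printf("%.*s = %zu bytes\n",
			       (int)key.mv_size, (char *)key.mv_data, data.mv_size);
		if (rc == MDB_NOTFOUND)       /* normal end of iteration */
			rc = 0;
		mdb_cursor_close(cur);
	}
	mdb_txn_abort(txn);                   /* read-only txn: abort releases it */
	return rc;
}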
- * @{ - */ -#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ -#define C_EOF 0x02 /**< No more data */ -#define C_SUB 0x04 /**< Cursor is a sub-cursor */ -#define C_DEL 0x08 /**< last op was a cursor_del */ -#define C_SPLITTING 0x20 /**< Cursor is in page_split */ -#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ -/** @} */ - unsigned int mc_flags; /**< @ref mdb_cursor */ - MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ - indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ -}; - - /** Context for sorted-dup records. - * We could have gone to a fully recursive design, with arbitrarily - * deep nesting of sub-databases. But for now we only handle these - * levels - main DB, optional sub-DB, sorted-duplicate DB. - */ -typedef struct MDB_xcursor { - /** A sub-cursor for traversing the Dup DB */ - MDB_cursor mx_cursor; - /** The database record for this Dup DB */ - MDB_db mx_db; - /** The auxiliary DB record for this Dup DB */ - MDB_dbx mx_dbx; - /** The @ref mt_dbflag for this Dup DB */ - unsigned char mx_dbflag; -} MDB_xcursor; - - /** State of FreeDB old pages, stored in the MDB_env */ -typedef struct MDB_pgstate { - pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ - txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ -} MDB_pgstate; - - /** The database environment. */ -struct MDB_env { - HANDLE me_fd; /**< The main data file */ - HANDLE me_lfd; /**< The lock file */ - HANDLE me_mfd; /**< just for writing the meta pages */ - /** Failed to update the meta page. Probably an I/O error. */ -#define MDB_FATAL_ERROR 0x80000000U - /** Some fields are initialized. */ -#define MDB_ENV_ACTIVE 0x20000000U - /** me_txkey is set */ -#define MDB_ENV_TXKEY 0x10000000U - uint32_t me_flags; /**< @ref mdb_env */ - unsigned int me_psize; /**< DB page size, inited from me_os_psize */ - unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ - unsigned int me_maxreaders; /**< size of the reader table */ - unsigned int me_numreaders; /**< max numreaders set by this env */ - MDB_dbi me_numdbs; /**< number of DBs opened */ - MDB_dbi me_maxdbs; /**< size of the DB table */ - MDB_PID_T me_pid; /**< process ID of this env */ - char *me_path; /**< path to the DB files */ - char *me_map; /**< the memory map of the data file */ - MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ - MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ - void *me_pbuf; /**< scratch area for DUPSORT put() */ - MDB_txn *me_txn; /**< current write transaction */ - size_t me_mapsize; /**< size of the data memory map */ - off_t me_size; /**< current file size */ - pgno_t me_maxpg; /**< me_mapsize / me_psize */ - MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */ - pthread_key_t me_txkey; /**< thread-key for readers */ - MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ -# define me_pglast me_pgstate.mf_pglast -# define me_pghead me_pgstate.mf_pghead - MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ - /** IDL of pages that became unused in a write txn */ - MDB_IDL me_free_pgs; - /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
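me_maxreaders and me_mapsize above are what the public setters configure before the environment is opened, as the DEFAULT_READERS comment earlier suggests. A hedged setup sketch against the usual lmdb.h API; the path and sizes are placeholders:

#include <stdio.h>
#include <lmdb.h>

int main(void)
{
	MDB_env *env;
	int rc;

	if ((rc = mdb_env_create(&env)) != 0)
		return rc;
	/* Both must be called before mdb_env_open(); they land in
	 * me_maxreaders and me_mapsize. */
	mdb_env_set_maxreaders(env, 126);              /* DEFAULT_READERS          */
	mdb_env_set_mapsize(env, 256UL * 1024 * 1024); /* 256 MiB map, placeholder */

	rc = mdb_env_open(env, "./testdb", 0, 0664);   /* directory must exist     */
	if (rc != 0)
		fprintf(stderr, "mdb_env_open: %s\n", mdb_strerror(rc));
	mdb_env_close(env);
	return rc;
}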
*/ - MDB_ID2L me_dirty_list; - /** Max number of freelist items that can fit in a single overflow page */ - int me_maxfree_1pg; - /** Max size of a node on a page */ - unsigned int me_nodemax; -#if !(MDB_MAXKEYSIZE) - unsigned int me_maxkey; /**< max size of a key */ -#endif - int me_live_reader; /**< have liveness lock in reader table */ -#ifdef _WIN32 - int me_pidquery; /**< Used in OpenProcess */ - HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */ - HANDLE me_wmutex; -#elif defined(MDB_USE_POSIX_SEM) - sem_t *me_rmutex; /* Shared mutexes are not supported */ - sem_t *me_wmutex; -#endif - void *me_userctx; /**< User-settable context */ - MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ -}; - - /** Nested transaction */ -typedef struct MDB_ntxn { - MDB_txn mnt_txn; /**< the transaction */ - MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ -} MDB_ntxn; - - /** max number of pages to commit in one writev() call */ -#define MDB_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES -#undef MDB_COMMIT_PAGES -#define MDB_COMMIT_PAGES IOV_MAX -#endif - - /** max bytes to write in one call */ -#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) - - /** Check \b txn and \b dbi arguments to a function */ -#define TXN_DBI_EXIST(txn, dbi) \ - ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID)) - - /** Check for misused \b dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) - -static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); -static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); -static int mdb_page_touch(MDB_cursor *mc); - -static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); -static int mdb_page_search_root(MDB_cursor *mc, - MDB_val *key, int modify); -#define MDB_PS_MODIFY 1 -#define MDB_PS_ROOTONLY 2 -#define MDB_PS_FIRST 4 -#define MDB_PS_LAST 8 -static int mdb_page_search(MDB_cursor *mc, - MDB_val *key, int flags); -static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); - -#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ -static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, - pgno_t newpgno, unsigned int nflags); - -static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); -static int mdb_env_pick_meta(const MDB_env *env); -static int mdb_env_write_meta(MDB_txn *txn); -#if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) /* Drop unused excl arg */ -# define mdb_env_close0(env, excl) mdb_env_close1(env) -#endif -static void mdb_env_close0(MDB_env *env, int excl); - -static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); -static int mdb_node_add(MDB_cursor *mc, indx_t indx, - MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); -static void mdb_node_del(MDB_cursor *mc, int ksize); -static void mdb_node_shrink(MDB_page *mp, indx_t indx); -static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst); -static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); -static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); -static size_t mdb_branch_size(MDB_env *env, MDB_val *key); - -static int mdb_rebalance(MDB_cursor *mc); -static int mdb_update_key(MDB_cursor *mc, MDB_val *key); - -static void mdb_cursor_pop(MDB_cursor *mc); -static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); - -static int mdb_cursor_del0(MDB_cursor *mc); -static int mdb_del0(MDB_txn 
*txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); -static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); -static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); -static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); -static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, - int *exactp); -static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); -static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); - -static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); -static void mdb_xcursor_init0(MDB_cursor *mc); -static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); - -static int mdb_drop0(MDB_cursor *mc, int subs); -static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); - -/** @cond */ -static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; -/** @endcond */ - -#ifdef _WIN32 -static SECURITY_DESCRIPTOR mdb_null_sd; -static SECURITY_ATTRIBUTES mdb_all_sa; -static int mdb_sec_inited; -#endif - -/** Return the library version info. */ -char * -mdb_version(int *major, int *minor, int *patch) -{ - if (major) *major = MDB_VERSION_MAJOR; - if (minor) *minor = MDB_VERSION_MINOR; - if (patch) *patch = MDB_VERSION_PATCH; - return MDB_VERSION_STRING; -} - -/** Table of descriptions for LMDB @ref errors */ -static char *const mdb_errstr[] = { - "MDB_KEYEXIST: Key/data pair already exists", - "MDB_NOTFOUND: No matching key/data pair found", - "MDB_PAGE_NOTFOUND: Requested page not found", - "MDB_CORRUPTED: Located page was wrong type", - "MDB_PANIC: Update of meta page failed", - "MDB_VERSION_MISMATCH: Database environment version mismatch", - "MDB_INVALID: File is not an LMDB file", - "MDB_MAP_FULL: Environment mapsize limit reached", - "MDB_DBS_FULL: Environment maxdbs limit reached", - "MDB_READERS_FULL: Environment maxreaders limit reached", - "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", - "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", - "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", - "MDB_PAGE_FULL: Internal error - page has no more space", - "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", - "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", - "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", - "MDB_BAD_TXN: Transaction cannot recover - it must be aborted", - "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", - "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", -}; - -char * -mdb_strerror(int err) -{ -#ifdef _WIN32 - /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. - * This works as long as no function between the call to mdb_strerror - * and the actual use of the message uses more than 4K of stack. - */ - char pad[4096]; - char buf[1024], *ptr = buf; -#endif - int i; - if (!err) - return ("Successful return: 0"); - - if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { - i = err - MDB_KEYEXIST; - return mdb_errstr[i]; - } - -#ifdef _WIN32 - /* These are the C-runtime error codes we use. The comment indicates - * their numeric value, and the Win32 error they would correspond to - * if the error actually came from a Win32 API. A major mess, we should - * have used LMDB-specific error codes for everything. 
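mdb_version() and mdb_strerror() above are self-contained, so a short usage sketch shows both: LMDB's own codes come from the table, anything else falls through to strerror():

#include <stdio.h>
#include <lmdb.h>

int main(void)
{
	int major, minor, patch;

	printf("library: %s\n", mdb_version(&major, &minor, &patch));
	printf("numeric: %d.%d.%d\n", major, minor, patch);

	printf("%s\n", mdb_strerror(MDB_NOTFOUND));  /* from the table above   */
	printf("%s\n", mdb_strerror(0));             /* "Successful return: 0" */
	return 0;
}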
- */
-	switch(err) {
-	case ENOENT:	/* 2, FILE_NOT_FOUND */
-	case EIO:	/* 5, ACCESS_DENIED */
-	case ENOMEM:	/* 12, INVALID_ACCESS */
-	case EACCES:	/* 13, INVALID_DATA */
-	case EBUSY:	/* 16, CURRENT_DIRECTORY */
-	case EINVAL:	/* 22, BAD_COMMAND */
-	case ENOSPC:	/* 28, OUT_OF_PAPER */
-		return strerror(err);
-	default:
-		;
-	}
-	buf[0] = 0;
-	FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM |
-		FORMAT_MESSAGE_IGNORE_INSERTS,
-		NULL, err, 0, ptr, sizeof(buf), pad);
-	return ptr;
-#else
-	return strerror(err);
-#endif
-}
-
-/** assert(3) variant in cursor context */
-#define mdb_cassert(mc, expr)	mdb_assert0((mc)->mc_txn->mt_env, expr, #expr)
-/** assert(3) variant in transaction context */
-#define mdb_tassert(mc, expr)	mdb_assert0((txn)->mt_env, expr, #expr)
-/** assert(3) variant in environment context */
-#define mdb_eassert(env, expr)	mdb_assert0(env, expr, #expr)
-
-#ifndef NDEBUG
-# define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
-		mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__))
-
-static void
-mdb_assert_fail(MDB_env *env, const char *expr_txt,
-	const char *func, const char *file, int line)
-{
-	char buf[400];
-	sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
-		file, line, expr_txt, func);
-	if (env->me_assert_func)
-		env->me_assert_func(env, buf);
-	fprintf(stderr, "%s\n", buf);
-	abort();
-}
-#else
-# define mdb_assert0(env, expr, expr_txt) ((void) 0)
-#endif /* NDEBUG */
-
-#if MDB_DEBUG
-/** Return the page number of \b mp which may be sub-page, for debug output */
-static pgno_t
-mdb_dbg_pgno(MDB_page *mp)
-{
-	pgno_t ret;
-	COPY_PGNO(ret, mp->mp_pgno);
-	return ret;
-}
-
-/** Display a key in hexadecimal and return the address of the result.
- * @param[in] key the key to display
- * @param[in] buf the buffer to write into. Should always be #DKBUF.
- * @return The key in hexadecimal form.
- */
-char *
-mdb_dkey(MDB_val *key, char *buf)
-{
-	char *ptr = buf;
-	unsigned char *c = key->mv_data;
-	unsigned int i;
-
-	if (!key)
-		return "";
-
-	if (key->mv_size > DKBUF_MAXKEYSIZE)
-		return "MDB_MAXKEYSIZE";
-	/* may want to make this a dynamic check: if the key is mostly
-	 * printable characters, print it as-is instead of converting to hex.
-	 */
-#if 1
-	buf[0] = '\0';
-	for (i=0; i<key->mv_size; i++)
-		ptr += sprintf(ptr, "%02x", *c++);
-#else
-	sprintf(buf, "%.*s", key->mv_size, key->mv_data);
-#endif
-	return buf;
-}
-
-static const char *
-mdb_leafnode_type(MDB_node *n)
-{
-	static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
-	return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
-		tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
-}
-
-/** Display all the keys in the page. */
-void
-mdb_page_list(MDB_page *mp)
-{
-	pgno_t pgno = mdb_dbg_pgno(mp);
-	const char *type, *state = (mp->mp_flags & P_DIRTY) ?
", dirty" : ""; - MDB_node *node; - unsigned int i, nkeys, nsize, total = 0; - MDB_val key; - DKBUF; - - switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { - case P_BRANCH: type = "Branch page"; break; - case P_LEAF: type = "Leaf page"; break; - case P_LEAF|P_SUBP: type = "Sub-page"; break; - case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; - case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; - case P_OVERFLOW: - fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", - pgno, mp->mp_pages, state); - return; - case P_META: - fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", - pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); - return; - default: - fprintf(stderr, "Bad page %"Z"u flags 0x%u\n", pgno, mp->mp_flags); - return; - } - - nkeys = NUMKEYS(mp); - fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); - - for (i=0; imp_pad; - key.mv_data = LEAF2KEY(mp, i, nsize); - total += nsize; - fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); - continue; - } - node = NODEPTR(mp, i); - key.mv_size = node->mn_ksize; - key.mv_data = node->mn_data; - nsize = NODESIZE + key.mv_size; - if (IS_BRANCH(mp)) { - fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), - DKEY(&key)); - total += nsize; - } else { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - nsize += sizeof(pgno_t); - else - nsize += NODEDSZ(node); - total += nsize; - nsize += sizeof(indx_t); - fprintf(stderr, "key %d: nsize %d, %s%s\n", - i, nsize, DKEY(&key), mdb_leafnode_type(node)); - } - total = EVEN(total); - } - fprintf(stderr, "Total: header %d + contents %d + unused %d\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); -} - -void -mdb_cursor_chk(MDB_cursor *mc) -{ - unsigned int i; - MDB_node *node; - MDB_page *mp; - - if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return; - for (i=0; imc_top; i++) { - mp = mc->mc_pg[i]; - node = NODEPTR(mp, mc->mc_ki[i]); - if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) - printf("oops!\n"); - } - if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) - printf("ack!\n"); -} -#endif - -#if (MDB_DEBUG) > 2 -/** Count all the pages in each DB and in the freelist - * and make sure it matches the actual number of pages - * being used. - * All named DBs must be open for a correct count. 
- */ -static void mdb_audit(MDB_txn *txn) -{ - MDB_cursor mc; - MDB_val key, data; - MDB_ID freecount, count; - MDB_dbi i; - int rc; - - freecount = 0; - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(MDB_ID *)data.mv_data; - mdb_tassert(txn, rc == MDB_NOTFOUND); - - count = 0; - for (i = 0; imt_numdbs; i++) { - MDB_xcursor mx; - if (!(txn->mt_dbflags[i] & DB_VALID)) - continue; - mdb_cursor_init(&mc, txn, i, &mx); - if (txn->mt_dbs[i].md_root == P_INVALID) - continue; - count += txn->mt_dbs[i].md_branch_pages + - txn->mt_dbs[i].md_leaf_pages + - txn->mt_dbs[i].md_overflow_pages; - if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { - rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); - for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { - unsigned j; - MDB_page *mp; - mp = mc.mc_pg[mc.mc_top]; - for (j=0; jmn_flags & F_SUBDATA) { - MDB_db db; - memcpy(&db, NODEDATA(leaf), sizeof(db)); - count += db.md_branch_pages + db.md_leaf_pages + - db.md_overflow_pages; - } - } - } - mdb_tassert(txn, rc == MDB_NOTFOUND); - } - } - if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) { - fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", - txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno); - } -} -#endif - -int -mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) -{ - return txn->mt_dbxs[dbi].md_cmp(a, b); -} - -int -mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) -{ - return txn->mt_dbxs[dbi].md_dcmp(a, b); -} - -/** Allocate memory for a page. - * Re-use old malloc'd pages first for singletons, otherwise just malloc. - */ -static MDB_page * -mdb_page_malloc(MDB_txn *txn, unsigned num) -{ - MDB_env *env = txn->mt_env; - MDB_page *ret = env->me_dpages; - size_t psize = env->me_psize, sz = psize, off; - /* For ! #MDB_NOMEMINIT, psize counts how much to init. - * For a single page alloc, we init everything after the page header. - * For multi-page, we init the final page; if the caller needed that - * many pages they will be filling in at least up to the last page. - */ - if (num == 1) { - if (ret) { - VGMEMP_ALLOC(env, ret, sz); - VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); - env->me_dpages = ret->mp_next; - return ret; - } - psize -= off = PAGEHDRSZ; - } else { - sz *= num; - off = sz - psize; - } - if ((ret = malloc(sz)) != NULL) { - VGMEMP_ALLOC(env, ret, sz); - if (!(env->me_flags & MDB_NOMEMINIT)) { - memset((char *)ret + off, 0, psize); - ret->mp_pad = 0; - } - } else { - txn->mt_flags |= MDB_TXN_ERROR; - } - return ret; -} -/** Free a single page. - * Saves single pages to a list, for future reuse. - * (This is not used for multi-page overflow pages.) - */ -static void -mdb_page_free(MDB_env *env, MDB_page *mp) -{ - mp->mp_next = env->me_dpages; - VGMEMP_FREE(env, mp); - env->me_dpages = mp; -} - -/** Free a dirty page */ -static void -mdb_dpage_free(MDB_env *env, MDB_page *dp) -{ - if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - mdb_page_free(env, dp); - } else { - /* large pages just get freed directly */ - VGMEMP_FREE(env, dp); - free(dp); - } -} - -/** Return all dirty pages to dpage list */ -static void -mdb_dlist_free(MDB_txn *txn) -{ - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned i, n = dl[0].mid; - - for (i = 1; i <= n; i++) { - mdb_dpage_free(env, dl[i].mptr); - } - dl[0].mid = 0; -} - -/** Loosen or free a single page. 
- * Saves single pages to a list for future reuse - * in this same txn. It has been pulled from the freeDB - * and already resides on the dirty list, but has been - * deleted. Use these pages first before pulling again - * from the freeDB. - * - * If the page wasn't dirtied in this txn, just add it - * to this txn's free list. - */ -static int -mdb_page_loose(MDB_cursor *mc, MDB_page *mp) -{ - int loose = 0; - pgno_t pgno = mp->mp_pgno; - MDB_txn *txn = mc->mc_txn; - - if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { - if (txn->mt_parent) { - MDB_ID2 *dl = txn->mt_u.dirty_list; - /* If txn has a parent, make sure the page is in our - * dirty list. - */ - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (mp != dl[x].mptr) { /* bad cursor? */ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } - /* ok, it's ours */ - loose = 1; - } - } - } else { - /* no parent txn, so it's just ours */ - loose = 1; - } - } - if (loose) { - DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), - mp->mp_pgno)); - NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; - txn->mt_loose_pgs = mp; - txn->mt_loose_count++; - mp->mp_flags |= P_LOOSE; - } else { - int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); - if (rc) - return rc; - } - - return MDB_SUCCESS; -} - -/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. - * @param[in] mc A cursor handle for the current operation. - * @param[in] pflags Flags of the pages to update: - * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. - * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). - * @return 0 on success, non-zero on failure. - */ -static int -mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) -{ - enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m3; - MDB_xcursor *mx; - MDB_page *dp, *mp; - MDB_node *leaf; - unsigned i, j; - int rc = MDB_SUCCESS, level; - - /* Mark pages seen by cursors */ - if (mc->mc_flags & C_UNTRACK) - mc = NULL; /* will find mc in mt_cursors */ - for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { - for (; mc; mc=mc->mc_next) { - if (!(mc->mc_flags & C_INITIALIZED)) - continue; - for (m3 = mc;; m3 = &mx->mx_cursor) { - mp = NULL; - for (j=0; jmc_snum; j++) { - mp = m3->mc_pg[j]; - if ((mp->mp_flags & Mask) == pflags) - mp->mp_flags ^= P_KEEP; - } - mx = m3->mc_xcursor; - /* Proceed to mx if it is at a sub-database */ - if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) - break; - if (! (mp && (mp->mp_flags & P_LEAF))) - break; - leaf = NODEPTR(mp, m3->mc_ki[j-1]); - if (!(leaf->mn_flags & F_SUBDATA)) - break; - } - } - if (i == 0) - break; - } - - if (all) { - /* Mark dirty root pages */ - for (i=0; imt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - pgno_t pgno = txn->mt_dbs[i].md_root; - if (pgno == P_INVALID) - continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) - dp->mp_flags ^= P_KEEP; - } - } - } - - return rc; -} - -static int mdb_page_flush(MDB_txn *txn, int keep); - -/** Spill pages from the dirty list back to disk. - * This is intended to prevent running into #MDB_TXN_FULL situations, - * but note that they may still occur in a few cases: - * 1) our estimate of the txn size could be too small. Currently this - * seems unlikely, except with a large number of #MDB_MULTIPLE items. 
- * 2) child txns may run out of space if their parents dirtied a - * lot of pages and never spilled them. TODO: we probably should do - * a preemptive spill during #mdb_txn_begin() of a child txn, if - * the parent's dirty_room is below a given threshold. - * - * Otherwise, if not using nested txns, it is expected that apps will - * not run into #MDB_TXN_FULL any more. The pages are flushed to disk - * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. - * If the txn never references them again, they can be left alone. - * If the txn only reads them, they can be used without any fuss. - * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of #mdb_page_touch(). Such references are - * handled by #mdb_page_unspill(). - * - * Also note, we never spill DB root pages, nor pages of active cursors, - * because we'll need these back again soon anyway. And in nested txns, - * we can't spill a page in a child txn if it was already spilled in a - * parent txn. That would alter the parent txns' data even though - * the child hasn't committed yet, and we'd have no way to undo it if - * the child aborted. - * - * @param[in] m0 cursor A cursor handle identifying the transaction and - * database for which we are checking space. - * @param[in] key For a put operation, the key being stored. - * @param[in] data For a put operation, the data being stored. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) -{ - MDB_txn *txn = m0->mc_txn; - MDB_page *dp; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned int i, j, need; - int rc; - - if (m0->mc_flags & C_SUB) - return MDB_SUCCESS; - - /* Estimate how much space this op will take */ - i = m0->mc_db->md_depth; - /* Named DBs also dirty the main DB */ - if (m0->mc_dbi > MAIN_DBI) - i += txn->mt_dbs[MAIN_DBI].md_depth; - /* For puts, roughly factor in the key+data size */ - if (key) - i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; - i += i; /* double it for good measure */ - need = i; - - if (txn->mt_dirty_room > i) - return MDB_SUCCESS; - - if (!txn->mt_spill_pgs) { - txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); - if (!txn->mt_spill_pgs) - return ENOMEM; - } else { - /* purge deleted slots */ - MDB_IDL sl = txn->mt_spill_pgs; - unsigned int num = sl[0]; - j=0; - for (i=1; i<=num; i++) { - if (!(sl[i] & 1)) - sl[++j] = sl[i]; - } - sl[0] = j; - } - - /* Preserve pages which may soon be dirtied again */ - if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) - goto done; - - /* Less aggressive spill - we originally spilled the entire dirty list, - * with a few exceptions for cursor pages and DB root pages. But this - * turns out to be a lot of wasted effort because in a large txn many - * of those pages will need to be used again. So now we spill only 1/8th - * of the dirty pages. Testing revealed this to be a good tradeoff, - * better than 1/2, 1/4, or 1/10. - */ - if (need < MDB_IDL_UM_MAX / 8) - need = MDB_IDL_UM_MAX / 8; - - /* Save the page IDs of all the pages we're flushing */ - /* flush from the tail forward, this saves a lot of shifting later on. */ - for (i=dl[0].mid; i && need; i--) { - MDB_ID pn = dl[i].mid << 1; - dp = dl[i].mptr; - if (dp->mp_flags & (P_LOOSE|P_KEEP)) - continue; - /* Can't spill twice, make sure it's not already in a parent's - * spill list. 
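mt_spill_pgs entries are page numbers shifted left by one, with the low bit set later to mark a slot as deleted; the purge loop above compacts those deleted slots away before spilling again. A small standalone sketch of that encoding (the page numbers are made up):

#include <stdio.h>

#define SPILL_MAX 8

int main(void)
{
	/* sl[0] holds the count, like an MDB_IDL; entries are pgno << 1. */
	unsigned long sl[SPILL_MAX + 1] = { 3, 7 << 1, 9 << 1, 12 << 1 };
	unsigned i, j;

	sl[2] |= 1;                    /* page 9 was unspilled: mark slot deleted */

	/* Purge deleted slots, as done before reusing the spill list. */
	for (i = 1, j = 0; i <= sl[0]; i++)
		if (!(sl[i] & 1))
			sl[++j] = sl[i];
	sl[0] = j;

	for (i = 1; i <= sl[0]; i++)
		printf("still spilled: page %lu\n", sl[i] >> 1);
	return 0;
}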
- */ - if (txn->mt_parent) { - MDB_txn *tx2; - for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - if (tx2->mt_spill_pgs) { - j = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { - dp->mp_flags |= P_KEEP; - break; - } - } - } - if (tx2) - continue; - } - if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) - goto done; - need--; - } - mdb_midl_sort(txn->mt_spill_pgs); - - /* Flush the spilled part of dirty list */ - if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) - goto done; - - /* Reset any dirty pages we kept that page_flush didn't see */ - rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); - -done: - txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; - return rc; -} - -/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ -static txnid_t -mdb_find_oldest(MDB_txn *txn) -{ - int i; - txnid_t mr, oldest = txn->mt_txnid - 1; - if (txn->mt_env->me_txns) { - MDB_reader *r = txn->mt_env->me_txns->mti_readers; - for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - mr = r[i].mr_txnid; - if (oldest > mr) - oldest = mr; - } - } - } - return oldest; -} - -/** Add a page to the txn's dirty list */ -static void -mdb_page_dirty(MDB_txn *txn, MDB_page *mp) -{ - MDB_ID2 mid; - int rc, (*insert)(MDB_ID2L, MDB_ID2 *); - - if (txn->mt_env->me_flags & MDB_WRITEMAP) { - insert = mdb_mid2l_append; - } else { - insert = mdb_mid2l_insert; - } - mid.mid = mp->mp_pgno; - mid.mptr = mp; - rc = insert(txn->mt_u.dirty_list, &mid); - mdb_tassert(txn, rc == 0); - txn->mt_dirty_room--; -} - -/** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. - * - * If there are free pages available from older transactions, they - * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the freedB, just merge freeDB records into me_pghead[] - * and move me_pglast to say which records were consumed. Only this - * function can create me_pghead and move me_pglast/mt_next_pgno. - * @param[in] mc cursor A cursor handle identifying the transaction and - * database for which we are allocating. - * @param[in] num the number of pages to allocate. - * @param[out] mp Address of the allocated page(s). Requests for multiple pages - * will always be satisfied by a single contiguous chunk of memory. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) -{ -#ifdef MDB_PARANOID /* Seems like we can ignore this now */ - /* Get at most more freeDB records once me_pghead - * has enough pages. If not enough, use new pages from the map. - * If and mc is updating the freeDB, only get new - * records if me_pghead is empty. Then the freelist cannot play - * catch-up with itself by growing while trying to save it. - */ - enum { Paranoid = 1, Max_retries = 500 }; -#else - enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; -#endif - int rc, retry = num * 20; - MDB_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; - pgno_t pgno, *mop = env->me_pghead; - unsigned i, j, mop_len = mop ? 
mop[0] : 0, n2 = num-1; - MDB_page *np; - txnid_t oldest = 0, last; - MDB_cursor_op op; - MDB_cursor m2; - - /* If there are any loose pages, just use them */ - if (num == 1 && txn->mt_loose_pgs) { - np = txn->mt_loose_pgs; - txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); - txn->mt_loose_count--; - DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), - np->mp_pgno)); - *mp = np; - return MDB_SUCCESS; - } - - *mp = NULL; - - /* If our dirty list is already full, we can't do anything */ - if (txn->mt_dirty_room == 0) { - rc = MDB_TXN_FULL; - goto fail; - } - - for (op = MDB_FIRST;; op = MDB_NEXT) { - MDB_val key, data; - MDB_node *leaf; - pgno_t *idl; - - /* Seek a big enough contiguous page range. Prefer - * pages at the tail, just truncating the list. - */ - if (mop_len > n2) { - i = mop_len; - do { - pgno = mop[i]; - if (mop[i-n2] == pgno+n2) - goto search_done; - } while (--i > n2); - if (--retry < 0) - break; - } - - if (op == MDB_FIRST) { /* 1st iteration */ - /* Prepare to fetch more and coalesce */ - oldest = mdb_find_oldest(txn); - last = env->me_pglast; - mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (last) { - op = MDB_SET_RANGE; - key.mv_data = &last; /* will look up last+1 */ - key.mv_size = sizeof(last); - } - if (Paranoid && mc->mc_dbi == FREE_DBI) - retry = -1; - } - if (Paranoid && retry < 0 && mop_len) - break; - - last++; - /* Do not fetch more if the record will be too recent */ - if (oldest <= last) - break; - rc = mdb_cursor_get(&m2, &key, NULL, op); - if (rc) { - if (rc == MDB_NOTFOUND) - break; - goto fail; - } - last = *(txnid_t*)key.mv_data; - if (oldest <= last) - break; - np = m2.mc_pg[m2.mc_top]; - leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) - return rc; - - idl = (MDB_ID *) data.mv_data; - i = idl[0]; - if (!mop) { - if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { - rc = ENOMEM; - goto fail; - } - } else { - if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) - goto fail; - mop = env->me_pghead; - } - env->me_pglast = last; -#if (MDB_DEBUG) > 1 - DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", - last, txn->mt_dbs[FREE_DBI].md_root, i)); - for (j = i; j; j--) - DPRINTF(("IDL %"Z"u", idl[j])); -#endif - /* Merge in descending sorted order */ - mdb_midl_xmerge(mop, idl); - mop_len = mop[0]; - } - - /* Use new pages from the map when nothing suitable in the freeDB */ - i = 0; - pgno = txn->mt_next_pgno; - if (pgno + num >= env->me_maxpg) { - DPUTS("DB size maxed out"); - rc = MDB_MAP_FULL; - goto fail; - } - -search_done: - if (env->me_flags & MDB_WRITEMAP) { - np = (MDB_page *)(env->me_map + env->me_psize * pgno); - } else { - if (!(np = mdb_page_malloc(txn, num))) { - rc = ENOMEM; - goto fail; - } - } - if (i) { - mop[0] = mop_len -= num; - /* Move any stragglers down */ - for (j = i-num; j < mop_len; ) - mop[++j] = mop[++i]; - } else { - txn->mt_next_pgno = pgno + num; - } - np->mp_pgno = pgno; - mdb_page_dirty(txn, np); - *mp = np; - - return MDB_SUCCESS; - -fail: - txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -/** Copy the used portions of a non-overflow page. - * @param[in] dst page to copy into - * @param[in] src page to copy from - * @param[in] psize size of a page - */ -static void -mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) -{ - enum { Align = sizeof(pgno_t) }; - indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; - - /* If page isn't full, just copy the used portion. Adjust - * alignment so memcpy may copy words instead of bytes. 
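The freelist scan above relies on me_pghead being sorted in descending order: a run of num consecutive page numbers occupies num adjacent slots, so mop[i-n2] == mop[i]+n2 detects a contiguous range ending at slot i, preferring pages near the tail. A standalone sketch of that test on a made-up list:

#include <stdio.h>

int main(void)
{
	/* Descending freelist, IDL-style: slot 0 is the length. */
	unsigned long mop[] = { 7, 30, 22, 21, 20, 12, 11, 5 };
	unsigned num = 3, n2 = num - 1;
	unsigned i;

	/* Scan from the tail (smallest page numbers) towards the head. */
	for (i = (unsigned)mop[0]; i > n2; i--) {
		unsigned long pgno = mop[i];
		if (mop[i - n2] == pgno + n2) {
			printf("found %u contiguous pages starting at %lu\n", num, pgno);
			return 0;
		}
	}
	printf("no contiguous run of %u pages\n", num);
	return 0;
}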
- */ - if ((unused &= -Align) && !IS_LEAF2(src)) { - upper = (upper + PAGEBASE) & -Align; - memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); - memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), - psize - upper); - } else { - memcpy(dst, src, psize - unused); - } -} - -/** Pull a page off the txn's spill list, if present. - * If a page being referenced was spilled to disk in this txn, bring - * it back and make it dirty/writable again. - * @param[in] txn the transaction handle. - * @param[in] mp the page being referenced. It must not be dirty. - * @param[out] ret the writable page, if any. ret is unchanged if - * mp wasn't spilled. - */ -static int -mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) -{ - MDB_env *env = txn->mt_env; - const MDB_txn *tx2; - unsigned x; - pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - - for (tx2 = txn; tx2; tx2=tx2->mt_parent) { - if (!tx2->mt_spill_pgs) - continue; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - MDB_page *np; - int num; - if (txn->mt_dirty_room == 0) - return MDB_TXN_FULL; - if (IS_OVERFLOW(mp)) - num = mp->mp_pages; - else - num = 1; - if (env->me_flags & MDB_WRITEMAP) { - np = mp; - } else { - np = mdb_page_malloc(txn, num); - if (!np) - return ENOMEM; - if (num > 1) - memcpy(np, mp, num * env->me_psize); - else - mdb_page_copy(np, mp, env->me_psize); - } - if (tx2 == txn) { - /* If in current txn, this page is no longer spilled. - * If it happens to be the last page, truncate the spill list. - * Otherwise mark it as deleted by setting the LSB. - */ - if (x == txn->mt_spill_pgs[0]) - txn->mt_spill_pgs[0]--; - else - txn->mt_spill_pgs[x] |= 1; - } /* otherwise, if belonging to a parent txn, the - * page remains spilled until child commits - */ - - mdb_page_dirty(txn, np); - np->mp_flags |= P_DIRTY; - *ret = np; - break; - } - } - return MDB_SUCCESS; -} - -/** Touch a page: make it dirty and re-insert into tree with updated pgno. - * @param[in] mc cursor pointing to the page to be touched - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_touch(MDB_cursor *mc) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top], *np; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m2, *m3; - pgno_t pgno; - int rc; - - if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - if (txn->mt_flags & MDB_TXN_SPILLS) { - np = NULL; - rc = mdb_page_unspill(txn, mp, &np); - if (rc) - goto fail; - if (np) - goto done; - } - if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || - (rc = mdb_page_alloc(mc, 1, &np))) - goto fail; - pgno = np->mp_pgno; - DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), - mp->mp_pgno, pgno)); - mdb_cassert(mc, mp->mp_pgno != pgno); - mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - /* Update the parent page, if any, to point to the new page */ - if (mc->mc_top) { - MDB_page *parent = mc->mc_pg[mc->mc_top-1]; - MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); - SETPGNO(node, pgno); - } else { - mc->mc_db->md_root = pgno; - } - } else if (txn->mt_parent && !IS_SUBP(mp)) { - MDB_ID2 mid, *dl = txn->mt_u.dirty_list; - pgno = mp->mp_pgno; - /* If txn has a parent, make sure the page is in our - * dirty list. - */ - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (mp != dl[x].mptr) { /* bad cursor? 
*/ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } - return 0; - } - } - mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); - /* No - copy it */ - np = mdb_page_malloc(txn, 1); - if (!np) - return ENOMEM; - mid.mid = pgno; - mid.mptr = np; - rc = mdb_mid2l_insert(dl, &mid); - mdb_cassert(mc, rc == 0); - } else { - return 0; - } - - mdb_page_copy(np, mp, txn->mt_env->me_psize); - np->mp_pgno = pgno; - np->mp_flags |= P_DIRTY; - -done: - /* Adjust cursors pointing to mp */ - mc->mc_pg[mc->mc_top] = np; - m2 = txn->mt_cursors[mc->mc_dbi]; - if (mc->mc_flags & C_SUB) { - for (; m2; m2=m2->mc_next) { - m3 = &m2->mc_xcursor->mx_cursor; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[mc->mc_top] == mp) - m3->mc_pg[mc->mc_top] = np; - } - } else { - for (; m2; m2=m2->mc_next) { - if (m2->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - m2->mc_pg[mc->mc_top] = np; - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - IS_LEAF(np) && - m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) - { - MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); - if (!(leaf->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - } - } - } - return 0; - -fail: - txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_env_sync(MDB_env *env, int force) -{ - int rc = 0; - if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { - if (env->me_flags & MDB_WRITEMAP) { - int flags = ((env->me_flags & MDB_MAPASYNC) && !force) - ? MS_ASYNC : MS_SYNC; - if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) - rc = ErrCode(); -#ifdef _WIN32 - else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) - rc = ErrCode(); -#endif - } else { - if (MDB_FDATASYNC(env->me_fd)) - rc = ErrCode(); - } - } - return rc; -} - -/** Back up parent txn's cursors, then grab the originals for tracking */ -static int -mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) -{ - MDB_cursor *mc, *bk; - MDB_xcursor *mx; - size_t size; - int i; - - for (i = src->mt_numdbs; --i >= 0; ) { - if ((mc = src->mt_cursors[i]) != NULL) { - size = sizeof(MDB_cursor); - if (mc->mc_xcursor) - size += sizeof(MDB_xcursor); - for (; mc; mc = bk->mc_next) { - bk = malloc(size); - if (!bk) - return ENOMEM; - *bk = *mc; - mc->mc_backup = bk; - mc->mc_db = &dst->mt_dbs[i]; - /* Kill pointers into src - and dst to reduce abuse: The - * user may not use mc until dst ends. Otherwise we'd... - */ - mc->mc_txn = NULL; /* ...set this to dst */ - mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */ - if ((mx = mc->mc_xcursor) != NULL) { - *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = NULL; /* ...and dst. */ - } - mc->mc_next = dst->mt_cursors[i]; - dst->mt_cursors[i] = mc; - } - } - } - return MDB_SUCCESS; -} - -/** Close this write txn's cursors, give parent txn's cursors back to parent. - * @param[in] txn the transaction handle. - * @param[in] merge true to keep changes to parent cursors, false to revert. - * @return 0 on success, non-zero on failure. 
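mdb_env_sync() above matters mostly in combination with MDB_NOSYNC or MDB_MAPASYNC: commits skip the flush and the application syncs explicitly when it can afford to lose the last few transactions but not the database. A hedged usage sketch (path and flags are illustrative):

#include <stdio.h>
#include <lmdb.h>

int main(void)
{
	MDB_env *env;
	int rc;

	if ((rc = mdb_env_create(&env)) != 0)
		return rc;
	/* With MDB_NOSYNC, mdb_txn_commit() does not fsync the data file. */
	rc = mdb_env_open(env, "./testdb", MDB_NOSYNC, 0664);
	if (rc == 0) {
		/* ... run a batch of write transactions here ... */

		rc = mdb_env_sync(env, 1);     /* force=1 flushes despite MDB_NOSYNC */
		if (rc != 0)
			fprintf(stderr, "sync: %s\n", mdb_strerror(rc));
	}
	mdb_env_close(env);
	return rc;
}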
- */ -static void -mdb_cursors_close(MDB_txn *txn, unsigned merge) -{ - MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; - MDB_xcursor *mx; - int i; - - for (i = txn->mt_numdbs; --i >= 0; ) { - for (mc = cursors[i]; mc; mc = next) { - next = mc->mc_next; - if ((bk = mc->mc_backup) != NULL) { - if (merge) { - /* Commit changes to parent txn */ - mc->mc_next = bk->mc_next; - mc->mc_backup = bk->mc_backup; - mc->mc_txn = bk->mc_txn; - mc->mc_db = bk->mc_db; - mc->mc_dbflag = bk->mc_dbflag; - if ((mx = mc->mc_xcursor) != NULL) - mx->mx_cursor.mc_txn = bk->mc_txn; - } else { - /* Abort nested txn */ - *mc = *bk; - if ((mx = mc->mc_xcursor) != NULL) - *mx = *(MDB_xcursor *)(bk+1); - } - mc = bk; - } - /* Only malloced cursors are permanently tracked. */ - free(mc); - } - cursors[i] = NULL; - } -} - -#if !(MDB_DEBUG) -#define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn) -#endif -static void -mdb_txn_reset0(MDB_txn *txn, const char *act); - -#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ -enum Pidlock_op { - Pidset, Pidcheck -}; -#else -enum Pidlock_op { - Pidset = F_SETLK, Pidcheck = F_GETLK -}; -#endif - -/** Set or check a pid lock. Set returns 0 on success. - * Check returns 0 if the process is certainly dead, nonzero if it may - * be alive (the lock exists or an error happened so we do not know). - * - * On Windows Pidset is a no-op, we merely check for the existence - * of the process with the given pid. On POSIX we use a single byte - * lock on the lockfile, set at an offset equal to the pid. - */ -static int -mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) -{ -#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ - int ret = 0; - HANDLE h; - if (op == Pidcheck) { - h = OpenProcess(env->me_pidquery, FALSE, pid); - /* No documented "no such process" code, but other program use this: */ - if (!h) - return ErrCode() != ERROR_INVALID_PARAMETER; - /* A process exists until all handles to it close. Has it exited? */ - ret = WaitForSingleObject(h, 0) != 0; - CloseHandle(h); - } - return ret; -#else - for (;;) { - int rc; - struct flock lock_info; - memset(&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = pid; - lock_info.l_len = 1; - if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { - if (op == F_GETLK && lock_info.l_type != F_UNLCK) - rc = -1; - } else if ((rc = ErrCode()) == EINTR) { - continue; - } - return rc; - } -#endif -} - -/** Common code for #mdb_txn_begin() and #mdb_txn_renew(). - * @param[in] txn the transaction handle to initialize - * @return 0 on success, non-zero on failure. - */ -static int -mdb_txn_renew0(MDB_txn *txn) -{ - MDB_env *env = txn->mt_env; - MDB_txninfo *ti = env->me_txns; - MDB_meta *meta; - unsigned int i, nr; - uint16_t x; - int rc, new_notls = 0; - - /* Setup db info */ - txn->mt_numdbs = env->me_numdbs; - txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - - if (txn->mt_flags & MDB_TXN_RDONLY) { - if (!ti) { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; - txn->mt_txnid = meta->mm_txnid; - txn->mt_u.reader = NULL; - } else { - MDB_reader *r = (env->me_flags & MDB_NOTLS) ? 
-				txn->mt_u.reader :
-				pthread_getspecific(env->me_txkey);
-			if (r) {
-				if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
-					return MDB_BAD_RSLOT;
-			} else {
-				MDB_PID_T pid = env->me_pid;
-				MDB_THR_T tid = pthread_self();
-
-				if (!env->me_live_reader) {
-					rc = mdb_reader_pid(env, Pidset, pid);
-					if (rc)
-						return rc;
-					env->me_live_reader = 1;
-				}
-
-				LOCK_MUTEX_R(env);
-				nr = ti->mti_numreaders;
-				for (i=0; i<nr; i++)
-					if (ti->mti_readers[i].mr_pid == 0)
-						break;
-				if (i == env->me_maxreaders) {
-					UNLOCK_MUTEX_R(env);
-					return MDB_READERS_FULL;
-				}
-				ti->mti_readers[i].mr_pid = pid;
-				ti->mti_readers[i].mr_tid = tid;
-				if (i == nr)
-					ti->mti_numreaders = ++nr;
-				/* Save numreaders for un-mutexed mdb_env_close() */
-				env->me_numreaders = nr;
-				UNLOCK_MUTEX_R(env);
-
-				r = &ti->mti_readers[i];
-				new_notls = (env->me_flags & MDB_NOTLS);
-				if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
-					r->mr_pid = 0;
-					return rc;
-				}
-			}
-			txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
-			txn->mt_u.reader = r;
-			meta = env->me_metas[txn->mt_txnid & 1];
-		}
-	} else {
-		if (ti) {
-			LOCK_MUTEX_W(env);
-
-			txn->mt_txnid = ti->mti_txnid;
-			meta = env->me_metas[txn->mt_txnid & 1];
-		} else {
-			meta = env->me_metas[ mdb_env_pick_meta(env) ];
-			txn->mt_txnid = meta->mm_txnid;
-		}
-		txn->mt_txnid++;
-#if MDB_DEBUG
-		if (txn->mt_txnid == mdb_debug_start)
-			mdb_debug = 1;
-#endif
-		txn->mt_dirty_room = MDB_IDL_UM_MAX;
-		txn->mt_u.dirty_list = env->me_dirty_list;
-		txn->mt_u.dirty_list[0].mid = 0;
-		txn->mt_free_pgs = env->me_free_pgs;
-		txn->mt_free_pgs[0] = 0;
-		txn->mt_spill_pgs = NULL;
-		env->me_txn = txn;
-		memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
-	}
-
-	/* Copy the DB info and flags */
-	memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
-
-	/* Moved to here to avoid a data race in read TXNs */
-	txn->mt_next_pgno = meta->mm_last_pg+1;
-
-	for (i=2; i<txn->mt_numdbs; i++) {
-		x = env->me_dbflags[i];
-		txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
-		txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0;
-	}
-	txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID;
-
-	if (env->me_maxpg < txn->mt_next_pgno) {
-		mdb_txn_reset0(txn, "renew0-mapfail");
-		if (new_notls) {
-			txn->mt_u.reader->mr_pid = 0;
-			txn->mt_u.reader = NULL;
-		}
-		return MDB_MAP_RESIZED;
-	}
-
-	return MDB_SUCCESS;
-}
-
-int
-mdb_txn_renew(MDB_txn *txn)
-{
-	int rc;
-
-	if (!txn || txn->mt_dbxs)	/* A reset txn has mt_dbxs==NULL */
-		return EINVAL;
-
-	if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
-		DPUTS("environment had fatal error, must shutdown!");
-		return MDB_PANIC;
-	}
-
-	rc = mdb_txn_renew0(txn);
-	if (rc == MDB_SUCCESS) {
-		DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
-			txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
-			(void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
-	}
-	return rc;
-}
-
-int
-mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
-{
-	MDB_txn *txn;
-	MDB_ntxn *ntxn;
-	int rc, size, tsize = sizeof(MDB_txn);
-
-	if (env->me_flags & MDB_FATAL_ERROR) {
-		DPUTS("environment had fatal error, must shutdown!");
-		return MDB_PANIC;
-	}
-	if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY))
-		return EACCES;
-	if (parent) {
-		/* Nested transactions: Max 1 child, write txns only, no writemap */
-		if (parent->mt_child ||
-			(flags & MDB_RDONLY) ||
-			(parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) ||
-			(env->me_flags & MDB_WRITEMAP))
-		{
-			return (parent->mt_flags & MDB_TXN_RDONLY) ?
-				EINVAL : MDB_BAD_TXN;
-		}
-		tsize = sizeof(MDB_ntxn);
-	}
-	size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1);
-	if (!(flags & MDB_RDONLY)) {
-		size += env->me_maxdbs * sizeof(MDB_cursor *);
-		/* child txns use parent's dbiseqs */
-		if (!parent)
-			size += env->me_maxdbs * sizeof(unsigned int);
-	}
-
-	if ((txn = calloc(1, size)) == NULL) {
-		DPRINTF(("calloc: %s", strerror(errno)));
-		return ENOMEM;
-	}
-	txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
-	if (flags & MDB_RDONLY) {
-		txn->mt_flags |= MDB_TXN_RDONLY;
-		txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
-		txn->mt_dbiseqs = env->me_dbiseqs;
-	} else {
-		txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
-		if (parent) {
-			txn->mt_dbiseqs = parent->mt_dbiseqs;
-			txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
-		} else {
-			txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
-			txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
-		}
-	}
-	txn->mt_env = env;
-
-	if (parent) {
-		unsigned int i;
-		txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
-		if (!txn->mt_u.dirty_list ||
-			!(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
-		{
-			free(txn->mt_u.dirty_list);
-			free(txn);
-			return ENOMEM;
-		}
-		txn->mt_txnid = parent->mt_txnid;
-		txn->mt_dirty_room = parent->mt_dirty_room;
-		txn->mt_u.dirty_list[0].mid = 0;
-		txn->mt_spill_pgs = NULL;
-		txn->mt_next_pgno = parent->mt_next_pgno;
-		parent->mt_child = txn;
-		txn->mt_parent = parent;
-		txn->mt_numdbs = parent->mt_numdbs;
-		txn->mt_flags = parent->mt_flags;
-		txn->mt_dbxs = parent->mt_dbxs;
-		memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
-		/* Copy parent's mt_dbflags, but clear DB_NEW */
-		for (i=0; i<txn->mt_numdbs; i++)
-			txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
-		rc = 0;
-		ntxn = (MDB_ntxn *)txn;
-		ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
-		if (env->me_pghead) {
-			size = MDB_IDL_SIZEOF(env->me_pghead);
-			env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
-			if (env->me_pghead)
-				memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
-			else
-				rc = ENOMEM;
-		}
-		if (!rc)
-			rc = mdb_cursor_shadow(parent, txn);
-		if (rc)
-			mdb_txn_reset0(txn, "beginchild-fail");
-	} else {
-		rc = mdb_txn_renew0(txn);
-	}
-	if (rc)
-		free(txn);
-	else {
-		*ret = txn;
-		DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
-			txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
-			(void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
-	}
-
-	return rc;
-}
-
-MDB_env *
-mdb_txn_env(MDB_txn *txn)
-{
-	if(!txn) return NULL;
-	return txn->mt_env;
-}
-
-/** Export or close DBI handles opened in this txn. */
-static void
-mdb_dbis_update(MDB_txn *txn, int keep)
-{
-	int i;
-	MDB_dbi n = txn->mt_numdbs;
-	MDB_env *env = txn->mt_env;
-	unsigned char *tdbflags = txn->mt_dbflags;
-
-	for (i = n; --i >= 2;) {
-		if (tdbflags[i] & DB_NEW) {
-			if (keep) {
-				env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
-			} else {
-				char *ptr = env->me_dbxs[i].md_name.mv_data;
-				if (ptr) {
-					env->me_dbxs[i].md_name.mv_data = NULL;
-					env->me_dbxs[i].md_name.mv_size = 0;
-					env->me_dbflags[i] = 0;
-					env->me_dbiseqs[i]++;
-					free(ptr);
-				}
-			}
-		}
-	}
-	if (keep && env->me_numdbs < n)
-		env->me_numdbs = n;
-}
-
-/** Common code for #mdb_txn_reset() and #mdb_txn_abort().
- * May be called twice for readonly txns: First reset it, then abort.
- * @param[in] txn the transaction handle to reset - * @param[in] act why the transaction is being reset - */ -static void -mdb_txn_reset0(MDB_txn *txn, const char *act) -{ - MDB_env *env = txn->mt_env; - - /* Close any DBI handles opened in this txn */ - mdb_dbis_update(txn, 0); - - DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); - - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - if (txn->mt_u.reader) { - txn->mt_u.reader->mr_txnid = (txnid_t)-1; - if (!(env->me_flags & MDB_NOTLS)) - txn->mt_u.reader = NULL; /* txn does not own reader */ - } - txn->mt_numdbs = 0; /* close nothing if called again */ - txn->mt_dbxs = NULL; /* mark txn as reset */ - } else { - mdb_cursors_close(txn, 0); - - if (!(env->me_flags & MDB_WRITEMAP)) { - mdb_dlist_free(txn); - } - mdb_midl_free(env->me_pghead); - - if (txn->mt_parent) { - txn->mt_parent->mt_child = NULL; - env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; - mdb_midl_free(txn->mt_free_pgs); - mdb_midl_free(txn->mt_spill_pgs); - free(txn->mt_u.dirty_list); - return; - } - - if (mdb_midl_shrink(&txn->mt_free_pgs)) - env->me_free_pgs = txn->mt_free_pgs; - env->me_pghead = NULL; - env->me_pglast = 0; - - env->me_txn = NULL; - /* The writer mutex was locked in mdb_txn_begin. */ - if (env->me_txns) - UNLOCK_MUTEX_W(env); - } -} - -void -mdb_txn_reset(MDB_txn *txn) -{ - if (txn == NULL) - return; - - /* This call is only valid for read-only txns */ - if (!(txn->mt_flags & MDB_TXN_RDONLY)) - return; - - mdb_txn_reset0(txn, "reset"); -} - -void -mdb_txn_abort(MDB_txn *txn) -{ - if (txn == NULL) - return; - - if (txn->mt_child) - mdb_txn_abort(txn->mt_child); - - mdb_txn_reset0(txn, "abort"); - /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */ - if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) - txn->mt_u.reader->mr_pid = 0; - - free(txn); -} - -/** Save the freelist as of this transaction to the freeDB. - * This changes the freelist. Keep trying until it stabilizes. - */ -static int -mdb_freelist_save(MDB_txn *txn) -{ - /* env->me_pghead[] can grow and shrink during this call. - * env->me_pglast and txn->mt_free_pgs[] can only grow. - * Page numbers cannot disappear from txn->mt_free_pgs[]. - */ - MDB_cursor mc; - MDB_env *env = txn->mt_env; - int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; - txnid_t pglast = 0, head_id = 0; - pgno_t freecnt = 0, *free_pgs, *mop; - ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; - - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - - if (env->me_pghead) { - /* Make sure first page of freeDB is touched and on freelist */ - rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) - return rc; - } - - if (!env->me_pghead && txn->mt_loose_pgs) { - /* Put loose page numbers in mt_free_pgs, since - * we may be unable to return them to me_pghead. - */ - MDB_page *mp = txn->mt_loose_pgs; - if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) - return rc; - for (; mp; mp = NEXT_LOOSE_PAGE(mp)) - mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - txn->mt_loose_pgs = NULL; - txn->mt_loose_count = 0; - } - - /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ - clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) - ? 
SSIZE_MAX : maxfree_1pg; - - for (;;) { - /* Come back here after each Put() in case freelist changed */ - MDB_val key, data; - pgno_t *pgs; - ssize_t j; - - /* If using records from freeDB which we have not yet - * deleted, delete them and any we reserved for me_pghead. - */ - while (pglast < env->me_pglast) { - rc = mdb_cursor_first(&mc, &key, NULL); - if (rc) - return rc; - pglast = head_id = *(txnid_t *)key.mv_data; - total_room = head_room = 0; - mdb_tassert(txn, pglast <= env->me_pglast); - rc = mdb_cursor_del(&mc, 0); - if (rc) - return rc; - } - - /* Save the IDL of pages freed by this txn, to a single record */ - if (freecnt < txn->mt_free_pgs[0]) { - if (!freecnt) { - /* Make sure last page of freeDB is touched and on freelist */ - rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) - return rc; - } - free_pgs = txn->mt_free_pgs; - /* Write to last page of freeDB */ - key.mv_size = sizeof(txn->mt_txnid); - key.mv_data = &txn->mt_txnid; - do { - freecnt = free_pgs[0]; - data.mv_size = MDB_IDL_SIZEOF(free_pgs); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (rc) - return rc; - /* Retry if mt_free_pgs[] grew during the Put() */ - free_pgs = txn->mt_free_pgs; - } while (freecnt < free_pgs[0]); - mdb_midl_sort(free_pgs); - memcpy(data.mv_data, free_pgs, data.mv_size); -#if (MDB_DEBUG) > 1 - { - unsigned int i = free_pgs[0]; - DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); - for (; i; i--) - DPRINTF(("IDL %"Z"u", free_pgs[i])); - } -#endif - continue; - } - - mop = env->me_pghead; - mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; - - /* Reserve records for me_pghead[]. Split it if multi-page, - * to avoid searching freeDB for a page range. Use keys in - * range [1,me_pglast]: Smaller than txnid of oldest reader. - */ - if (total_room >= mop_len) { - if (total_room == mop_len || --more < 0) - break; - } else if (head_room >= maxfree_1pg && head_id > 1) { - /* Keep current record (overflow page), add a new one */ - head_id--; - head_room = 0; - } - /* (Re)write {key = head_id, IDL length = head_room} */ - total_room -= head_room; - head_room = mop_len - total_room; - if (head_room > maxfree_1pg && head_id > 1) { - /* Overflow multi-page for part of me_pghead */ - head_room /= head_id; /* amortize page sizes */ - head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); - } else if (head_room < 0) { - /* Rare case, not bothering to delete this record */ - head_room = 0; - } - key.mv_size = sizeof(head_id); - key.mv_data = &head_id; - data.mv_size = (head_room + 1) * sizeof(pgno_t); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (rc) - return rc; - /* IDL is initially empty, zero out at least the length */ - pgs = (pgno_t *)data.mv_data; - j = head_room > clean_limit ? head_room : 0; - do { - pgs[j] = 0; - } while (--j >= 0); - total_room += head_room; - } - - /* Return loose page numbers to me_pghead, though usually none are - * left at this point. The pages themselves remain in dirty_list. 
- */ - if (txn->mt_loose_pgs) { - MDB_page *mp = txn->mt_loose_pgs; - unsigned count = txn->mt_loose_count; - MDB_IDL loose; - /* Room for loose pages + temp IDL with same */ - if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) - return rc; - mop = env->me_pghead; - loose = mop + MDB_IDL_ALLOCLEN(mop) - count; - for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) - loose[ ++count ] = mp->mp_pgno; - loose[0] = count; - mdb_midl_sort(loose); - mdb_midl_xmerge(mop, loose); - txn->mt_loose_pgs = NULL; - txn->mt_loose_count = 0; - mop_len = mop[0]; - } - - /* Fill in the reserved me_pghead records */ - rc = MDB_SUCCESS; - if (mop_len) { - MDB_val key, data; - - mop += mop_len; - rc = mdb_cursor_first(&mc, &key, &data); - for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { - txnid_t id = *(txnid_t *)key.mv_data; - ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; - MDB_ID save; - - mdb_tassert(txn, len >= 0 && id <= env->me_pglast); - key.mv_data = &id; - if (len > mop_len) { - len = mop_len; - data.mv_size = (len + 1) * sizeof(MDB_ID); - } - data.mv_data = mop -= len; - save = mop[0]; - mop[0] = len; - rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); - mop[0] = save; - if (rc || !(mop_len -= len)) - break; - } - } - return rc; -} - -/** Flush (some) dirty pages to the map, after clearing their dirty flag. - * @param[in] txn the transaction that's being committed - * @param[in] keep number of initial pages in dirty_list to keep dirty. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_flush(MDB_txn *txn, int keep) -{ - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned psize = env->me_psize, j; - int i, pagecount = dl[0].mid, rc; - size_t size = 0, pos = 0; - pgno_t pgno = 0; - MDB_page *dp = NULL; -#ifdef _WIN32 - OVERLAPPED ov; -#else - struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos = 0, wsize = 0, wres; - size_t next_pos = 1; /* impossible pos, so pos != next_pos */ - int n = 0; -#endif - - j = i = keep; - - if (env->me_flags & MDB_WRITEMAP) { - /* Clear dirty flags */ - while (++i <= pagecount) { - dp = dl[i].mptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE|P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[++j] = dl[i]; - continue; - } - dp->mp_flags &= ~P_DIRTY; - } - goto done; - } - - /* Write the pages */ - for (;;) { - if (++i <= pagecount) { - dp = dl[i].mptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE|P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[i].mid = 0; - continue; - } - pgno = dl[i].mid; - /* clear dirty flag */ - dp->mp_flags &= ~P_DIRTY; - pos = pgno * psize; - size = psize; - if (IS_OVERFLOW(dp)) size *= dp->mp_pages; - } -#ifdef _WIN32 - else break; - - /* Windows actually supports scatter/gather I/O, but only on - * unbuffered file handles. Since we're relying on the OS page - * cache for all our data, that's self-defeating. So we just - * write pages one at a time. We use the ov structure to set - * the write offset, to at least save the overhead of a Seek - * system call. - */ - DPRINTF(("committing page %"Z"u", pgno)); - memset(&ov, 0, sizeof(ov)); - ov.Offset = pos & 0xffffffff; - ov.OffsetHigh = pos >> 16 >> 16; - if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { - rc = ErrCode(); - DPRINTF(("WriteFile: %d", rc)); - return rc; - } -#else - /* Write up to MDB_COMMIT_PAGES dirty pages at a time. 
*/ - if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { - if (n) { - /* Write previous page(s) */ -#ifdef MDB_USE_PWRITEV - wres = pwritev(env->me_fd, iov, n, wpos); -#else - if (n == 1) { - wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); - } else { - if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { - rc = ErrCode(); - DPRINTF(("lseek: %s", strerror(rc))); - return rc; - } - wres = writev(env->me_fd, iov, n); - } -#endif - if (wres != wsize) { - if (wres < 0) { - rc = ErrCode(); - DPRINTF(("Write error: %s", strerror(rc))); - } else { - rc = EIO; /* TODO: Use which error code? */ - DPUTS("short write, filesystem full?"); - } - return rc; - } - n = 0; - } - if (i > pagecount) - break; - wpos = pos; - wsize = 0; - } - DPRINTF(("committing page %"Z"u", pgno)); - next_pos = pos + size; - iov[n].iov_len = size; - iov[n].iov_base = (char *)dp; - wsize += size; - n++; -#endif /* _WIN32 */ - } - - /* MIPS has cache coherency issues, this is a no-op everywhere else - * Note: for any size >= on-chip cache size, entire on-chip cache is - * flushed. - */ - CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); - - for (i = keep; ++i <= pagecount; ) { - dp = dl[i].mptr; - /* This is a page we skipped above */ - if (!dl[i].mid) { - dl[++j] = dl[i]; - dl[j].mid = dp->mp_pgno; - continue; - } - mdb_dpage_free(env, dp); - } - -done: - i--; - txn->mt_dirty_room += i - j; - dl[0].mid = j; - return MDB_SUCCESS; -} - -int -mdb_txn_commit(MDB_txn *txn) -{ - int rc; - unsigned int i; - MDB_env *env; - - if (txn == NULL || txn->mt_env == NULL) - return EINVAL; - - if (txn->mt_child) { - rc = mdb_txn_commit(txn->mt_child); - txn->mt_child = NULL; - if (rc) - goto fail; - } - - env = txn->mt_env; - - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - mdb_dbis_update(txn, 1); - txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */ - mdb_txn_abort(txn); - return MDB_SUCCESS; - } - - if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { - DPUTS("error flag is set, can't commit"); - if (txn->mt_parent) - txn->mt_parent->mt_flags |= MDB_TXN_ERROR; - rc = MDB_BAD_TXN; - goto fail; - } - - if (txn->mt_parent) { - MDB_txn *parent = txn->mt_parent; - MDB_page **lp; - MDB_ID2L dst, src; - MDB_IDL pspill; - unsigned x, y, len, ps_len; - - /* Append our free list to parent's */ - rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); - if (rc) - goto fail; - mdb_midl_free(txn->mt_free_pgs); - /* Failures after this must either undo the changes - * to the parent or set MDB_TXN_ERROR in the parent. - */ - - parent->mt_next_pgno = txn->mt_next_pgno; - parent->mt_flags = txn->mt_flags; - - /* Merge our cursors into parent's and close them */ - mdb_cursors_close(txn, 1); - - /* Update parent's DB table. 
-		 */
-		memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
-		parent->mt_numdbs = txn->mt_numdbs;
-		parent->mt_dbflags[0] = txn->mt_dbflags[0];
-		parent->mt_dbflags[1] = txn->mt_dbflags[1];
-		for (i=2; i<txn->mt_numdbs; i++) {
-			/* preserve parent's DB_NEW status */
-			x = parent->mt_dbflags[i] & DB_NEW;
-			parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
-		}
-
-		dst = parent->mt_u.dirty_list;
-		src = txn->mt_u.dirty_list;
-		/* Remove anything in our dirty list from parent's spill list */
-		if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
-			x = y = ps_len;
-			pspill[0] = (pgno_t)-1;
-			/* Mark our dirty pages as deleted in parent spill list */
-			for (i=0, len=src[0].mid; ++i <= len; ) {
-				MDB_ID pn = src[i].mid << 1;
-				while (pn > pspill[x])
-					x--;
-				if (pn == pspill[x]) {
-					pspill[x] = 1;
-					y = --x;
-				}
-			}
-			/* Squash deleted pagenums if we deleted any */
-			for (x=y; ++x <= ps_len; )
-				if (!(pspill[x] & 1))
-					pspill[++y] = pspill[x];
-			pspill[0] = y;
-		}
-
-		/* Find len = length of merging our dirty list with parent's */
-		x = dst[0].mid;
-		dst[0].mid = 0;		/* simplify loops */
-		if (parent->mt_parent) {
-			len = x + src[0].mid;
-			y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
-			for (i = x; y && i; y--) {
-				pgno_t yp = src[y].mid;
-				while (yp < dst[i].mid)
-					i--;
-				if (yp == dst[i].mid) {
-					i--;
-					len--;
-				}
-			}
-		} else { /* Simplify the above for single-ancestor case */
-			len = MDB_IDL_UM_MAX - txn->mt_dirty_room;
-		}
-		/* Merge our dirty list with parent's */
-		y = src[0].mid;
-		for (i = len; y; dst[i--] = src[y--]) {
-			pgno_t yp = src[y].mid;
-			while (yp < dst[x].mid)
-				dst[i--] = dst[x--];
-			if (yp == dst[x].mid)
-				free(dst[x--].mptr);
-		}
-		mdb_tassert(txn, i == x);
-		dst[0].mid = len;
-		free(txn->mt_u.dirty_list);
-		parent->mt_dirty_room = txn->mt_dirty_room;
-		if (txn->mt_spill_pgs) {
-			if (parent->mt_spill_pgs) {
-				/* TODO: Prevent failure here, so parent does not fail */
-				rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
-				if (rc)
-					parent->mt_flags |= MDB_TXN_ERROR;
-				mdb_midl_free(txn->mt_spill_pgs);
-				mdb_midl_sort(parent->mt_spill_pgs);
-			} else {
-				parent->mt_spill_pgs = txn->mt_spill_pgs;
-			}
-		}
-
-		/* Append our loose page list to parent's */
-		for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp))
-			;
-		*lp = txn->mt_loose_pgs;
-		parent->mt_loose_count += txn->mt_loose_count;
-
-		parent->mt_child = NULL;
-		mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
-		free(txn);
-		return rc;
-	}
-
-	if (txn != env->me_txn) {
-		DPUTS("attempt to commit unknown transaction");
-		rc = EINVAL;
-		goto fail;
-	}
-
-	mdb_cursors_close(txn, 0);
-
-	if (!txn->mt_u.dirty_list[0].mid &&
-		!(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS)))
-		goto done;
-
-	DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u",
-	    txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));
-
-	/* Update DB root pointers */
-	if (txn->mt_numdbs > 2) {
-		MDB_cursor mc;
-		MDB_dbi i;
-		MDB_val data;
-		data.mv_size = sizeof(MDB_db);
-
-		mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
-		for (i = 2; i < txn->mt_numdbs; i++) {
-			if (txn->mt_dbflags[i] & DB_DIRTY) {
-				if (TXN_DBI_CHANGED(txn, i)) {
-					rc = MDB_BAD_DBI;
-					goto fail;
-				}
-				data.mv_data = &txn->mt_dbs[i];
-				rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
-				if (rc)
-					goto fail;
-			}
-		}
-	}
-
-	rc = mdb_freelist_save(txn);
-	if (rc)
-		goto fail;
-
-	mdb_midl_free(env->me_pghead);
-	env->me_pghead = NULL;
-	if (mdb_midl_shrink(&txn->mt_free_pgs))
-		env->me_free_pgs =
txn->mt_free_pgs; - -#if (MDB_DEBUG) > 2 - mdb_audit(txn); -#endif - - if ((rc = mdb_page_flush(txn, 0)) || - (rc = mdb_env_sync(env, 0)) || - (rc = mdb_env_write_meta(txn))) - goto fail; - - /* Free P_LOOSE pages left behind in dirty_list */ - if (!(env->me_flags & MDB_WRITEMAP)) - mdb_dlist_free(txn); - -done: - env->me_pglast = 0; - env->me_txn = NULL; - mdb_dbis_update(txn, 1); - - if (env->me_txns) - UNLOCK_MUTEX_W(env); - free(txn); - - return MDB_SUCCESS; - -fail: - mdb_txn_abort(txn); - return rc; -} - -/** Read the environment parameters of a DB environment before - * mapping it into memory. - * @param[in] env the environment handle - * @param[out] meta address of where to store the meta information - * @return 0 on success, non-zero on failure. - */ -static int ESECT -mdb_env_read_header(MDB_env *env, MDB_meta *meta) -{ - MDB_metabuf pbuf; - MDB_page *p; - MDB_meta *m; - int i, rc, off; - enum { Size = sizeof(pbuf) }; - - /* We don't know the page size yet, so use a minimum value. - * Read both meta pages so we can use the latest one. - */ - - for (i=off=0; i<2; i++, off = meta->mm_psize) { -#ifdef _WIN32 - DWORD len; - OVERLAPPED ov; - memset(&ov, 0, sizeof(ov)); - ov.Offset = off; - rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; - if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) - rc = 0; -#else - rc = pread(env->me_fd, &pbuf, Size, off); -#endif - if (rc != Size) { - if (rc == 0 && off == 0) - return ENOENT; - rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; - DPRINTF(("read: %s", mdb_strerror(rc))); - return rc; - } - - p = (MDB_page *)&pbuf; - - if (!F_ISSET(p->mp_flags, P_META)) { - DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); - return MDB_INVALID; - } - - m = METADATA(p); - if (m->mm_magic != MDB_MAGIC) { - DPUTS("meta has invalid magic"); - return MDB_INVALID; - } - - if (m->mm_version != MDB_DATA_VERSION) { - DPRINTF(("database is version %u, expected version %u", - m->mm_version, MDB_DATA_VERSION)); - return MDB_VERSION_MISMATCH; - } - - if (off == 0 || m->mm_txnid > meta->mm_txnid) - *meta = *m; - } - return 0; -} - -static void ESECT -mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) -{ - meta->mm_magic = MDB_MAGIC; - meta->mm_version = MDB_DATA_VERSION; - meta->mm_mapsize = env->me_mapsize; - meta->mm_psize = env->me_psize; - meta->mm_last_pg = 1; - meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; - meta->mm_dbs[0].md_root = P_INVALID; - meta->mm_dbs[1].md_root = P_INVALID; -} - -/** Write the environment parameters of a freshly created DB environment. - * @param[in] env the environment handle - * @param[out] meta address of where to store the meta information - * @return 0 on success, non-zero on failure. 
- */ -static int ESECT -mdb_env_init_meta(MDB_env *env, MDB_meta *meta) -{ - MDB_page *p, *q; - int rc; - unsigned int psize; -#ifdef _WIN32 - DWORD len; - OVERLAPPED ov; - memset(&ov, 0, sizeof(ov)); -#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ - ov.Offset = pos; \ - rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) -#else - int len; -#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ - len = pwrite(fd, ptr, size, pos); \ - rc = (len >= 0); } while(0) -#endif - - DPUTS("writing new meta page"); - - psize = env->me_psize; - - mdb_env_init_meta0(env, meta); - - p = calloc(2, psize); - p->mp_pgno = 0; - p->mp_flags = P_META; - *(MDB_meta *)METADATA(p) = *meta; - - q = (MDB_page *)((char *)p + psize); - q->mp_pgno = 1; - q->mp_flags = P_META; - *(MDB_meta *)METADATA(q) = *meta; - - DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); - if (!rc) - rc = ErrCode(); - else if ((unsigned) len == psize * 2) - rc = MDB_SUCCESS; - else - rc = ENOSPC; - free(p); - return rc; -} - -/** Update the environment info to commit a transaction. - * @param[in] txn the transaction that's being committed - * @return 0 on success, non-zero on failure. - */ -static int -mdb_env_write_meta(MDB_txn *txn) -{ - MDB_env *env; - MDB_meta meta, metab, *mp; - size_t mapsize; - off_t off; - int rc, len, toggle; - char *ptr; - HANDLE mfd; -#ifdef _WIN32 - OVERLAPPED ov; -#else - int r2; -#endif - - toggle = txn->mt_txnid & 1; - DPRINTF(("writing meta page %d for root page %"Z"u", - toggle, txn->mt_dbs[MAIN_DBI].md_root)); - - env = txn->mt_env; - mp = env->me_metas[toggle]; - mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; - /* Persist any increases of mapsize config */ - if (mapsize < env->me_mapsize) - mapsize = env->me_mapsize; - - if (env->me_flags & MDB_WRITEMAP) { - mp->mm_mapsize = mapsize; - mp->mm_dbs[0] = txn->mt_dbs[0]; - mp->mm_dbs[1] = txn->mt_dbs[1]; - mp->mm_last_pg = txn->mt_next_pgno - 1; - mp->mm_txnid = txn->mt_txnid; - if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { - unsigned meta_size = env->me_psize; - rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; - ptr = env->me_map; - if (toggle) { -#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ - if (meta_size < env->me_os_psize) - meta_size += meta_size; - else -#endif - ptr += meta_size; - } - if (MDB_MSYNC(ptr, meta_size, rc)) { - rc = ErrCode(); - goto fail; - } - } - goto done; - } - metab.mm_txnid = env->me_metas[toggle]->mm_txnid; - metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; - - meta.mm_mapsize = mapsize; - meta.mm_dbs[0] = txn->mt_dbs[0]; - meta.mm_dbs[1] = txn->mt_dbs[1]; - meta.mm_last_pg = txn->mt_next_pgno - 1; - meta.mm_txnid = txn->mt_txnid; - - off = offsetof(MDB_meta, mm_mapsize); - ptr = (char *)&meta + off; - len = sizeof(MDB_meta) - off; - if (toggle) - off += env->me_psize; - off += PAGEHDRSZ; - - /* Write to the SYNC fd */ - mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? - env->me_fd : env->me_mfd; -#ifdef _WIN32 - { - memset(&ov, 0, sizeof(ov)); - ov.Offset = off; - if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) - rc = -1; - } -#else - rc = pwrite(mfd, ptr, len, off); -#endif - if (rc != len) { - rc = rc < 0 ? ErrCode() : EIO; - DPUTS("write failed, disk error?"); - /* On a failure, the pagecache still contains the new data. - * Write some old data back, to prevent it from being used. - * Use the non-SYNC fd; we know it will fail anyway. 
- */ - meta.mm_last_pg = metab.mm_last_pg; - meta.mm_txnid = metab.mm_txnid; -#ifdef _WIN32 - memset(&ov, 0, sizeof(ov)); - ov.Offset = off; - WriteFile(env->me_fd, ptr, len, NULL, &ov); -#else - r2 = pwrite(env->me_fd, ptr, len, off); - (void)r2; /* Silence warnings. We don't care about pwrite's return value */ -#endif -fail: - env->me_flags |= MDB_FATAL_ERROR; - return rc; - } - /* MIPS has cache coherency issues, this is a no-op everywhere else */ - CACHEFLUSH(env->me_map + off, len, DCACHE); -done: - /* Memory ordering issues are irrelevant; since the entire writer - * is wrapped by wmutex, all of these changes will become visible - * after the wmutex is unlocked. Since the DB is multi-version, - * readers will get consistent data regardless of how fresh or - * how stale their view of these values is. - */ - if (env->me_txns) - env->me_txns->mti_txnid = txn->mt_txnid; - - return MDB_SUCCESS; -} - -/** Check both meta pages to see which one is newer. - * @param[in] env the environment handle - * @return meta toggle (0 or 1). - */ -static int -mdb_env_pick_meta(const MDB_env *env) -{ - return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); -} - -int ESECT -mdb_env_create(MDB_env **env) -{ - MDB_env *e; - - e = calloc(1, sizeof(MDB_env)); - if (!e) - return ENOMEM; - - e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = e->me_numdbs = 2; - e->me_fd = INVALID_HANDLE_VALUE; - e->me_lfd = INVALID_HANDLE_VALUE; - e->me_mfd = INVALID_HANDLE_VALUE; -#ifdef MDB_USE_POSIX_SEM - e->me_rmutex = SEM_FAILED; - e->me_wmutex = SEM_FAILED; -#endif - e->me_pid = getpid(); - GET_PAGESIZE(e->me_os_psize); - VGMEMP_CREATE(e,0,0); - *env = e; - return MDB_SUCCESS; -} - -static int ESECT -mdb_env_map(MDB_env *env, void *addr) -{ - MDB_page *p; - unsigned int flags = env->me_flags; -#ifdef _WIN32 - int rc; - HANDLE mh; - LONG sizelo, sizehi; - size_t msize; - - if (flags & MDB_RDONLY) { - /* Don't set explicit map size, use whatever exists */ - msize = 0; - sizelo = 0; - sizehi = 0; - } else { - msize = env->me_mapsize; - sizelo = msize & 0xffffffff; - sizehi = msize >> 16 >> 16; /* only needed on Win64 */ - - /* Windows won't create mappings for zero length files. - * and won't map more than the file size. - * Just set the maxsize right now. - */ - if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo - || !SetEndOfFile(env->me_fd) - || SetFilePointer(env->me_fd, 0, NULL, 0) != 0) - return ErrCode(); - } - - mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? - PAGE_READWRITE : PAGE_READONLY, - sizehi, sizelo, NULL); - if (!mh) - return ErrCode(); - env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? - FILE_MAP_WRITE : FILE_MAP_READ, - 0, 0, msize, addr); - rc = env->me_map ? 0 : ErrCode(); - CloseHandle(mh); - if (rc) - return rc; -#else - int prot = PROT_READ; - if (flags & MDB_WRITEMAP) { - prot |= PROT_WRITE; - if (ftruncate(env->me_fd, env->me_mapsize) < 0) - return ErrCode(); - } - env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, - env->me_fd, 0); - if (env->me_map == MAP_FAILED) { - env->me_map = NULL; - return ErrCode(); - } - - if (flags & MDB_NORDAHEAD) { - /* Turn off readahead. It's harmful when the DB is larger than RAM. 
*/ -#ifdef MADV_RANDOM - madvise(env->me_map, env->me_mapsize, MADV_RANDOM); -#else -#ifdef POSIX_MADV_RANDOM - posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); -#endif /* POSIX_MADV_RANDOM */ -#endif /* MADV_RANDOM */ - } -#endif /* _WIN32 */ - - /* Can happen because the address argument to mmap() is just a - * hint. mmap() can pick another, e.g. if the range is in use. - * The MAP_FIXED flag would prevent that, but then mmap could - * instead unmap existing pages to make room for the new map. - */ - if (addr && env->me_map != addr) - return EBUSY; /* TODO: Make a new MDB_* error code? */ - - p = (MDB_page *)env->me_map; - env->me_metas[0] = METADATA(p); - env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); - - return MDB_SUCCESS; -} - -int ESECT -mdb_env_set_mapsize(MDB_env *env, size_t size) -{ - /* If env is already open, caller is responsible for making - * sure there are no active txns. - */ - if (env->me_map) { - int rc; - void *old; - if (env->me_txn) - return EINVAL; - if (!size) - size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize; - else if (size < env->me_mapsize) { - /* If the configured size is smaller, make sure it's - * still big enough. Silently round up to minimum if not. - */ - size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize; - if (size < minsize) - size = minsize; - } - munmap(env->me_map, env->me_mapsize); - env->me_mapsize = size; - old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; - rc = mdb_env_map(env, old); - if (rc) - return rc; - } - env->me_mapsize = size; - if (env->me_psize) - env->me_maxpg = env->me_mapsize / env->me_psize; - return MDB_SUCCESS; -} - -int ESECT -mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) -{ - if (env->me_map) - return EINVAL; - env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */ - return MDB_SUCCESS; -} - -int ESECT -mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) -{ - if (env->me_map || readers < 1) - return EINVAL; - env->me_maxreaders = readers; - return MDB_SUCCESS; -} - -int ESECT -mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) -{ - if (!env || !readers) - return EINVAL; - *readers = env->me_maxreaders; - return MDB_SUCCESS; -} - -/** Further setup required for opening an LMDB environment - */ -static int ESECT -mdb_env_open2(MDB_env *env) -{ - unsigned int flags = env->me_flags; - int i, newenv = 0, rc; - MDB_meta meta; - -#ifdef _WIN32 - /* See if we should use QueryLimited */ - rc = GetVersion(); - if ((rc & 0xff) > 5) - env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; - else - env->me_pidquery = PROCESS_QUERY_INFORMATION; -#endif /* _WIN32 */ - - memset(&meta, 0, sizeof(meta)); - - if ((i = mdb_env_read_header(env, &meta)) != 0) { - if (i != ENOENT) - return i; - DPUTS("new mdbenv"); - newenv = 1; - env->me_psize = env->me_os_psize; - if (env->me_psize > MAX_PAGESIZE) - env->me_psize = MAX_PAGESIZE; - } else { - env->me_psize = meta.mm_psize; - } - - /* Was a mapsize configured? */ - if (!env->me_mapsize) { - /* If this is a new environment, take the default, - * else use the size recorded in the existing env. - */ - env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize; - } else if (env->me_mapsize < meta.mm_mapsize) { - /* If the configured size is smaller, make sure it's - * still big enough. Silently round up to minimum if not. 
- */ - size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; - if (env->me_mapsize < minsize) - env->me_mapsize = minsize; - } - - rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); - if (rc) - return rc; - - if (newenv) { - if (flags & MDB_FIXEDMAP) - meta.mm_address = env->me_map; - i = mdb_env_init_meta(env, &meta); - if (i != MDB_SUCCESS) { - return i; - } - } - - env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - - sizeof(indx_t); -#if !(MDB_MAXKEYSIZE) - env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); -#endif - env->me_maxpg = env->me_mapsize / env->me_psize; - -#if MDB_DEBUG - { - int toggle = mdb_env_pick_meta(env); - MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI]; - - DPRINTF(("opened database version %u, pagesize %u", - env->me_metas[0]->mm_version, env->me_psize)); - DPRINTF(("using meta page %d", toggle)); - DPRINTF(("depth: %u", db->md_depth)); - DPRINTF(("entries: %"Z"u", db->md_entries)); - DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); - DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); - DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); - DPRINTF(("root: %"Z"u", db->md_root)); - } -#endif - - return MDB_SUCCESS; -} - - -/** Release a reader thread's slot in the reader lock table. - * This function is called automatically when a thread exits. - * @param[in] ptr This points to the slot in the reader lock table. - */ -static void -mdb_env_reader_dest(void *ptr) -{ - MDB_reader *reader = ptr; - - reader->mr_pid = 0; -} - -#ifdef _WIN32 -/** Junk for arranging thread-specific callbacks on Windows. This is - * necessarily platform and compiler-specific. Windows supports up - * to 1088 keys. Let's assume nobody opens more than 64 environments - * in a single process, for now. They can override this if needed. - */ -#ifndef MAX_TLS_KEYS -#define MAX_TLS_KEYS 64 -#endif -static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS]; -static int mdb_tls_nkeys; - -static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) -{ - int i; - switch(reason) { - case DLL_PROCESS_ATTACH: break; - case DLL_THREAD_ATTACH: break; - case DLL_THREAD_DETACH: - for (i=0; ime_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; - -#ifdef _WIN32 - { - OVERLAPPED ov; - /* First acquire a shared lock. The Unlock will - * then release the existing exclusive lock. - */ - memset(&ov, 0, sizeof(ov)); - if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { - rc = ErrCode(); - } else { - UnlockFile(env->me_lfd, 0, 0, 1, 0); - *excl = 0; - } - } -#else - { - struct flock lock_info; - /* The shared lock replaces the existing lock */ - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_RDLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && - (rc = ErrCode()) == EINTR) ; - *excl = rc ? -1 : 0; /* error may mean we lost the lock */ - } -#endif - - return rc; -} - -/** Try to get exlusive lock, otherwise shared. - * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
- */ -static int ESECT -mdb_env_excl_lock(MDB_env *env, int *excl) -{ - int rc = 0; -#ifdef _WIN32 - if (LockFile(env->me_lfd, 0, 0, 1, 0)) { - *excl = 1; - } else { - OVERLAPPED ov; - memset(&ov, 0, sizeof(ov)); - if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { - *excl = 0; - } else { - rc = ErrCode(); - } - } -#else - struct flock lock_info; - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && - (rc = ErrCode()) == EINTR) ; - if (!rc) { - *excl = 1; - } else -# ifdef MDB_USE_POSIX_SEM - if (*excl < 0) /* always true when !MDB_USE_POSIX_SEM */ -# endif - { - lock_info.l_type = F_RDLCK; - while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && - (rc = ErrCode()) == EINTR) ; - if (rc == 0) - *excl = 0; - } -#endif - return rc; -} - -#ifdef MDB_USE_HASH -/* - * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code - * - * @(#) $Revision: 5.1 $ - * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ - * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ - * - * http://www.isthe.com/chongo/tech/comp/fnv/index.html - * - *** - * - * Please do not copyright this code. This code is in the public domain. - * - * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO - * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - * - * By: - * chongo /\oo/\ - * http://www.isthe.com/chongo/ - * - * Share and Enjoy! :-) - */ - -typedef unsigned long long mdb_hash_t; -#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) - -/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer - * @param[in] val value to hash - * @param[in] hval initial value for hash - * @return 64 bit hash - * - * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the - * hval arg on the first call. - */ -static mdb_hash_t -mdb_hash_val(MDB_val *val, mdb_hash_t hval) -{ - unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ - unsigned char *end = s + val->mv_size; - /* - * FNV-1a hash each octet of the string - */ - while (s < end) { - /* xor the bottom with the current octet */ - hval ^= (mdb_hash_t)*s++; - - /* multiply by the 64 bit FNV magic prime mod 2^64 */ - hval += (hval << 1) + (hval << 4) + (hval << 5) + - (hval << 7) + (hval << 8) + (hval << 40); - } - /* return our new hash value */ - return hval; -} - -/** Hash the string and output the encoded hash. - * This uses modified RFC1924 Ascii85 encoding to accommodate systems with - * very short name limits. We don't care about the encoding being reversible, - * we just want to preserve as many bits of the input as possible in a - * small printable string. 
- * @param[in] str string to hash - * @param[out] encbuf an array of 11 chars to hold the hash - */ -static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; - -static void -mdb_pack85(unsigned long l, char *out) -{ - int i; - - for (i=0; i<5; i++) { - *out++ = mdb_a85[l % 85]; - l /= 85; - } -} - -static void -mdb_hash_enc(MDB_val *val, char *encbuf) -{ - mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); - - mdb_pack85(h, encbuf); - mdb_pack85(h>>32, encbuf+5); - encbuf[10] = '\0'; -} -#endif - -/** Open and/or initialize the lock region for the environment. - * @param[in] env The LMDB environment. - * @param[in] lpath The pathname of the file used for the lock region. - * @param[in] mode The Unix permissions for the file, if we create it. - * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive - * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive - * @return 0 on success, non-zero on failure. - */ -static int ESECT -mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) -{ -#ifdef _WIN32 -# define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT -#else -# define MDB_ERRCODE_ROFS EROFS -#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ -# define MDB_CLOEXEC O_CLOEXEC -#else - int fdflags; -# define MDB_CLOEXEC 0 -#endif -#endif - int rc; - off_t size, rsize; - -#ifdef _WIN32 - env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, - FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); -#else - env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); -#endif - if (env->me_lfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { - return MDB_SUCCESS; - } - goto fail_errno; - } -#if ! ((MDB_CLOEXEC) || defined(_WIN32)) - /* Lose record locks when exec*() */ - if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_lfd, F_SETFD, fdflags); -#endif - - if (!(env->me_flags & MDB_NOTLS)) { - rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); - if (rc) - goto fail; - env->me_flags |= MDB_ENV_TXKEY; -#ifdef _WIN32 - /* Windows TLS callbacks need help finding their TLS info. */ - if (mdb_tls_nkeys >= MAX_TLS_KEYS) { - rc = MDB_TLS_FULL; - goto fail; - } - mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; -#endif - } - - /* Try to get exclusive lock. If we succeed, then - * nobody is using the lock region and we should initialize it. 
- */ - if ((rc = mdb_env_excl_lock(env, excl))) goto fail; - -#ifdef _WIN32 - size = GetFileSize(env->me_lfd, NULL); -#else - size = lseek(env->me_lfd, 0, SEEK_END); - if (size == -1) goto fail_errno; -#endif - rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); - if (size < rsize && *excl > 0) { -#ifdef _WIN32 - if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize - || !SetEndOfFile(env->me_lfd)) - goto fail_errno; -#else - if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; -#endif - } else { - rsize = size; - size = rsize - sizeof(MDB_txninfo); - env->me_maxreaders = size/sizeof(MDB_reader) + 1; - } - { -#ifdef _WIN32 - HANDLE mh; - mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, - 0, 0, NULL); - if (!mh) goto fail_errno; - env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); - CloseHandle(mh); - if (!env->me_txns) goto fail_errno; -#else - void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, - env->me_lfd, 0); - if (m == MAP_FAILED) goto fail_errno; - env->me_txns = m; -#endif - } - if (*excl > 0) { -#ifdef _WIN32 - BY_HANDLE_FILE_INFORMATION stbuf; - struct { - DWORD volume; - DWORD nhigh; - DWORD nlow; - } idbuf; - MDB_val val; - char encbuf[11]; - - if (!mdb_sec_inited) { - InitializeSecurityDescriptor(&mdb_null_sd, - SECURITY_DESCRIPTOR_REVISION); - SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); - mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); - mdb_all_sa.bInheritHandle = FALSE; - mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; - mdb_sec_inited = 1; - } - if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; - idbuf.volume = stbuf.dwVolumeSerialNumber; - idbuf.nhigh = stbuf.nFileIndexHigh; - idbuf.nlow = stbuf.nFileIndexLow; - val.mv_data = &idbuf; - val.mv_size = sizeof(idbuf); - mdb_hash_enc(&val, encbuf); - sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); - sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); - env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); - if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); - if (!env->me_wmutex) goto fail_errno; -#elif defined(MDB_USE_POSIX_SEM) - struct stat stbuf; - struct { - dev_t dev; - ino_t ino; - } idbuf; - MDB_val val; - char encbuf[11]; - -#if defined(__NetBSD__) -#define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ -#endif - if (fstat(env->me_lfd, &stbuf)) goto fail_errno; - idbuf.dev = stbuf.st_dev; - idbuf.ino = stbuf.st_ino; - val.mv_data = &idbuf; - val.mv_size = sizeof(idbuf); - mdb_hash_enc(&val, encbuf); -#ifdef MDB_SHORT_SEMNAMES - encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ -#endif - sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); - sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); - /* Clean up after a previous run, if needed: Try to - * remove both semaphores before doing anything else. 
- */ - sem_unlink(env->me_txns->mti_rmname); - sem_unlink(env->me_txns->mti_wmname); - env->me_rmutex = sem_open(env->me_txns->mti_rmname, - O_CREAT|O_EXCL, mode, 1); - if (env->me_rmutex == SEM_FAILED) goto fail_errno; - env->me_wmutex = sem_open(env->me_txns->mti_wmname, - O_CREAT|O_EXCL, mode, 1); - if (env->me_wmutex == SEM_FAILED) goto fail_errno; -#else /* MDB_USE_POSIX_SEM */ - pthread_mutexattr_t mattr; - - if ((rc = pthread_mutexattr_init(&mattr)) - || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) - || (rc = pthread_mutex_init(&env->me_txns->mti_mutex, &mattr)) - || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr))) - goto fail; - pthread_mutexattr_destroy(&mattr); -#endif /* _WIN32 || MDB_USE_POSIX_SEM */ - - env->me_txns->mti_magic = MDB_MAGIC; - env->me_txns->mti_format = MDB_LOCK_FORMAT; - env->me_txns->mti_txnid = 0; - env->me_txns->mti_numreaders = 0; - - } else { - if (env->me_txns->mti_magic != MDB_MAGIC) { - DPUTS("lock region has invalid magic"); - rc = MDB_INVALID; - goto fail; - } - if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { - DPRINTF(("lock region has format+version 0x%x, expected 0x%x", - env->me_txns->mti_format, MDB_LOCK_FORMAT)); - rc = MDB_VERSION_MISMATCH; - goto fail; - } - rc = ErrCode(); - if (rc && rc != EACCES && rc != EAGAIN) { - goto fail; - } -#ifdef _WIN32 - env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); - if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); - if (!env->me_wmutex) goto fail_errno; -#elif defined(MDB_USE_POSIX_SEM) - env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); - if (env->me_rmutex == SEM_FAILED) goto fail_errno; - env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); - if (env->me_wmutex == SEM_FAILED) goto fail_errno; -#endif - } - return MDB_SUCCESS; - -fail_errno: - rc = ErrCode(); -fail: - return rc; -} - - /** The name of the lock file in the DB environment */ -#define LOCKNAME "/lock.mdb" - /** The name of the data file in the DB environment */ -#define DATANAME "/data.mdb" - /** The suffix of the lock file when no subdir is used */ -#define LOCKSUFF "-lock" - /** Only a subset of the @ref mdb_env flags can be changed - * at runtime. Changing other flags requires closing the - * environment and re-opening it with the new flags. 
- */ -#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) -#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \ - MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) - -#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) -# error "Persistent DB flags & env flags overlap, but both go in mm_flags" -#endif - -int ESECT -mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) -{ - int oflags, rc, len, excl = -1; - char *lpath, *dpath; - - if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) - return EINVAL; - - len = strlen(path); - if (flags & MDB_NOSUBDIR) { - rc = len + sizeof(LOCKSUFF) + len + 1; - } else { - rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); - } - lpath = malloc(rc); - if (!lpath) - return ENOMEM; - if (flags & MDB_NOSUBDIR) { - dpath = lpath + len + sizeof(LOCKSUFF); - sprintf(lpath, "%s" LOCKSUFF, path); - strcpy(dpath, path); - } else { - dpath = lpath + len + sizeof(LOCKNAME); - sprintf(lpath, "%s" LOCKNAME, path); - sprintf(dpath, "%s" DATANAME, path); - } - - rc = MDB_SUCCESS; - flags |= env->me_flags; - if (flags & MDB_RDONLY) { - /* silently ignore WRITEMAP when we're only getting read access */ - flags &= ~MDB_WRITEMAP; - } else { - if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && - (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) - rc = ENOMEM; - } - env->me_flags = flags |= MDB_ENV_ACTIVE; - if (rc) - goto leave; - - env->me_path = strdup(path); - env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); - env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); - env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); - if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { - rc = ENOMEM; - goto leave; - } - - /* For RDONLY, get lockfile after we know datafile exists */ - if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); - if (rc) - goto leave; - } - -#ifdef _WIN32 - if (F_ISSET(flags, MDB_RDONLY)) { - oflags = GENERIC_READ; - len = OPEN_EXISTING; - } else { - oflags = GENERIC_READ|GENERIC_WRITE; - len = OPEN_ALWAYS; - } - mode = FILE_ATTRIBUTE_NORMAL; - env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, - NULL, len, mode, NULL); -#else - if (F_ISSET(flags, MDB_RDONLY)) - oflags = O_RDONLY; - else - oflags = O_RDWR | O_CREAT; - - env->me_fd = open(dpath, oflags, mode); -#endif - if (env->me_fd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } - - if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); - if (rc) - goto leave; - } - - if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { - if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { - env->me_mfd = env->me_fd; - } else { - /* Synchronous fd for meta writes. Needed even with - * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. 
-			 */
-#ifdef _WIN32
-			len = OPEN_EXISTING;
-			env->me_mfd = CreateFile(dpath, oflags,
-				FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len,
-				mode | FILE_FLAG_WRITE_THROUGH, NULL);
-#else
-			oflags &= ~O_CREAT;
-			env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode);
-#endif
-			if (env->me_mfd == INVALID_HANDLE_VALUE) {
-				rc = ErrCode();
-				goto leave;
-			}
-		}
-		DPRINTF(("opened dbenv %p", (void *) env));
-		if (excl > 0) {
-			rc = mdb_env_share_locks(env, &excl);
-			if (rc)
-				goto leave;
-		}
-		if (!((flags & MDB_RDONLY) ||
-			(env->me_pbuf = calloc(1, env->me_psize))))
-			rc = ENOMEM;
-	}
-
-leave:
-	if (rc) {
-		mdb_env_close0(env, excl);
-	}
-	free(lpath);
-	return rc;
-}
-
-/** Destroy resources from mdb_env_open(), clear our readers & DBIs */
-static void ESECT
-mdb_env_close0(MDB_env *env, int excl)
-{
-	int i;
-
-	if (!(env->me_flags & MDB_ENV_ACTIVE))
-		return;
-
-	/* Doing this here since me_dbxs may not exist during mdb_env_close */
-	for (i = env->me_maxdbs; --i > MAIN_DBI; )
-		free(env->me_dbxs[i].md_name.mv_data);
-
-	free(env->me_pbuf);
-	free(env->me_dbiseqs);
-	free(env->me_dbflags);
-	free(env->me_dbxs);
-	free(env->me_path);
-	free(env->me_dirty_list);
-	mdb_midl_free(env->me_free_pgs);
-
-	if (env->me_flags & MDB_ENV_TXKEY) {
-		pthread_key_delete(env->me_txkey);
-#ifdef _WIN32
-		/* Delete our key from the global list */
-		for (i=0; i<mdb_tls_nkeys; i++)
-			if (mdb_tls_keys[i] == env->me_txkey) {
-				mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1];
-				mdb_tls_nkeys--;
-				break;
-			}
-#endif
-	}
-
-	if (env->me_map) {
-		munmap(env->me_map, env->me_mapsize);
-	}
-	if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE)
-		(void) close(env->me_mfd);
-	if (env->me_fd != INVALID_HANDLE_VALUE)
-		(void) close(env->me_fd);
-	if (env->me_txns) {
-		MDB_PID_T pid = env->me_pid;
-		/* Clearing readers is done in this function because
-		 * me_txkey with its destructor must be disabled first.
-		 */
-		for (i = env->me_numreaders; --i >= 0; )
-			if (env->me_txns->mti_readers[i].mr_pid == pid)
-				env->me_txns->mti_readers[i].mr_pid = 0;
-#ifdef _WIN32
-		if (env->me_rmutex) {
-			CloseHandle(env->me_rmutex);
-			if (env->me_wmutex) CloseHandle(env->me_wmutex);
-		}
-		/* Windows automatically destroys the mutexes when
-		 * the last handle closes.
-		 */
-#elif defined(MDB_USE_POSIX_SEM)
-		if (env->me_rmutex != SEM_FAILED) {
-			sem_close(env->me_rmutex);
-			if (env->me_wmutex != SEM_FAILED)
-				sem_close(env->me_wmutex);
-			/* If we have the filelock:  If we are the
-			 * only remaining user, clean up semaphores.
-			 */
-			if (excl == 0)
-				mdb_env_excl_lock(env, &excl);
-			if (excl > 0) {
-				sem_unlink(env->me_txns->mti_rmname);
-				sem_unlink(env->me_txns->mti_wmname);
-			}
-		}
-#endif
-		munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
-	}
-	if (env->me_lfd != INVALID_HANDLE_VALUE) {
-#ifdef _WIN32
-		if (excl >= 0) {
-			/* Unlock the lockfile.  Windows would have unlocked it
-			 * after closing anyway, but not necessarily at once.
- */ - UnlockFile(env->me_lfd, 0, 0, 1, 0); - } -#endif - (void) close(env->me_lfd); - } - - env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); -} - - -void ESECT -mdb_env_close(MDB_env *env) -{ - MDB_page *dp; - - if (env == NULL) - return; - - VGMEMP_DESTROY(env); - while ((dp = env->me_dpages) != NULL) { - VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dpages = dp->mp_next; - free(dp); - } - - mdb_env_close0(env, 0); - free(env); -} - -/** Compare two items pointing at aligned size_t's */ -static int -mdb_cmp_long(const MDB_val *a, const MDB_val *b) -{ - return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : - *(size_t *)a->mv_data > *(size_t *)b->mv_data; -} - -/** Compare two items pointing at aligned unsigned int's */ -static int -mdb_cmp_int(const MDB_val *a, const MDB_val *b) -{ - return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : - *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; -} - -/** Compare two items pointing at unsigned ints of unknown alignment. - * Nodes and keys are guaranteed to be 2-byte aligned. - */ -static int -mdb_cmp_cint(const MDB_val *a, const MDB_val *b) -{ -#if BYTE_ORDER == LITTLE_ENDIAN - unsigned short *u, *c; - int x; - - u = (unsigned short *) ((char *) a->mv_data + a->mv_size); - c = (unsigned short *) ((char *) b->mv_data + a->mv_size); - do { - x = *--u - *--c; - } while(!x && u > (unsigned short *)a->mv_data); - return x; -#else - unsigned short *u, *c, *end; - int x; - - end = (unsigned short *) ((char *) a->mv_data + a->mv_size); - u = (unsigned short *)a->mv_data; - c = (unsigned short *)b->mv_data; - do { - x = *u++ - *c++; - } while(!x && u < end); - return x; -#endif -} - -/** Compare two items pointing at size_t's of unknown alignment. */ -#ifdef MISALIGNED_OK -# define mdb_cmp_clong mdb_cmp_long -#else -# define mdb_cmp_clong mdb_cmp_cint -#endif - -/** Compare two items lexically */ -static int -mdb_cmp_memn(const MDB_val *a, const MDB_val *b) -{ - int diff; - ssize_t len_diff; - unsigned int len; - - len = a->mv_size; - len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; - if (len_diff > 0) { - len = b->mv_size; - len_diff = 1; - } - - diff = memcmp(a->mv_data, b->mv_data, len); - return diff ? diff : len_diff<0 ? -1 : len_diff; -} - -/** Compare two items in reverse byte order */ -static int -mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) -{ - const unsigned char *p1, *p2, *p1_lim; - ssize_t len_diff; - int diff; - - p1_lim = (const unsigned char *)a->mv_data; - p1 = (const unsigned char *)a->mv_data + a->mv_size; - p2 = (const unsigned char *)b->mv_data + b->mv_size; - - len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; - if (len_diff > 0) { - p1_lim += len_diff; - len_diff = 1; - } - - while (p1 > p1_lim) { - diff = *--p1 - *--p2; - if (diff) - return diff; - } - return len_diff<0 ? -1 : len_diff; -} - -/** Search for key within a page, using binary search. - * Returns the smallest entry larger or equal to the key. - * If exactp is non-null, stores whether the found entry was an exact match - * in *exactp (1 or 0). - * Updates the cursor index with the index of the found entry. - * If no entry larger or equal to the key is found, returns NULL. 
- */ -static MDB_node * -mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) -{ - unsigned int i = 0, nkeys; - int low, high; - int rc = 0; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NULL; - MDB_val nodekey; - MDB_cmp_func *cmp; - DKBUF; - - nkeys = NUMKEYS(mp); - - DPRINTF(("searching %u keys in %s %spage %"Z"u", - nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp))); - - low = IS_LEAF(mp) ? 0 : 1; - high = nkeys - 1; - cmp = mc->mc_dbx->md_cmp; - - /* Branch pages have no data, so if using integer keys, - * alignment is guaranteed. Use faster mdb_cmp_int. - */ - if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { - if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) - cmp = mdb_cmp_long; - else - cmp = mdb_cmp_int; - } - - if (IS_LEAF2(mp)) { - nodekey.mv_size = mc->mc_db->md_pad; - node = NODEPTR(mp, 0); /* fake */ - while (low <= high) { - i = (low + high) >> 1; - nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); - rc = cmp(key, &nodekey); - DPRINTF(("found leaf index %u [%s], rc = %i", - i, DKEY(&nodekey), rc)); - if (rc == 0) - break; - if (rc > 0) - low = i + 1; - else - high = i - 1; - } - } else { - while (low <= high) { - i = (low + high) >> 1; - - node = NODEPTR(mp, i); - nodekey.mv_size = NODEKSZ(node); - nodekey.mv_data = NODEKEY(node); - - rc = cmp(key, &nodekey); -#if MDB_DEBUG - if (IS_LEAF(mp)) - DPRINTF(("found leaf index %u [%s], rc = %i", - i, DKEY(&nodekey), rc)); - else - DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", - i, DKEY(&nodekey), NODEPGNO(node), rc)); -#endif - if (rc == 0) - break; - if (rc > 0) - low = i + 1; - else - high = i - 1; - } - } - - if (rc > 0) { /* Found entry is less than the key. */ - i++; /* Skip to get the smallest entry larger than key. */ - if (!IS_LEAF2(mp)) - node = NODEPTR(mp, i); - } - if (exactp) - *exactp = (rc == 0 && nkeys > 0); - /* store the key index */ - mc->mc_ki[mc->mc_top] = i; - if (i >= nkeys) - /* There is no entry larger or equal to the key. */ - return NULL; - - /* nodeptr is fake for LEAF2 */ - return node; -} - -#if 0 -static void -mdb_cursor_adjust(MDB_cursor *mc, func) -{ - MDB_cursor *m2; - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { - func(mc, m2); - } - } -} -#endif - -/** Pop a page off the top of the cursor's stack. */ -static void -mdb_cursor_pop(MDB_cursor *mc) -{ - if (mc->mc_snum) { -#if MDB_DEBUG - MDB_page *top = mc->mc_pg[mc->mc_top]; -#endif - mc->mc_snum--; - if (mc->mc_snum) - mc->mc_top--; - - DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, - DDBI(mc), (void *) mc)); - } -} - -/** Push a page onto the top of the cursor's stack. */ -static int -mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) -{ - DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, - DDBI(mc), (void *) mc)); - - if (mc->mc_snum >= CURSOR_STACK) { - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CURSOR_FULL; - } - - mc->mc_top = mc->mc_snum++; - mc->mc_pg[mc->mc_top] = mp; - mc->mc_ki[mc->mc_top] = 0; - - return MDB_SUCCESS; -} - -/** Find the address of the page corresponding to a given page number. - * @param[in] txn the transaction for this access. - * @param[in] pgno the page number for the page to retrieve. - * @param[out] ret address of a pointer where the page's address will be stored. - * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) -{ - MDB_env *env = txn->mt_env; - MDB_page *p = NULL; - int level; - - if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { - MDB_txn *tx2 = txn; - level = 1; - do { - MDB_ID2L dl = tx2->mt_u.dirty_list; - unsigned x; - /* Spilled pages were dirtied in this txn and flushed - * because the dirty list got full. Bring this page - * back in from the map (but don't unspill it here, - * leave that unless page_touch happens again). - */ - if (tx2->mt_spill_pgs) { - MDB_ID pn = pgno << 1; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - p = (MDB_page *)(env->me_map + env->me_psize * pgno); - goto done; - } - } - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - p = dl[x].mptr; - goto done; - } - } - level++; - } while ((tx2 = tx2->mt_parent) != NULL); - } - - if (pgno < txn->mt_next_pgno) { - level = 0; - p = (MDB_page *)(env->me_map + env->me_psize * pgno); - } else { - DPRINTF(("page %"Z"u not found", pgno)); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_NOTFOUND; - } - -done: - *ret = p; - if (lvl) - *lvl = level; - return MDB_SUCCESS; -} - -/** Finish #mdb_page_search() / #mdb_page_search_lowest(). - * The cursor is at the root page, set up the rest of it. - */ -static int -mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int rc; - DKBUF; - - while (IS_BRANCH(mp)) { - MDB_node *node; - indx_t i; - - DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); - mdb_cassert(mc, NUMKEYS(mp) > 1); - DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); - - if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { - i = 0; - if (flags & MDB_PS_LAST) - i = NUMKEYS(mp) - 1; - } else { - int exact; - node = mdb_node_search(mc, key, &exact); - if (node == NULL) - i = NUMKEYS(mp) - 1; - else { - i = mc->mc_ki[mc->mc_top]; - if (!exact) { - mdb_cassert(mc, i > 0); - i--; - } - } - DPRINTF(("following index %u for key [%s]", i, DKEY(key))); - } - - mdb_cassert(mc, i < NUMKEYS(mp)); - node = NODEPTR(mp, i); - - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) - return rc; - - mc->mc_ki[mc->mc_top] = i; - if ((rc = mdb_cursor_push(mc, mp))) - return rc; - - if (flags & MDB_PS_MODIFY) { - if ((rc = mdb_page_touch(mc)) != 0) - return rc; - mp = mc->mc_pg[mc->mc_top]; - } - } - - if (!IS_LEAF(mp)) { - DPRINTF(("internal error, index points to a %02X page!?", - mp->mp_flags)); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } - - DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, - key ? DKEY(key) : "null")); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - return MDB_SUCCESS; -} - -/** Search for the lowest key under the current branch page. - * This just bypasses a NUMKEYS check in the current page - * before calling mdb_page_search_root(), because the callers - * are all in situations where the current page is known to - * be underfilled. 
- */ -static int -mdb_page_search_lowest(MDB_cursor *mc) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NODEPTR(mp, 0); - int rc; - - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) - return rc; - - mc->mc_ki[mc->mc_top] = 0; - if ((rc = mdb_cursor_push(mc, mp))) - return rc; - return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); -} - -/** Search for the page a given key should be in. - * Push it and its parent pages on the cursor stack. - * @param[in,out] mc the cursor for this operation. - * @param[in] key the key to search for, or NULL for first/last page. - * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB - * are touched (updated with new page numbers). - * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. - * This is used by #mdb_cursor_first() and #mdb_cursor_last(). - * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) -{ - int rc; - pgno_t root; - - /* Make sure the txn is still viable, then find the root from - * the txn's db table and set it as the root of the cursor's stack. - */ - if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { - DPUTS("transaction has failed, must abort"); - return MDB_BAD_TXN; - } else { - /* Make sure we're using an up-to-date root */ - if (*mc->mc_dbflag & DB_STALE) { - MDB_cursor mc2; - if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) - return MDB_BAD_DBI; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); - if (rc) - return rc; - { - MDB_val data; - int exact = 0; - uint16_t flags; - MDB_node *leaf = mdb_node_search(&mc2, - &mc->mc_dbx->md_name, &exact); - if (!exact) - return MDB_NOTFOUND; - rc = mdb_node_read(mc->mc_txn, leaf, &data); - if (rc) - return rc; - memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), - sizeof(uint16_t)); - /* The txn may not know this DBI, or another process may - * have dropped and recreated the DB with other flags. - */ - if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) - return MDB_INCOMPATIBLE; - memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); - } - *mc->mc_dbflag &= ~DB_STALE; - } - root = mc->mc_db->md_root; - - if (root == P_INVALID) { /* Tree is empty. */ - DPUTS("tree is empty"); - return MDB_NOTFOUND; - } - } - - mdb_cassert(mc, root > 1); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) - if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) - return rc; - - mc->mc_snum = 1; - mc->mc_top = 0; - - DPRINTF(("db %d root page %"Z"u has flags 0x%X", - DDBI(mc), root, mc->mc_pg[0]->mp_flags)); - - if (flags & MDB_PS_MODIFY) { - if ((rc = mdb_page_touch(mc))) - return rc; - } - - if (flags & MDB_PS_ROOTONLY) - return MDB_SUCCESS; - - return mdb_page_search_root(mc, key, flags); -} - -static int -mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) -{ - MDB_txn *txn = mc->mc_txn; - pgno_t pg = mp->mp_pgno; - unsigned x = 0, ovpages = mp->mp_pages; - MDB_env *env = txn->mt_env; - MDB_IDL sl = txn->mt_spill_pgs; - MDB_ID pn = pg << 1; - int rc; - - DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); - /* If the page is dirty or on the spill list we just acquired it, - * so we should give it back to our current free list, if any. - * Otherwise put it onto the list of pages we freed in this txn. - * - * Won't create me_pghead: me_pglast must be inited along with it. 
- * Unsupported in nested txns: They would need to hide the page - * range in ancestor txns' dirty and spilled lists. - */ - if (env->me_pghead && - !txn->mt_parent && - ((mp->mp_flags & P_DIRTY) || - (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) - { - unsigned i, j; - pgno_t *mop; - MDB_ID2 *dl, ix, iy; - rc = mdb_midl_need(&env->me_pghead, ovpages); - if (rc) - return rc; - if (!(mp->mp_flags & P_DIRTY)) { - /* This page is no longer spilled */ - if (x == sl[0]) - sl[0]--; - else - sl[x] |= 1; - goto release; - } - /* Remove from dirty list */ - dl = txn->mt_u.dirty_list; - x = dl[0].mid--; - for (ix = dl[x]; ix.mptr != mp; ix = iy) { - if (x > 1) { - x--; - iy = dl[x]; - dl[x] = ix; - } else { - mdb_cassert(mc, x > 1); - j = ++(dl[0].mid); - dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } - } - if (!(env->me_flags & MDB_WRITEMAP)) - mdb_dpage_free(env, mp); -release: - /* Insert in me_pghead */ - mop = env->me_pghead; - j = mop[0] + ovpages; - for (i = mop[0]; i && mop[i] < pg; i--) - mop[j--] = mop[i]; - while (j>i) - mop[j--] = pg++; - mop[0] += ovpages; - } else { - rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); - if (rc) - return rc; - } - mc->mc_db->md_overflow_pages -= ovpages; - return 0; -} - -/** Return the data associated with a given node. - * @param[in] txn The transaction for this operation. - * @param[in] leaf The node being read. - * @param[out] data Updated to point to the node's data. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) -{ - MDB_page *omp; /* overflow page */ - pgno_t pgno; - int rc; - - if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { - data->mv_size = NODEDSZ(leaf); - data->mv_data = NODEDATA(leaf); - return MDB_SUCCESS; - } - - /* Read overflow data. - */ - data->mv_size = NODEDSZ(leaf); - memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { - DPRINTF(("read overflow page %"Z"u failed", pgno)); - return rc; - } - data->mv_data = METADATA(omp); - - return MDB_SUCCESS; -} - -int -mdb_get(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data) -{ - MDB_cursor mc; - MDB_xcursor mx; - int exact = 0; - DKBUF; - - DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); - - if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; - - mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); -} - -/** Find a sibling for a page. - * Replaces the page at the top of the cursor's stack with the - * specified sibling, if one exists. - * @param[in] mc The cursor for this operation. - * @param[in] move_right Non-zero if the right sibling is requested, - * otherwise the left sibling. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_cursor_sibling(MDB_cursor *mc, int move_right) -{ - int rc; - MDB_node *indx; - MDB_page *mp; - - if (mc->mc_snum < 2) { - return MDB_NOTFOUND; /* root has no siblings */ - } - - mdb_cursor_pop(mc); - DPRINTF(("parent page is page %"Z"u, index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); - - if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { - DPRINTF(("no more keys left, moving to %s sibling", - move_right ? 
"right" : "left")); - if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { - /* undo cursor_pop before returning */ - mc->mc_top++; - mc->mc_snum++; - return rc; - } - } else { - if (move_right) - mc->mc_ki[mc->mc_top]++; - else - mc->mc_ki[mc->mc_top]--; - DPRINTF(("just moving to %s index key %u", - move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); - } - mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); - - indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { - /* mc will be inconsistent if caller does mc_snum++ as above */ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - return rc; - } - - mdb_cursor_push(mc, mp); - if (!move_right) - mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; - - return MDB_SUCCESS; -} - -/** Move the cursor to the next data item. */ -static int -mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) -{ - MDB_page *mp; - MDB_node *leaf; - int rc; - - if (mc->mc_flags & C_EOF) { - return MDB_NOTFOUND; - } - - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); - - mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_NEXT || op == MDB_NEXT_DUP) { - rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); - if (op != MDB_NEXT || rc != MDB_NOTFOUND) { - if (rc == MDB_SUCCESS) - MDB_GET_KEY(leaf, key); - return rc; - } - } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_NEXT_DUP) - return MDB_NOTFOUND; - } - } - - DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", - mdb_dbg_pgno(mp), (void *) mc)); - if (mc->mc_flags & C_DEL) - goto skip; - - if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { - DPUTS("=====> move to next sibling page"); - if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { - mc->mc_flags |= C_EOF; - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); - } else - mc->mc_ki[mc->mc_top]++; - -skip: - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc != MDB_SUCCESS) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Move the cursor to the previous data item. 
*/ -static int -mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) -{ - MDB_page *mp; - MDB_node *leaf; - int rc; - - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); - - mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_PREV || op == MDB_PREV_DUP) { - rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); - if (op != MDB_PREV || rc != MDB_NOTFOUND) { - if (rc == MDB_SUCCESS) { - MDB_GET_KEY(leaf, key); - mc->mc_flags &= ~C_EOF; - } - return rc; - } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_PREV_DUP) - return MDB_NOTFOUND; - } - } - } - - DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", - mdb_dbg_pgno(mp), (void *) mc)); - - if (mc->mc_ki[mc->mc_top] == 0) { - DPUTS("=====> move to prev sibling page"); - if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); - } else - mc->mc_ki[mc->mc_top]--; - - mc->mc_flags &= ~C_EOF; - - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc != MDB_SUCCESS) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Set the cursor on a specific data item. */ -static int -mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, - MDB_cursor_op op, int *exactp) -{ - int rc; - MDB_page *mp; - MDB_node *leaf = NULL; - DKBUF; - - if (key->mv_size == 0) - return MDB_BAD_VALSIZE; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - /* See if we're already on the right page */ - if (mc->mc_flags & C_INITIALIZED) { - MDB_val nodekey; - - mp = mc->mc_pg[mc->mc_top]; - if (!NUMKEYS(mp)) { - mc->mc_ki[mc->mc_top] = 0; - return MDB_NOTFOUND; - } - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_size = mc->mc_db->md_pad; - nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, 0); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* Probably happens rarely, but first node on the page - * was the one we wanted. 
- */ - mc->mc_ki[mc->mc_top] = 0; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc > 0) { - unsigned int i; - unsigned int nkeys = NUMKEYS(mp); - if (nkeys > 1) { - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - nkeys-1, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, nkeys-1); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* last node was the one we wanted */ - mc->mc_ki[mc->mc_top] = nkeys-1; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc < 0) { - if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { - /* This is definitely the right page, skip search_page */ - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - mc->mc_ki[mc->mc_top], nodekey.mv_size); - } else { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* current node was the one we wanted */ - if (exactp) - *exactp = 1; - goto set1; - } - } - rc = 0; - goto set2; - } - } - /* If any parents have right-sibs, search. - * Otherwise, there's nothing further. - */ - for (i=0; i<mc->mc_top; i++) - if (mc->mc_ki[i] < - NUMKEYS(mc->mc_pg[i])-1) - break; - if (i == mc->mc_top) { - /* There are no other pages */ - mc->mc_ki[mc->mc_top] = nkeys; - return MDB_NOTFOUND; - } - } - if (!mc->mc_top) { - /* There are no other pages */ - mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE && !exactp) { - rc = 0; - goto set1; - } else - return MDB_NOTFOUND; - } - } - - rc = mdb_page_search(mc, key, 0); - if (rc != MDB_SUCCESS) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - -set2: - leaf = mdb_node_search(mc, key, exactp); - if (exactp != NULL && !*exactp) { - /* MDB_SET specified and not an exact match.
*/ - return MDB_NOTFOUND; - } - - if (leaf == NULL) { - DPUTS("===> inexact leaf not found, goto sibling"); - if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) - return rc; /* no entries matched */ - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, 0); - } - -set1: - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - if (IS_LEAF2(mp)) { - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - } - return MDB_SUCCESS; - } - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - } else { - int ex2, *ex2p; - if (op == MDB_GET_BOTH) { - ex2p = &ex2; - ex2 = 0; - } else { - ex2p = NULL; - } - rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); - if (rc != MDB_SUCCESS) - return rc; - } - } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { - MDB_val d2; - if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS) - return rc; - rc = mc->mc_dbx->md_dcmp(data, &d2); - if (rc) { - if (op == MDB_GET_BOTH || rc > 0) - return MDB_NOTFOUND; - rc = 0; - *data = d2; - } - - } else { - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - } - } - - /* The key already matches in all other cases */ - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) - MDB_GET_KEY(leaf, key); - DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); - - return rc; -} - -/** Move the cursor to the first item in the database. */ -static int -mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) -{ - int rc; - MDB_node *leaf; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (rc != MDB_SUCCESS) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - - leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - mc->mc_ki[mc->mc_top] = 0; - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); - return MDB_SUCCESS; - } - - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc) - return rc; - } else { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - } - } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Move the cursor to the last item in the database. 
*/ -static int -mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) -{ - int rc; - MDB_node *leaf; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (!(mc->mc_flags & C_EOF)) { - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (rc != MDB_SUCCESS) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - - } - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; - mc->mc_flags |= C_INITIALIZED|C_EOF; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc) - return rc; - } else { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -int -mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, - MDB_cursor_op op) -{ - int rc; - int exact = 0; - int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); - - if (mc == NULL) - return EINVAL; - - if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; - - switch (op) { - case MDB_GET_CURRENT: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - } else { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int nkeys = NUMKEYS(mp); - if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { - mc->mc_ki[mc->mc_top] = nkeys; - rc = MDB_NOTFOUND; - break; - } - rc = MDB_SUCCESS; - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - } else { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY(leaf, key); - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (mc->mc_flags & C_DEL) - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); - } else { - rc = mdb_node_read(mc->mc_txn, leaf, data); - } - } - } - } - break; - case MDB_GET_BOTH: - case MDB_GET_BOTH_RANGE: - if (data == NULL) { - rc = EINVAL; - break; - } - if (mc->mc_xcursor == NULL) { - rc = MDB_INCOMPATIBLE; - break; - } - /* FALLTHRU */ - case MDB_SET: - case MDB_SET_KEY: - case MDB_SET_RANGE: - if (key == NULL) { - rc = EINVAL; - } else { - rc = mdb_cursor_set(mc, key, data, op, - op == MDB_SET_RANGE ? 
NULL : &exact); - } - break; - case MDB_GET_MULTIPLE: - if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } - rc = MDB_SUCCESS; - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || - (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) - break; - goto fetchm; - case MDB_NEXT_MULTIPLE: - if (data == NULL) { - rc = EINVAL; - break; - } - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); - if (rc == MDB_SUCCESS) { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - MDB_cursor *mx; -fetchm: - mx = &mc->mc_xcursor->mx_cursor; - data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * - mx->mc_db->md_pad; - data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); - mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; - } else { - rc = MDB_NOTFOUND; - } - } - break; - case MDB_NEXT: - case MDB_NEXT_DUP: - case MDB_NEXT_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, op); - break; - case MDB_PREV: - case MDB_PREV_DUP: - case MDB_PREV_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (rc) - break; - mc->mc_flags |= C_INITIALIZED; - mc->mc_ki[mc->mc_top]++; - } - rc = mdb_cursor_prev(mc, key, data, op); - break; - case MDB_FIRST: - rc = mdb_cursor_first(mc, key, data); - break; - case MDB_FIRST_DUP: - mfunc = mdb_cursor_first; - mmove: - if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - if (mc->mc_xcursor == NULL) { - rc = MDB_INCOMPATIBLE; - break; - } - { - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - MDB_GET_KEY(leaf, key); - if (data) - rc = mdb_node_read(mc->mc_txn, leaf, data); - break; - } - } - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); - break; - case MDB_LAST: - rc = mdb_cursor_last(mc, key, data); - break; - case MDB_LAST_DUP: - mfunc = mdb_cursor_last; - goto mmove; - default: - DPRINTF(("unhandled/unimplemented cursor operation %u", op)); - rc = EINVAL; - break; - } - - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - - return rc; -} - -/** Touch all the pages in the cursor stack. Set mc_top. - * Makes sure all the pages are writable, before attempting a write operation. - * @param[in] mc The cursor to operate on. 
- */ -static int -mdb_cursor_touch(MDB_cursor *mc) -{ - int rc = MDB_SUCCESS; - - if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { - MDB_cursor mc2; - MDB_xcursor mcx; - if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) - return MDB_BAD_DBI; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); - if (rc) - return rc; - *mc->mc_dbflag |= DB_DIRTY; - } - mc->mc_top = 0; - if (mc->mc_snum) { - do { - rc = mdb_page_touch(mc); - } while (!rc && ++(mc->mc_top) < mc->mc_snum); - mc->mc_top = mc->mc_snum-1; - } - return rc; -} - -/** Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDB_NOSPILL 0x8000 - -int -mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, - unsigned int flags) -{ - enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */ - MDB_env *env; - MDB_node *leaf = NULL; - MDB_page *fp, *mp; - uint16_t fp_flags; - MDB_val xdata, *rdata, dkey, olddata; - MDB_db dummy; - int do_sub = 0, insert_key, insert_data; - unsigned int mcount = 0, dcount = 0, nospill; - size_t nsize; - int rc, rc2; - unsigned int nflags; - DKBUF; - - if (mc == NULL || key == NULL) - return EINVAL; - - env = mc->mc_txn->mt_env; - - /* Check this first so counter will always be zero on any - * early failures. - */ - if (flags & MDB_MULTIPLE) { - dcount = data[1].mv_size; - data[1].mv_size = 0; - if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) - return MDB_INCOMPATIBLE; - } - - nospill = flags & MDB_NOSPILL; - flags &= ~MDB_NOSPILL; - - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - if (key->mv_size-1 >= ENV_MAXKEY(env)) - return MDB_BAD_VALSIZE; - -#if SIZE_MAX > MAXDATASIZE - if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) - return MDB_BAD_VALSIZE; -#else - if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) - return MDB_BAD_VALSIZE; -#endif - - DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", - DDBI(mc), DKEY(key), key ? 
key->mv_size : 0, data->mv_size)); - - dkey.mv_size = 0; - - if (flags == MDB_CURRENT) { - if (!(mc->mc_flags & C_INITIALIZED)) - return EINVAL; - rc = MDB_SUCCESS; - } else if (mc->mc_db->md_root == P_INVALID) { - /* new database, cursor has nothing to point to */ - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - rc = MDB_NO_ROOT; - } else { - int exact = 0; - MDB_val d2; - if (flags & MDB_APPEND) { - MDB_val k2; - rc = mdb_cursor_last(mc, &k2, &d2); - if (rc == 0) { - rc = mc->mc_dbx->md_cmp(key, &k2); - if (rc > 0) { - rc = MDB_NOTFOUND; - mc->mc_ki[mc->mc_top]++; - } else { - /* new key is <= last key */ - rc = MDB_KEYEXIST; - } - } - } else { - rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); - } - if ((flags & MDB_NOOVERWRITE) && rc == 0) { - DPRINTF(("duplicate key [%s]", DKEY(key))); - *data = d2; - return MDB_KEYEXIST; - } - if (rc && rc != MDB_NOTFOUND) - return rc; - } - - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - - /* Cursor is positioned, check for room in the dirty list */ - if (!nospill) { - if (flags & MDB_MULTIPLE) { - rdata = &xdata; - xdata.mv_size = data->mv_size * dcount; - } else { - rdata = data; - } - if ((rc2 = mdb_page_spill(mc, key, rdata))) - return rc2; - } - - if (rc == MDB_NO_ROOT) { - MDB_page *np; - /* new database, write a root leaf page */ - DPUTS("allocating new root leaf page"); - if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { - return rc2; - } - mdb_cursor_push(mc, np); - mc->mc_db->md_root = np->mp_pgno; - mc->mc_db->md_depth++; - *mc->mc_dbflag |= DB_DIRTY; - if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) - == MDB_DUPFIXED) - np->mp_flags |= P_LEAF2; - mc->mc_flags |= C_INITIALIZED; - } else { - /* make sure all cursor pages are writable */ - rc2 = mdb_cursor_touch(mc); - if (rc2) - return rc2; - } - - insert_key = insert_data = rc; - if (insert_key) { - /* The key does not exist */ - DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - LEAFSIZE(key, data) > env->me_nodemax) - { - /* Too big for a node, insert in sub-DB. Set up an empty - * "old sub-page" for prep_subDB to expand to a full page. - */ - fp_flags = P_LEAF|P_DIRTY; - fp = env->me_pbuf; - fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ - fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); - olddata.mv_size = PAGEHDRSZ; - goto prep_subDB; - } - } else { - /* there's only a key anyway, so this is a no-op */ - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - char *ptr; - unsigned int ksize = mc->mc_db->md_pad; - if (key->mv_size != ksize) - return MDB_BAD_VALSIZE; - ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); - memcpy(ptr, key->mv_data, ksize); -fix_parent: - /* if overwriting slot 0 of leaf, need to - * update branch key if there is a parent page - */ - if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned short top = mc->mc_top; - mc->mc_top--; - /* slot 0 is always an empty key, find real slot */ - while (mc->mc_top && !mc->mc_ki[mc->mc_top]) - mc->mc_top--; - if (mc->mc_ki[mc->mc_top]) - rc2 = mdb_update_key(mc, key); - else - rc2 = MDB_SUCCESS; - mc->mc_top = top; - if (rc2) - return rc2; - } - return MDB_SUCCESS; - } - -more: - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - olddata.mv_size = NODEDSZ(leaf); - olddata.mv_data = NODEDATA(leaf); - - /* DB has dups? */ - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - /* Prepare (sub-)page/sub-DB to accept the new item, - * if needed. fp: old sub-page or a header faking - * it. 
mp: new (sub-)page. offset: growth in page - * size. xdata: node data with new page or DB. - */ - unsigned i, offset = 0; - mp = fp = xdata.mv_data = env->me_pbuf; - mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; - - /* Was a single item before, must convert now */ - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - /* Just overwrite the current item */ - if (flags == MDB_CURRENT) - goto current; - -#if UINT_MAX < SIZE_MAX - if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) - mc->mc_dbx->md_dcmp = mdb_cmp_clong; -#endif - /* does data match? */ - if (!mc->mc_dbx->md_dcmp(data, &olddata)) { - if (flags & MDB_NODUPDATA) - return MDB_KEYEXIST; - /* overwrite it */ - goto current; - } - - /* Back up original data item */ - dkey.mv_size = olddata.mv_size; - dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); - - /* Make sub-page header for the dup items, with dummy body */ - fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; - fp->mp_lower = (PAGEHDRSZ-PAGEBASE); - xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp->mp_flags |= P_LEAF2; - fp->mp_pad = data->mv_size; - xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ - } else { - xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.mv_size & 1) + (data->mv_size & 1); - } - fp->mp_upper = xdata.mv_size - PAGEBASE; - olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ - } else if (leaf->mn_flags & F_SUBDATA) { - /* Data is on sub-DB, just store it */ - flags |= F_DUPDATA|F_SUBDATA; - goto put_sub; - } else { - /* Data is on sub-page */ - fp = olddata.mv_data; - switch (flags) { - default: - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - offset = EVEN(NODESIZE + sizeof(indx_t) + - data->mv_size); - break; - } - offset = fp->mp_pad; - if (SIZELEFT(fp) < offset) { - offset *= 4; /* space for 4 more */ - break; - } - /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ - case MDB_CURRENT: - fp->mp_flags |= P_DIRTY; - COPY_PGNO(fp->mp_pgno, mp->mp_pgno); - mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; - flags |= F_DUPDATA; - goto put_sub; - } - xdata.mv_size = olddata.mv_size + offset; - } - - fp_flags = fp->mp_flags; - if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { - /* Too big for a sub-page, convert to sub-DB */ - fp_flags &= ~P_SUBP; -prep_subDB: - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp_flags |= P_LEAF2; - dummy.md_pad = fp->mp_pad; - dummy.md_flags = MDB_DUPFIXED; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - dummy.md_flags |= MDB_INTEGERKEY; - } else { - dummy.md_pad = 0; - dummy.md_flags = 0; - } - dummy.md_depth = 1; - dummy.md_branch_pages = 0; - dummy.md_leaf_pages = 1; - dummy.md_overflow_pages = 0; - dummy.md_entries = NUMKEYS(fp); - xdata.mv_size = sizeof(MDB_db); - xdata.mv_data = &dummy; - if ((rc = mdb_page_alloc(mc, 1, &mp))) - return rc; - offset = env->me_psize - olddata.mv_size; - flags |= F_DUPDATA|F_SUBDATA; - dummy.md_root = mp->mp_pgno; - } - if (mp != fp) { - mp->mp_flags = fp_flags | P_DIRTY; - mp->mp_pad = fp->mp_pad; - mp->mp_lower = fp->mp_lower; - mp->mp_upper = fp->mp_upper + offset; - if (fp_flags & P_LEAF2) { - memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); - } else { - memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, - olddata.mv_size - fp->mp_upper - PAGEBASE); - for (i=0; i<NUMKEYS(fp); i++) - mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; - } - } - - rdata = &xdata; - flags |= F_DUPDATA; - do_sub = 1; - if (!insert_key) - mdb_node_del(mc, 0); - goto new_sub; - } -current: -
/* overflow page overwrites need special handling */ - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); - - memcpy(&pg, olddata.mv_data, sizeof(pg)); - if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) - return rc2; - ovpages = omp->mp_pages; - - /* Is the ov page large enough? */ - if (ovpages >= dpages) { - if (!(omp->mp_flags & P_DIRTY) && - (level || (env->me_flags & MDB_WRITEMAP))) - { - rc = mdb_page_unspill(mc->mc_txn, omp, &omp); - if (rc) - return rc; - level = 0; /* dirty in this txn or clean */ - } - /* Is it dirty? */ - if (omp->mp_flags & P_DIRTY) { - /* yes, overwrite it. Note in this case we don't - * bother to try shrinking the page if the new data - * is smaller than the overflow threshold. - */ - if (level > 1) { - /* It is writable only in a parent txn */ - size_t sz = (size_t) env->me_psize * ovpages, off; - MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); - MDB_ID2 id2; - if (!np) - return ENOMEM; - id2.mid = pg; - id2.mptr = np; - rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); - mdb_cassert(mc, rc2 == 0); - if (!(flags & MDB_RESERVE)) { - /* Copy end of page, adjusting alignment so - * compiler may copy words instead of bytes. - */ - off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); - memcpy((size_t *)((char *)np + off), - (size_t *)((char *)omp + off), sz - off); - sz = PAGEHDRSZ; - } - memcpy(np, omp, sz); /* Copy beginning of page */ - omp = np; - } - SETDSZ(leaf, data->mv_size); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = METADATA(omp); - else - memcpy(METADATA(omp), data->mv_data, data->mv_size); - return MDB_SUCCESS; - } - } - if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) - return rc2; - } else if (data->mv_size == olddata.mv_size) { - /* same size, just replace it. Note that we could - * also reuse this node if the new data is smaller, - * but instead we opt to shrink the node in that case. - */ - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = olddata.mv_data; - else if (!(mc->mc_flags & C_SUB)) - memcpy(olddata.mv_data, data->mv_data, data->mv_size); - else { - memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); - goto fix_parent; - } - return MDB_SUCCESS; - } - mdb_node_del(mc, 0); - } - - rdata = data; - -new_sub: - nflags = flags & NODE_ADD_FLAGS; - nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); - if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { - if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) - nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ - if (!insert_key) - nflags |= MDB_SPLIT_REPLACE; - rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); - } else { - /* There is room already in this leaf page. */ - rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); - if (rc == 0 && insert_key) { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { - m3->mc_ki[i]++; - } - } - } - } - - if (rc == MDB_SUCCESS) { - /* Now store the actual data in the child DB. Note that we're - * storing the user data in the keys field, so there are strict - * size limits on dupdata. 
The actual data fields of the child - * DB are all zero size. - */ - if (do_sub) { - int xflags; - size_t ecount; -put_sub: - xdata.mv_size = 0; - xdata.mv_data = ""; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (flags & MDB_CURRENT) { - xflags = MDB_CURRENT|MDB_NOSPILL; - } else { - mdb_xcursor_init1(mc, leaf); - xflags = (flags & MDB_NODUPDATA) ? - MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; - } - /* converted, write the original data first */ - if (dkey.mv_size) { - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); - if (rc) - goto bad_sub; - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { - mdb_xcursor_init1(m2, leaf); - } - } - } - /* we've done our job */ - dkey.mv_size = 0; - } - ecount = mc->mc_xcursor->mx_db.md_entries; - if (flags & MDB_APPENDDUP) - xflags |= MDB_APPEND; - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); - if (flags & F_SUBDATA) { - void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } - insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; - } - /* Increment count unless we just replaced an existing item. */ - if (insert_data) - mc->mc_db->md_entries++; - if (insert_key) { - /* Invalidate txn if we created an empty sub-DB */ - if (rc) - goto bad_sub; - /* If we succeeded and the key didn't exist before, - * make sure the cursor is marked valid. - */ - mc->mc_flags |= C_INITIALIZED; - } - if (flags & MDB_MULTIPLE) { - if (!rc) { - mcount++; - /* let caller know how many succeeded, if any */ - data[1].mv_size = mcount; - if (mcount < dcount) { - data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; - insert_key = insert_data = 0; - goto more; - } - } - } - return rc; -bad_sub: - if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ - rc = MDB_CORRUPTED; - } - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_cursor_del(MDB_cursor *mc, unsigned int flags) -{ - MDB_node *leaf; - MDB_page *mp; - int rc; - - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; - - if (!(mc->mc_flags & C_INITIALIZED)) - return EINVAL; - - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - return MDB_NOTFOUND; - - if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) - return rc; - - rc = mdb_cursor_touch(mc); - if (rc) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - if (IS_LEAF2(mp)) - goto del_key; - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (flags & MDB_NODUPDATA) { - /* mdb_cursor_del0() will subtract the final entry */ - mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; - } else { - if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); - if (rc) - return rc; - /* If sub-DB still has entries, we're done */ - if (mc->mc_xcursor->mx_db.md_entries) { - if (leaf->mn_flags & F_SUBDATA) { - /* update subDB info */ - void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } else { - MDB_cursor *m2; - /* shrink fake page */ - mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - /* fix other sub-DB cursors pointed at this fake page */ - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mp && - m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - } - mc->mc_db->md_entries--; - mc->mc_flags |= C_DEL; - return rc; - } - /* otherwise fall thru and delete the sub-DB */ - } - - if (leaf->mn_flags & F_SUBDATA) { - /* add all the child DB's pages to the free list */ - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (rc) - goto fail; - } - } - - /* add overflow pages to free list */ - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || - (rc = mdb_ovpage_free(mc, omp))) - goto fail; - } - -del_key: - return mdb_cursor_del0(mc); - -fail: - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -/** Allocate and initialize new pages for a database. - * @param[in] mc a cursor on the database being added to. - * @param[in] flags flags defining what type of page is being allocated. - * @param[in] num the number of pages to allocate. This is usually 1, - * unless allocating overflow pages for a large record. - * @param[out] mp Address of a page, or NULL on failure. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) -{ - MDB_page *np; - int rc; - - if ((rc = mdb_page_alloc(mc, num, &np))) - return rc; - DPRINTF(("allocated new mpage %"Z"u, page size %u", - np->mp_pgno, mc->mc_txn->mt_env->me_psize)); - np->mp_flags = flags | P_DIRTY; - np->mp_lower = (PAGEHDRSZ-PAGEBASE); - np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; - - if (IS_BRANCH(np)) - mc->mc_db->md_branch_pages++; - else if (IS_LEAF(np)) - mc->mc_db->md_leaf_pages++; - else if (IS_OVERFLOW(np)) { - mc->mc_db->md_overflow_pages += num; - np->mp_pages = num; - } - *mp = np; - - return 0; -} - -/** Calculate the size of a leaf node. - * The size depends on the environment's page size; if a data item - * is too large it will be put onto an overflow page and the node - * size will only include the key and not the data. 
Sizes are always - * rounded up to an even number of bytes, to guarantee 2-byte alignment - * of the #MDB_node headers. - * @param[in] env The environment handle. - * @param[in] key The key for the node. - * @param[in] data The data for the node. - * @return The number of bytes needed to store the node. - */ -static size_t -mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) -{ - size_t sz; - - sz = LEAFSIZE(key, data); - if (sz > env->me_nodemax) { - /* put on overflow page */ - sz -= data->mv_size - sizeof(pgno_t); - } - - return EVEN(sz + sizeof(indx_t)); -} - -/** Calculate the size of a branch node. - * The size should depend on the environment's page size but since - * we currently don't support spilling large keys onto overflow - * pages, it's simply the size of the #MDB_node header plus the - * size of the key. Sizes are always rounded up to an even number - * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. - * @param[in] env The environment handle. - * @param[in] key The key for the node. - * @return The number of bytes needed to store the node. - */ -static size_t -mdb_branch_size(MDB_env *env, MDB_val *key) -{ - size_t sz; - - sz = INDXSIZE(key); - if (sz > env->me_nodemax) { - /* put on overflow page */ - /* not implemented */ - /* sz -= key->size - sizeof(pgno_t); */ - } - - return sz + sizeof(indx_t); -} - -/** Add a node to the page pointed to by the cursor. - * @param[in] mc The cursor for this operation. - * @param[in] indx The index on the page where the new node should be added. - * @param[in] key The key for the new node. - * @param[in] data The data for the new node, if any. - * @param[in] pgno The page number, if adding a branch node. - * @param[in] flags Flags for the node. - * @return 0 on success, non-zero on failure. Possible errors are: - *
 - *	<li>ENOMEM - failed to allocate overflow pages for the node.
 - *	<li>MDB_PAGE_FULL - there is insufficient room in the page. This error
 - *	should never happen since all callers already calculate the
 - *	page's free space before calling this function.
 - * </ul>
- */ -static int -mdb_node_add(MDB_cursor *mc, indx_t indx, - MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags) -{ - unsigned int i; - size_t node_size = NODESIZE; - ssize_t room; - indx_t ofs; - MDB_node *node; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_page *ofp = NULL; /* overflow page */ - DKBUF; - - mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); - - DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", - IS_LEAF(mp) ? "leaf" : "branch", - IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, - key ? key->mv_size : 0, key ? DKEY(key) : "null")); - - if (IS_LEAF2(mp)) { - /* Move higher keys up one slot. */ - int ksize = mc->mc_db->md_pad, dif; - char *ptr = LEAF2KEY(mp, indx, ksize); - dif = NUMKEYS(mp) - indx; - if (dif > 0) - memmove(ptr+ksize, ptr, dif*ksize); - /* insert new key */ - memcpy(ptr, key->mv_data, ksize); - - /* Just using these for counting */ - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - return MDB_SUCCESS; - } - - room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); - if (key != NULL) - node_size += key->mv_size; - if (IS_LEAF(mp)) { - mdb_cassert(mc, data); - if (F_ISSET(flags, F_BIGDATA)) { - /* Data already on overflow page. */ - node_size += sizeof(pgno_t); - } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { - int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); - int rc; - /* Put data on overflow page. */ - DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", - data->mv_size, node_size+data->mv_size)); - node_size = EVEN(node_size + sizeof(pgno_t)); - if ((ssize_t)node_size > room) - goto full; - if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) - return rc; - DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); - flags |= F_BIGDATA; - goto update; - } else { - node_size += data->mv_size; - } - } - node_size = EVEN(node_size); - if ((ssize_t)node_size > room) - goto full; - -update: - /* Move higher pointers up one slot. */ - for (i = NUMKEYS(mp); i > indx; i--) - mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; - - /* Adjust free space offsets. */ - ofs = mp->mp_upper - node_size; - mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); - mp->mp_ptrs[indx] = ofs; - mp->mp_upper = ofs; - mp->mp_lower += sizeof(indx_t); - - /* Write the node data. */ - node = NODEPTR(mp, indx); - node->mn_ksize = (key == NULL) ? 
0 : key->mv_size; - node->mn_flags = flags; - if (IS_LEAF(mp)) - SETDSZ(node,data->mv_size); - else - SETPGNO(node,pgno); - - if (key) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - if (IS_LEAF(mp)) { - mdb_cassert(mc, key); - if (ofp == NULL) { - if (F_ISSET(flags, F_BIGDATA)) - memcpy(node->mn_data + key->mv_size, data->mv_data, - sizeof(pgno_t)); - else if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = node->mn_data + key->mv_size; - else - memcpy(node->mn_data + key->mv_size, data->mv_data, - data->mv_size); - } else { - memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno, - sizeof(pgno_t)); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = METADATA(ofp); - else - memcpy(METADATA(ofp), data->mv_data, data->mv_size); - } - } - - return MDB_SUCCESS; - -full: - DPRINTF(("not enough room in page %"Z"u, got %u ptrs", - mdb_dbg_pgno(mp), NUMKEYS(mp))); - DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); - DPRINTF(("node size = %"Z"u", node_size)); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_FULL; -} - -/** Delete the specified node from a page. - * @param[in] mc Cursor pointing to the node to delete. - * @param[in] ksize The size of a node. Only used if the page is - * part of a #MDB_DUPFIXED database. - */ -static void -mdb_node_del(MDB_cursor *mc, int ksize) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - indx_t indx = mc->mc_ki[mc->mc_top]; - unsigned int sz; - indx_t i, j, numkeys, ptr; - MDB_node *node; - char *base; - - DPRINTF(("delete node %u on %s page %"Z"u", indx, - IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); - numkeys = NUMKEYS(mp); - mdb_cassert(mc, indx < numkeys); - - if (IS_LEAF2(mp)) { - int x = numkeys - 1 - indx; - base = LEAF2KEY(mp, indx, ksize); - if (x) - memmove(base, base + ksize, x * ksize); - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += ksize - sizeof(indx_t); - return; - } - - node = NODEPTR(mp, indx); - sz = NODESIZE + node->mn_ksize; - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - sz += sizeof(pgno_t); - else - sz += NODEDSZ(node); - } - sz = EVEN(sz); - - ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < numkeys; i++) { - if (i != indx) { - mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) - mp->mp_ptrs[j] += sz; - j++; - } - } - - base = (char *)mp + mp->mp_upper + PAGEBASE; - memmove(base + sz, base, ptr - mp->mp_upper); - - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += sz; -} - -/** Compact the main page after deleting a node on a subpage. - * @param[in] mp The main page to operate on. - * @param[in] indx The index of the subpage on the main page. 
- */ -static void -mdb_node_shrink(MDB_page *mp, indx_t indx) -{ - MDB_node *node; - MDB_page *sp, *xp; - char *base; - int nsize, delta; - indx_t i, numkeys, ptr; - - node = NODEPTR(mp, indx); - sp = (MDB_page *)NODEDATA(node); - delta = SIZELEFT(sp); - xp = (MDB_page *)((char *)sp + delta); - - /* shift subpage upward */ - if (IS_LEAF2(sp)) { - nsize = NUMKEYS(sp) * sp->mp_pad; - if (nsize & 1) - return; /* do not make the node uneven-sized */ - memmove(METADATA(xp), METADATA(sp), nsize); - } else { - int i; - numkeys = NUMKEYS(sp); - for (i=numkeys-1; i>=0; i--) - xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; - } - xp->mp_upper = sp->mp_lower; - xp->mp_lower = sp->mp_lower; - xp->mp_flags = sp->mp_flags; - xp->mp_pad = sp->mp_pad; - COPY_PGNO(xp->mp_pgno, mp->mp_pgno); - - nsize = NODEDSZ(node) - delta; - SETDSZ(node, nsize); - - /* shift lower nodes upward */ - ptr = mp->mp_ptrs[indx]; - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] += delta; - } - - base = (char *)mp + mp->mp_upper + PAGEBASE; - memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node)); - mp->mp_upper += delta; -} - -/** Initial setup of a sorted-dups cursor. - * Sorted duplicates are implemented as a sub-database for the given key. - * The duplicate data items are actually keys of the sub-database. - * Operations on the duplicate data items are performed using a sub-cursor - * initialized when the sub-database is first accessed. This function does - * the preliminary setup of the sub-cursor, filling in the fields that - * depend only on the parent DB. - * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. - */ -static void -mdb_xcursor_init0(MDB_cursor *mc) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - mx->mx_cursor.mc_xcursor = NULL; - mx->mx_cursor.mc_txn = mc->mc_txn; - mx->mx_cursor.mc_db = &mx->mx_db; - mx->mx_cursor.mc_dbx = &mx->mx_dbx; - mx->mx_cursor.mc_dbi = mc->mc_dbi; - mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - mx->mx_dbx.md_name.mv_size = 0; - mx->mx_dbx.md_name.mv_data = NULL; - mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; - mx->mx_dbx.md_dcmp = NULL; - mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; -} - -/** Final setup of a sorted-dups cursor. - * Sets up the fields that depend on the data from the main cursor. - * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. - * @param[in] node The data containing the #MDB_db record for the - * sorted-dup database. 
- */ -static void -mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - if (node->mn_flags & F_SUBDATA) { - memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); - mx->mx_cursor.mc_pg[0] = 0; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - } else { - MDB_page *fp = NODEDATA(node); - mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad; - mx->mx_db.md_flags = 0; - mx->mx_db.md_depth = 1; - mx->mx_db.md_branch_pages = 0; - mx->mx_db.md_leaf_pages = 1; - mx->mx_db.md_overflow_pages = 0; - mx->mx_db.md_entries = NUMKEYS(fp); - COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; - mx->mx_cursor.mc_pg[0] = fp; - mx->mx_cursor.mc_ki[0] = 0; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - mx->mx_db.md_flags = MDB_DUPFIXED; - mx->mx_db.md_pad = fp->mp_pad; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - mx->mx_db.md_flags |= MDB_INTEGERKEY; - } - } - DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, - mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ -#if UINT_MAX < SIZE_MAX - if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) - mx->mx_dbx.md_cmp = mdb_cmp_clong; -#endif -} - -/** Initialize a cursor for a given transaction and database. */ -static void -mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) -{ - mc->mc_next = NULL; - mc->mc_backup = NULL; - mc->mc_dbi = dbi; - mc->mc_txn = txn; - mc->mc_db = &txn->mt_dbs[dbi]; - mc->mc_dbx = &txn->mt_dbxs[dbi]; - mc->mc_dbflag = &txn->mt_dbflags[dbi]; - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_pg[0] = 0; - mc->mc_flags = 0; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - mdb_tassert(txn, mx != NULL); - mc->mc_xcursor = mx; - mdb_xcursor_init0(mc); - } else { - mc->mc_xcursor = NULL; - } - if (*mc->mc_dbflag & DB_STALE) { - mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); - } -} - -int -mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) -{ - MDB_cursor *mc; - size_t size = sizeof(MDB_cursor); - - if (!ret || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; - - /* Allow read access to the freelist */ - if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - return EINVAL; - - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) - size += sizeof(MDB_xcursor); - - if ((mc = malloc(size)) != NULL) { - mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); - if (txn->mt_cursors) { - mc->mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = mc; - mc->mc_flags |= C_UNTRACK; - } - } else { - return ENOMEM; - } - - *ret = mc; - - return MDB_SUCCESS; -} - -int -mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) -{ - if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi)) - return EINVAL; - - if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) - return EINVAL; - - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; - - mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); - return MDB_SUCCESS; -} - -/* Return the count of duplicate data items for the current key */ -int -mdb_cursor_count(MDB_cursor *mc, size_t *countp) -{ - MDB_node *leaf; - - if (mc == NULL || countp == NULL) - return EINVAL; - - if (mc->mc_xcursor == NULL) - return MDB_INCOMPATIBLE; - - if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; - - if (!(mc->mc_flags & C_INITIALIZED)) - return EINVAL; - - if (!mc->mc_snum || (mc->mc_flags & C_EOF)) - return 
MDB_NOTFOUND; - - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - *countp = 1; - } else { - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - return EINVAL; - - *countp = mc->mc_xcursor->mx_db.md_entries; - } - return MDB_SUCCESS; -} - -void -mdb_cursor_close(MDB_cursor *mc) -{ - if (mc && !mc->mc_backup) { - /* remove from txn, if tracked */ - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; - } - free(mc); - } -} - -MDB_txn * -mdb_cursor_txn(MDB_cursor *mc) -{ - if (!mc) return NULL; - return mc->mc_txn; -} - -MDB_dbi -mdb_cursor_dbi(MDB_cursor *mc) -{ - return mc->mc_dbi; -} - -/** Replace the key for a branch node with a new key. - * @param[in] mc Cursor pointing to the node to operate on. - * @param[in] key The new key to use. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_update_key(MDB_cursor *mc, MDB_val *key) -{ - MDB_page *mp; - MDB_node *node; - char *base; - size_t len; - int delta, ksize, oksize; - indx_t ptr, i, numkeys, indx; - DKBUF; - - indx = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - node = NODEPTR(mp, indx); - ptr = mp->mp_ptrs[indx]; -#if MDB_DEBUG - { - MDB_val k2; - char kbuf2[DKBUF_MAXKEYSIZE*2+1]; - k2.mv_data = NODEKEY(node); - k2.mv_size = node->mn_ksize; - DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", - indx, ptr, - mdb_dkey(&k2, kbuf2), - DKEY(key), - mp->mp_pgno)); - } -#endif - - /* Sizes must be 2-byte aligned. */ - ksize = EVEN(key->mv_size); - oksize = EVEN(node->mn_ksize); - delta = ksize - oksize; - - /* Shift node contents if EVEN(key length) changed. */ - if (delta) { - if (delta > 0 && SIZELEFT(mp) < delta) { - pgno_t pgno; - /* not enough space left, do a delete and split */ - DPRINTF(("Not enough room, delta = %d, splitting...", delta)); - pgno = NODEPGNO(node); - mdb_node_del(mc, 0); - return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); - } - - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] -= delta; - } - - base = (char *)mp + mp->mp_upper + PAGEBASE; - len = ptr - mp->mp_upper + NODESIZE; - memmove(base - delta, base, len); - mp->mp_upper -= delta; - - node = NODEPTR(mp, indx); - } - - /* But even if no shift was needed, update ksize */ - if (node->mn_ksize != key->mv_size) - node->mn_ksize = key->mv_size; - - if (key->mv_size) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - return MDB_SUCCESS; -} - -static void -mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); - -/** Move a node from csrc to cdst. - */ -static int -mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) -{ - MDB_node *srcnode; - MDB_val key, data; - pgno_t srcpg; - MDB_cursor mn; - int rc; - unsigned short flags; - - DKBUF; - - /* Mark src and dst as dirty. 
*/ - if ((rc = mdb_page_touch(csrc)) || - (rc = mdb_page_touch(cdst))) - return rc; - - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); - data.mv_size = 0; - data.mv_data = NULL; - srcpg = 0; - flags = 0; - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); - mdb_cassert(csrc, !((size_t)srcnode & 1)); - srcpg = NODEPGNO(srcnode); - flags = srcnode->mn_flags; - if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - unsigned int snum = csrc->mc_snum; - MDB_node *s2; - /* must find the lowest key below src */ - rc = mdb_page_search_lowest(csrc); - if (rc) - return rc; - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - csrc->mc_snum = snum--; - csrc->mc_top = snum; - } else { - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - } - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { - unsigned int snum = cdst->mc_snum; - MDB_node *s2; - MDB_val bkey; - /* must find the lowest key below dst */ - mdb_cursor_copy(cdst, &mn); - rc = mdb_page_search_lowest(&mn); - if (rc) - return rc; - if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { - bkey.mv_size = mn.mc_db->md_pad; - bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); - } else { - s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - bkey.mv_size = NODEKSZ(s2); - bkey.mv_data = NODEKEY(s2); - } - mn.mc_snum = snum--; - mn.mc_top = snum; - mn.mc_ki[snum] = 0; - rc = mdb_update_key(&mn, &bkey); - if (rc) - return rc; - } - - DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", - IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", - csrc->mc_ki[csrc->mc_top], - DKEY(&key), - csrc->mc_pg[csrc->mc_top]->mp_pgno, - cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); - - /* Add the node to the destination page. - */ - rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); - if (rc != MDB_SUCCESS) - return rc; - - /* Delete the node from the source page. - */ - mdb_node_del(csrc, key.mv_size); - - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mp = csrc->mc_pg[csrc->mc_top]; - - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == - csrc->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - } - } - } - - /* Update the parent separators. 
- */ - if (csrc->mc_ki[csrc->mc_top] == 0) { - if (csrc->mc_ki[csrc->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - DPRINTF(("update separator for source page %"Z"u to [%s]", - csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); - mdb_cursor_copy(csrc, &mn); - mn.mc_snum--; - mn.mc_top--; - if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) - return rc; - } - if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - MDB_val nullkey; - indx_t ix = csrc->mc_ki[csrc->mc_top]; - nullkey.mv_size = 0; - csrc->mc_ki[csrc->mc_top] = 0; - rc = mdb_update_key(csrc, &nullkey); - csrc->mc_ki[csrc->mc_top] = ix; - mdb_cassert(csrc, rc == MDB_SUCCESS); - } - } - - if (cdst->mc_ki[cdst->mc_top] == 0) { - if (cdst->mc_ki[cdst->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - DPRINTF(("update separator for destination page %"Z"u to [%s]", - cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); - mdb_cursor_copy(cdst, &mn); - mn.mc_snum--; - mn.mc_top--; - if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) - return rc; - } - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { - MDB_val nullkey; - indx_t ix = cdst->mc_ki[cdst->mc_top]; - nullkey.mv_size = 0; - cdst->mc_ki[cdst->mc_top] = 0; - rc = mdb_update_key(cdst, &nullkey); - cdst->mc_ki[cdst->mc_top] = ix; - mdb_cassert(csrc, rc == MDB_SUCCESS); - } - } - - return MDB_SUCCESS; -} - -/** Merge one page into another. - * The nodes from the page pointed to by \b csrc will - * be copied to the page pointed to by \b cdst and then - * the \b csrc page will be freed. - * @param[in] csrc Cursor pointing to the source page. - * @param[in] cdst Cursor pointing to the destination page. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) -{ - MDB_page *psrc, *pdst; - MDB_node *srcnode; - MDB_val key, data; - unsigned nkeys; - int rc; - indx_t i, j; - - psrc = csrc->mc_pg[csrc->mc_top]; - pdst = cdst->mc_pg[cdst->mc_top]; - - DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); - - mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ - mdb_cassert(csrc, cdst->mc_snum > 1); - - /* Mark dst as dirty. */ - if ((rc = mdb_page_touch(cdst))) - return rc; - - /* Move all nodes from src to dst. 
- */ - j = nkeys = NUMKEYS(pdst); - if (IS_LEAF2(psrc)) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = METADATA(psrc); - for (i = 0; i < NUMKEYS(psrc); i++, j++) { - rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); - if (rc != MDB_SUCCESS) - return rc; - key.mv_data = (char *)key.mv_data + key.mv_size; - } - } else { - for (i = 0; i < NUMKEYS(psrc); i++, j++) { - srcnode = NODEPTR(psrc, i); - if (i == 0 && IS_BRANCH(psrc)) { - MDB_cursor mn; - MDB_node *s2; - mdb_cursor_copy(csrc, &mn); - /* must find the lowest key below src */ - rc = mdb_page_search_lowest(&mn); - if (rc) - return rc; - if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { - key.mv_size = mn.mc_db->md_pad; - key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - } else { - key.mv_size = srcnode->mn_ksize; - key.mv_data = NODEKEY(srcnode); - } - - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); - if (rc != MDB_SUCCESS) - return rc; - } - } - - DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", - pdst->mp_pgno, NUMKEYS(pdst), - (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); - - /* Unlink the src page from parent and add to free list. - */ - csrc->mc_top--; - mdb_node_del(csrc, 0); - if (csrc->mc_ki[csrc->mc_top] == 0) { - key.mv_size = 0; - rc = mdb_update_key(csrc, &key); - if (rc) { - csrc->mc_top++; - return rc; - } - } - csrc->mc_top++; - - psrc = csrc->mc_pg[csrc->mc_top]; - /* If not operating on FreeDB, allow this page to be reused - * in this txn. Otherwise just add to free list. - */ - rc = mdb_page_loose(csrc, psrc); - if (rc) - return rc; - if (IS_LEAF(psrc)) - csrc->mc_db->md_leaf_pages--; - else - csrc->mc_db->md_branch_pages--; - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[csrc->mc_top] == psrc) { - m3->mc_pg[csrc->mc_top] = pdst; - m3->mc_ki[csrc->mc_top] += nkeys; - } - } - } - { - unsigned int snum = cdst->mc_snum; - uint16_t depth = cdst->mc_db->md_depth; - mdb_cursor_pop(cdst); - rc = mdb_rebalance(cdst); - /* Did the tree shrink? */ - if (depth > cdst->mc_db->md_depth) - snum--; - cdst->mc_snum = snum; - cdst->mc_top = snum-1; - } - return rc; -} - -/** Copy the contents of a cursor. - * @param[in] csrc The cursor to copy from. - * @param[out] cdst The cursor to copy to. - */ -static void -mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) -{ - unsigned int i; - - cdst->mc_txn = csrc->mc_txn; - cdst->mc_dbi = csrc->mc_dbi; - cdst->mc_db = csrc->mc_db; - cdst->mc_dbx = csrc->mc_dbx; - cdst->mc_snum = csrc->mc_snum; - cdst->mc_top = csrc->mc_top; - cdst->mc_flags = csrc->mc_flags; - - for (i=0; imc_snum; i++) { - cdst->mc_pg[i] = csrc->mc_pg[i]; - cdst->mc_ki[i] = csrc->mc_ki[i]; - } -} - -/** Rebalance the tree after a delete operation. - * @param[in] mc Cursor pointing to the page where rebalancing - * should begin. - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_rebalance(MDB_cursor *mc) -{ - MDB_node *node; - int rc; - unsigned int ptop, minkeys; - MDB_cursor mn; - indx_t oldki; - - minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); - DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", - IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); - - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && - NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); - return MDB_SUCCESS; - } - - if (mc->mc_snum < 2) { - MDB_page *mp = mc->mc_pg[0]; - if (IS_SUBP(mp)) { - DPUTS("Can't rebalance a subpage, ignoring"); - return MDB_SUCCESS; - } - if (NUMKEYS(mp) == 0) { - DPUTS("tree is completely empty"); - mc->mc_db->md_root = P_INVALID; - mc->mc_db->md_depth = 0; - mc->mc_db->md_leaf_pages = 0; - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (rc) - return rc; - /* Adjust cursors pointing to mp */ - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[0] == mp) { - m3->mc_snum = 0; - m3->mc_top = 0; - m3->mc_flags &= ~C_INITIALIZED; - } - } - } - } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { - int i; - DPUTS("collapsing root page!"); - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (rc) - return rc; - mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); - if (rc) - return rc; - mc->mc_db->md_depth--; - mc->mc_db->md_branch_pages--; - mc->mc_ki[0] = mc->mc_ki[1]; - for (i = 1; imc_db->md_depth; i++) { - mc->mc_pg[i] = mc->mc_pg[i+1]; - mc->mc_ki[i] = mc->mc_ki[i+1]; - } - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[0] == mp) { - m3->mc_snum--; - m3->mc_top--; - for (i=0; imc_snum; i++) { - m3->mc_pg[i] = m3->mc_pg[i+1]; - m3->mc_ki[i] = m3->mc_ki[i+1]; - } - } - } - } - } else - DPUTS("root page doesn't need rebalancing"); - return MDB_SUCCESS; - } - - /* The parent (branch page) must have at least 2 pointers, - * otherwise the tree is invalid. - */ - ptop = mc->mc_top-1; - mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); - - /* Leaf page fill factor is below the threshold. - * Try to move keys from left or right neighbor, or - * merge with a neighbor page. - */ - - /* Find neighbors. - */ - mdb_cursor_copy(mc, &mn); - mn.mc_xcursor = NULL; - - oldki = mc->mc_ki[mc->mc_top]; - if (mc->mc_ki[ptop] == 0) { - /* We're the leftmost leaf in our parent. - */ - DPUTS("reading right neighbor"); - mn.mc_ki[ptop]++; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); - if (rc) - return rc; - mn.mc_ki[mn.mc_top] = 0; - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - } else { - /* There is at least one neighbor to the left. 
- */ - DPUTS("reading left neighbor"); - mn.mc_ki[ptop]--; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); - if (rc) - return rc; - mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; - mc->mc_ki[mc->mc_top] = 0; - } - - DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", - mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); - - /* If the neighbor page is above threshold and has enough keys, - * move one key from it. Otherwise we should try to merge them. - * (A branch page must never have less than 2 keys.) - */ - minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { - rc = mdb_node_move(&mn, mc); - if (mc->mc_ki[ptop]) { - oldki++; - } - } else { - if (mc->mc_ki[ptop] == 0) { - rc = mdb_page_merge(&mn, mc); - } else { - oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); - mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - rc = mdb_page_merge(mc, &mn); - mdb_cursor_copy(&mn, mc); - } - mc->mc_flags &= ~C_EOF; - } - mc->mc_ki[mc->mc_top] = oldki; - return rc; -} - -/** Complete a delete operation started by #mdb_cursor_del(). */ -static int -mdb_cursor_del0(MDB_cursor *mc) -{ - int rc; - MDB_page *mp; - indx_t ki; - unsigned int nkeys; - - ki = mc->mc_ki[mc->mc_top]; - mdb_node_del(mc, mc->mc_db->md_pad); - mc->mc_db->md_entries--; - rc = mdb_rebalance(mc); - - if (rc == MDB_SUCCESS) { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - mp = mc->mc_pg[mc->mc_top]; - nkeys = NUMKEYS(mp); - - /* if mc points past last node in page, find next sibling */ - if (mc->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(mc, 1); - if (rc == MDB_NOTFOUND) { - mc->mc_flags |= C_EOF; - rc = MDB_SUCCESS; - } - } - - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3 == mc || m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= ki) { - m3->mc_flags |= C_DEL; - if (m3->mc_ki[mc->mc_top] > ki) - m3->mc_ki[mc->mc_top]--; - else if (mc->mc_db->md_flags & MDB_DUPSORT) - m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF; - } - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; - } - } - } - } - mc->mc_flags |= C_DEL; - } - - if (rc) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_del(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data) -{ - if (!key || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; - - if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { - /* must ignore any data */ - data = NULL; - } - - return mdb_del0(txn, dbi, key, data, 0); -} - -static int -mdb_del0(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, unsigned flags) -{ - MDB_cursor mc; - MDB_xcursor mx; - MDB_cursor_op op; - MDB_val rdata, *xdata; - int rc, exact = 0; - DKBUF; - - DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); - - mdb_cursor_init(&mc, txn, dbi, &mx); - - if (data) { - op = MDB_GET_BOTH; - rdata = *data; - xdata = &rdata; - } else { - op = MDB_SET; - xdata = NULL; - flags |= MDB_NODUPDATA; - } - rc = mdb_cursor_set(&mc, key, xdata, op, &exact); - if (rc == 0) { - /* let mdb_page_split know about this cursor if needed: - * delete will trigger a rebalance; if it needs to move - * a node from one page to another, it will have to - * update the parent's separator key(s). If the new sepkey - * is larger than the current one, the parent page may - * run out of space, triggering a split. We need this - * cursor to be consistent until the end of the rebalance. - */ - mc.mc_flags |= C_UNTRACK; - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - rc = mdb_cursor_del(&mc, flags); - txn->mt_cursors[dbi] = mc.mc_next; - } - return rc; -} - -/** Split a page and insert a new node. - * @param[in,out] mc Cursor pointing to the page and desired insertion index. - * The cursor will be updated to point to the actual page and index where - * the node got inserted after the split. - * @param[in] newkey The key for the newly inserted node. - * @param[in] newdata The data for the newly inserted node. - * @param[in] newpgno The page number, if the new node is a branch node. - * @param[in] nflags The #NODE_ADD_FLAGS for the new node. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, - unsigned int nflags) -{ - unsigned int flags; - int rc = MDB_SUCCESS, new_root = 0, did_split = 0; - indx_t newindx; - pgno_t pgno = 0; - int i, j, split_indx, nkeys, pmax; - MDB_env *env = mc->mc_txn->mt_env; - MDB_node *node; - MDB_val sepkey, rkey, xdata, *rdata = &xdata; - MDB_page *copy = NULL; - MDB_page *mp, *rp, *pp; - int ptop; - MDB_cursor mn; - DKBUF; - - mp = mc->mc_pg[mc->mc_top]; - newindx = mc->mc_ki[mc->mc_top]; - nkeys = NUMKEYS(mp); - - DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, - DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); - - /* Create a right sibling. */ - if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) - return rc; - DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); - - if (mc->mc_snum < 2) { - if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) - goto done; - /* shift current top to make room for new parent */ - mc->mc_pg[1] = mc->mc_pg[0]; - mc->mc_ki[1] = mc->mc_ki[0]; - mc->mc_pg[0] = pp; - mc->mc_ki[0] = 0; - mc->mc_db->md_root = pp->mp_pgno; - DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); - mc->mc_db->md_depth++; - new_root = 1; - - /* Add left (implicit) pointer. 
*/ - if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { - /* undo the pre-push */ - mc->mc_pg[0] = mc->mc_pg[1]; - mc->mc_ki[0] = mc->mc_ki[1]; - mc->mc_db->md_root = mp->mp_pgno; - mc->mc_db->md_depth--; - goto done; - } - mc->mc_snum = 2; - mc->mc_top = 1; - ptop = 0; - } else { - ptop = mc->mc_top-1; - DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); - } - - mc->mc_flags |= C_SPLITTING; - mdb_cursor_copy(mc, &mn); - mn.mc_pg[mn.mc_top] = rp; - mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; - - if (nflags & MDB_APPEND) { - mn.mc_ki[mn.mc_top] = 0; - sepkey = *newkey; - split_indx = newindx; - nkeys = 0; - } else { - - split_indx = (nkeys+1) / 2; - - if (IS_LEAF2(rp)) { - char *split, *ins; - int x; - unsigned int lsize, rsize, ksize; - /* Move half of the keys to the right sibling */ - x = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_pad; - split = LEAF2KEY(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); - mp->mp_lower -= lsize; - rp->mp_lower += lsize; - mp->mp_upper += rsize - lsize; - rp->mp_upper -= rsize - lsize; - sepkey.mv_size = ksize; - if (newindx == split_indx) { - sepkey.mv_data = newkey->mv_data; - } else { - sepkey.mv_data = split; - } - if (x<0) { - ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); - memcpy(rp->mp_ptrs, split, rsize); - sepkey.mv_data = rp->mp_ptrs; - memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); - memcpy(ins, newkey->mv_data, ksize); - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - } else { - if (x) - memcpy(rp->mp_ptrs, split, x * ksize); - ins = LEAF2KEY(rp, x, ksize); - memcpy(ins, newkey->mv_data, ksize); - memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); - rp->mp_lower += sizeof(indx_t); - rp->mp_upper -= ksize - sizeof(indx_t); - mc->mc_ki[mc->mc_top] = x; - mc->mc_pg[mc->mc_top] = rp; - } - } else { - int psize, nsize, k; - /* Maximum free space in an empty page */ - pmax = env->me_psize - PAGEHDRSZ; - if (IS_LEAF(mp)) - nsize = mdb_leaf_size(env, newkey, newdata); - else - nsize = mdb_branch_size(env, newkey); - nsize = EVEN(nsize); - - /* grab a page to hold a temporary copy */ - copy = mdb_page_malloc(mc->mc_txn, 1); - if (copy == NULL) { - rc = ENOMEM; - goto done; - } - copy->mp_pgno = mp->mp_pgno; - copy->mp_flags = mp->mp_flags; - copy->mp_lower = (PAGEHDRSZ-PAGEBASE); - copy->mp_upper = env->me_psize - PAGEBASE; - - /* prepare to insert */ - for (i=0, j=0; imp_ptrs[j++] = 0; - } - copy->mp_ptrs[j++] = mp->mp_ptrs[i]; - } - - /* When items are relatively large the split point needs - * to be checked, because being off-by-one will make the - * difference between success or failure in mdb_node_add. - * - * It's also relevant if a page happens to be laid out - * such that one half of its nodes are all "small" and - * the other half of its nodes are "large." If the new - * item is also "large" and falls on the half with - * "large" nodes, it also may not fit. - * - * As a final tweak, if the new item goes on the last - * spot on the page (and thus, onto the new page), bias - * the split so the new page is emptier than the old page. - * This yields better packing during sequential inserts. - */ - if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { - /* Find split point */ - psize = 0; - if (newindx <= split_indx || newindx >= nkeys) { - i = 0; j = 1; - k = newindx >= nkeys ? 
nkeys : split_indx+2; - } else { - i = nkeys; j = -1; - k = split_indx-1; - } - for (; i!=k; i+=j) { - if (i == newindx) { - psize += nsize; - node = NULL; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); - psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); - else - psize += NODEDSZ(node); - } - psize = EVEN(psize); - } - if (psize > pmax || i == k-j) { - split_indx = i + (j<0); - break; - } - } - } - if (split_indx == newindx) { - sepkey.mv_size = newkey->mv_size; - sepkey.mv_data = newkey->mv_data; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); - sepkey.mv_size = node->mn_ksize; - sepkey.mv_data = NODEKEY(node); - } - } - } - - DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); - - /* Copy separator key to the parent. - */ - if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { - mn.mc_snum--; - mn.mc_top--; - did_split = 1; - rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); - if (rc) - goto done; - - /* root split? */ - if (mn.mc_snum == mc->mc_snum) { - mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; - mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; - mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; - mc->mc_snum++; - mc->mc_top++; - ptop++; - } - /* Right page might now have changed parent. - * Check if left page also changed parent. - */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; imc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - mc->mc_pg[ptop] = mn.mc_pg[ptop]; - if (mn.mc_ki[ptop]) { - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; - } else { - /* find right page's left sibling */ - mc->mc_ki[ptop] = mn.mc_ki[ptop]; - mdb_cursor_sibling(mc, 0); - } - } - } else { - mn.mc_top--; - rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); - mn.mc_top++; - } - mc->mc_flags ^= C_SPLITTING; - if (rc != MDB_SUCCESS) { - goto done; - } - if (nflags & MDB_APPEND) { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[mc->mc_top] = 0; - rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); - if (rc) - goto done; - for (i=0; imc_top; i++) - mc->mc_ki[i] = mn.mc_ki[i]; - } else if (!IS_LEAF2(mp)) { - /* Move nodes */ - mc->mc_pg[mc->mc_top] = rp; - i = split_indx; - j = 0; - do { - if (i == newindx) { - rkey.mv_data = newkey->mv_data; - rkey.mv_size = newkey->mv_size; - if (IS_LEAF(mp)) { - rdata = newdata; - } else - pgno = newpgno; - flags = nflags; - /* Update index for the new key. */ - mc->mc_ki[mc->mc_top] = j; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); - rkey.mv_data = NODEKEY(node); - rkey.mv_size = node->mn_ksize; - if (IS_LEAF(mp)) { - xdata.mv_data = NODEDATA(node); - xdata.mv_size = NODEDSZ(node); - rdata = &xdata; - } else - pgno = NODEPGNO(node); - flags = node->mn_flags; - } - - if (!IS_LEAF(mp) && j == 0) { - /* First branch index doesn't need key data. 
*/ - rkey.mv_size = 0; - } - - rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); - if (rc) - goto done; - if (i == nkeys) { - i = 0; - j = 0; - mc->mc_pg[mc->mc_top] = copy; - } else { - i++; - j++; - } - } while (i != split_indx); - - nkeys = NUMKEYS(copy); - for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; - mp->mp_lower = copy->mp_lower; - mp->mp_upper = copy->mp_upper; - memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), - env->me_psize - copy->mp_upper - PAGEBASE); - - /* reset back to original page */ - if (newindx < split_indx) { - mc->mc_pg[mc->mc_top] = mp; - if (nflags & MDB_RESERVE) { - node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } - } else { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. - */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; i<=ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - } - } - } - - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - int fixup = NUMKEYS(mp); - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc) - continue; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_flags & C_SPLITTING) - continue; - if (new_root) { - int k; - /* root split */ - for (k=m3->mc_top; k>=0; k--) { - m3->mc_ki[k+1] = m3->mc_ki[k]; - m3->mc_pg[k+1] = m3->mc_pg[k]; - } - if (m3->mc_ki[0] >= split_indx) { - m3->mc_ki[0] = 1; - } else { - m3->mc_ki[0] = 0; - } - m3->mc_pg[0] = mc->mc_pg[0]; - m3->mc_snum++; - m3->mc_top++; - } - if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) - m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= fixup) { - m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= fixup; - m3->mc_ki[ptop] = mn.mc_ki[ptop]; - } - } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && - m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { - m3->mc_ki[ptop]++; - } - } - } - DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); - -done: - if (copy) /* tmp page */ - mdb_page_free(env, copy); - if (rc) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_put(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, unsigned int flags) -{ - MDB_cursor mc; - MDB_xcursor mx; - - if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) - return EINVAL; - - mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_put(&mc, key, data, flags); -} - -#ifndef MDB_WBUF -#define MDB_WBUF (1024*1024) -#endif - - /** State needed for a compacting copy. */ -typedef struct mdb_copy { - pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; - char *mc_wbuf[2]; - char *mc_over[2]; - MDB_env *mc_env; - MDB_txn *mc_txn; - int mc_wlen[2]; - int mc_olen[2]; - pgno_t mc_next_pgno; - HANDLE mc_fd; - int mc_status; - volatile int mc_new; - int mc_toggle; - -} mdb_copy; - - /** Dedicated writer thread for compacting copy. 
*/ -static THREAD_RET ESECT -mdb_env_copythr(void *arg) -{ - mdb_copy *my = arg; - char *ptr; - int toggle = 0, wsize, rc; -#ifdef _WIN32 - DWORD len; -#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) -#else - int len; -#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) -#endif - - pthread_mutex_lock(&my->mc_mutex); - my->mc_new = 0; - pthread_cond_signal(&my->mc_cond); - for(;;) { - while (!my->mc_new) - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - if (my->mc_new < 0) { - my->mc_new = 0; - break; - } - my->mc_new = 0; - wsize = my->mc_wlen[toggle]; - ptr = my->mc_wbuf[toggle]; -again: - while (wsize > 0) { - DO_WRITE(rc, my->mc_fd, ptr, wsize, len); - if (!rc) { - rc = ErrCode(); - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - if (rc) { - my->mc_status = rc; - break; - } - /* If there's an overflow page tail, write it too */ - if (my->mc_olen[toggle]) { - wsize = my->mc_olen[toggle]; - ptr = my->mc_over[toggle]; - my->mc_olen[toggle] = 0; - goto again; - } - my->mc_wlen[toggle] = 0; - toggle ^= 1; - pthread_cond_signal(&my->mc_cond); - } - pthread_cond_signal(&my->mc_cond); - pthread_mutex_unlock(&my->mc_mutex); - return (THREAD_RET)0; -#undef DO_WRITE -} - - /** Tell the writer thread there's a buffer ready to write */ -static int ESECT -mdb_env_cthr_toggle(mdb_copy *my, int st) -{ - int toggle = my->mc_toggle ^ 1; - pthread_mutex_lock(&my->mc_mutex); - if (my->mc_status) { - pthread_mutex_unlock(&my->mc_mutex); - return my->mc_status; - } - while (my->mc_new == 1) - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - my->mc_new = st; - my->mc_toggle = toggle; - pthread_cond_signal(&my->mc_cond); - pthread_mutex_unlock(&my->mc_mutex); - return 0; -} - - /** Depth-first tree traversal for compacting copy. */ -static int ESECT -mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) -{ - MDB_cursor mc; - MDB_txn *txn = my->mc_txn; - MDB_node *ni; - MDB_page *mo, *mp, *leaf; - char *buf, *ptr; - int rc, toggle; - unsigned int i; - - /* Empty DB, nothing to do */ - if (*pg == P_INVALID) - return MDB_SUCCESS; - - mc.mc_snum = 1; - mc.mc_top = 0; - mc.mc_txn = txn; - - rc = mdb_page_get(my->mc_txn, *pg, &mc.mc_pg[0], NULL); - if (rc) - return rc; - rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); - if (rc) - return rc; - - /* Make cursor pages writable */ - buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); - if (buf == NULL) - return ENOMEM; - - for (i=0; imc_env->me_psize); - mc.mc_pg[i] = (MDB_page *)ptr; - ptr += my->mc_env->me_psize; - } - - /* This is writable space for a leaf page. Usually not needed. 
*/ - leaf = (MDB_page *)ptr; - - toggle = my->mc_toggle; - while (mc.mc_snum > 0) { - unsigned n; - mp = mc.mc_pg[mc.mc_top]; - n = NUMKEYS(mp); - - if (IS_LEAF(mp)) { - if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { - for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t pg; - - /* Need writable leaf */ - if (mp != leaf) { - mc.mc_pg[mc.mc_top] = leaf; - mdb_page_copy(leaf, mp, my->mc_env->me_psize); - mp = leaf; - ni = NODEPTR(mp, i); - } - - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(txn, pg, &omp, NULL); - if (rc) - goto done; - if (my->mc_wlen[toggle] >= MDB_WBUF) { - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - memcpy(mo, omp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno; - my->mc_next_pgno += omp->mp_pages; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (omp->mp_pages > 1) { - my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); - my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t)); - } else if (ni->mn_flags & F_SUBDATA) { - MDB_db db; - - /* Need writable leaf */ - if (mp != leaf) { - mc.mc_pg[mc.mc_top] = leaf; - mdb_page_copy(leaf, mp, my->mc_env->me_psize); - mp = leaf; - ni = NODEPTR(mp, i); - } - - memcpy(&db, NODEDATA(ni), sizeof(db)); - my->mc_toggle = toggle; - rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); - if (rc) - goto done; - toggle = my->mc_toggle; - memcpy(NODEDATA(ni), &db, sizeof(db)); - } - } - } - } else { - mc.mc_ki[mc.mc_top]++; - if (mc.mc_ki[mc.mc_top] < n) { - pgno_t pg; -again: - ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); - pg = NODEPGNO(ni); - rc = mdb_page_get(txn, pg, &mp, NULL); - if (rc) - goto done; - mc.mc_top++; - mc.mc_snum++; - mc.mc_ki[mc.mc_top] = 0; - if (IS_BRANCH(mp)) { - /* Whenever we advance to a sibling branch page, - * we must proceed all the way down to its first leaf. - */ - mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); - goto again; - } else - mc.mc_pg[mc.mc_top] = mp; - continue; - } - } - if (my->mc_wlen[toggle] >= MDB_WBUF) { - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - mdb_page_copy(mo, mp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno++; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (mc.mc_top) { - /* Update parent if there is one */ - ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); - SETPGNO(ni, mo->mp_pgno); - mdb_cursor_pop(&mc); - } else { - /* Otherwise we're done */ - *pg = mo->mp_pgno; - break; - } - } -done: - free(buf); - return rc; -} - - /** Copy environment with compaction. 
*/ -static int ESECT -mdb_env_copyfd1(MDB_env *env, HANDLE fd) -{ - MDB_meta *mm; - MDB_page *mp; - mdb_copy my; - MDB_txn *txn = NULL; - pthread_t thr; - int rc; - -#ifdef _WIN32 - my.mc_mutex = CreateMutex(NULL, FALSE, NULL); - my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL); - my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); - if (my.mc_wbuf[0] == NULL) - return errno; -#else - pthread_mutex_init(&my.mc_mutex, NULL); - pthread_cond_init(&my.mc_cond, NULL); -#ifdef HAVE_MEMALIGN - my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); - if (my.mc_wbuf[0] == NULL) - return errno; -#else - rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2); - if (rc) - return rc; -#endif -#endif - memset(my.mc_wbuf[0], 0, MDB_WBUF*2); - my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; - my.mc_wlen[0] = 0; - my.mc_wlen[1] = 0; - my.mc_olen[0] = 0; - my.mc_olen[1] = 0; - my.mc_next_pgno = 2; - my.mc_status = 0; - my.mc_new = 1; - my.mc_toggle = 0; - my.mc_env = env; - my.mc_fd = fd; - THREAD_CREATE(thr, mdb_env_copythr, &my); - - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) - return rc; - - mp = (MDB_page *)my.mc_wbuf[0]; - memset(mp, 0, 2*env->me_psize); - mp->mp_pgno = 0; - mp->mp_flags = P_META; - mm = (MDB_meta *)METADATA(mp); - mdb_env_init_meta0(env, mm); - mm->mm_address = env->me_metas[0]->mm_address; - - mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); - mp->mp_pgno = 1; - mp->mp_flags = P_META; - *(MDB_meta *)METADATA(mp) = *mm; - mm = (MDB_meta *)METADATA(mp); - - /* Count the number of free pages, subtract from lastpg to find - * number of active pages - */ - { - MDB_ID freecount = 0; - MDB_cursor mc; - MDB_val key, data; - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(MDB_ID *)data.mv_data; - freecount += txn->mt_dbs[0].md_branch_pages + - txn->mt_dbs[0].md_leaf_pages + - txn->mt_dbs[0].md_overflow_pages; - - /* Set metapage 1 */ - mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; - mm->mm_dbs[1] = txn->mt_dbs[1]; - mm->mm_dbs[1].md_root = mm->mm_last_pg; - mm->mm_txnid = 1; - } - my.mc_wlen[0] = env->me_psize * 2; - my.mc_txn = txn; - pthread_mutex_lock(&my.mc_mutex); - while(my.mc_new) - pthread_cond_wait(&my.mc_cond, &my.mc_mutex); - pthread_mutex_unlock(&my.mc_mutex); - rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0); - if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) - rc = mdb_env_cthr_toggle(&my, 1); - mdb_env_cthr_toggle(&my, -1); - pthread_mutex_lock(&my.mc_mutex); - while(my.mc_new) - pthread_cond_wait(&my.mc_cond, &my.mc_mutex); - pthread_mutex_unlock(&my.mc_mutex); - THREAD_FINISH(thr); - - mdb_txn_abort(txn); -#ifdef _WIN32 - CloseHandle(my.mc_cond); - CloseHandle(my.mc_mutex); - _aligned_free(my.mc_wbuf[0]); -#else - pthread_cond_destroy(&my.mc_cond); - pthread_mutex_destroy(&my.mc_mutex); - free(my.mc_wbuf[0]); -#endif - return rc; -} - - /** Copy environment as-is. */ -static int ESECT -mdb_env_copyfd0(MDB_env *env, HANDLE fd) -{ - MDB_txn *txn = NULL; - int rc; - size_t wsize; - char *ptr; -#ifdef _WIN32 - DWORD len, w2; -#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) -#else - ssize_t len; - size_t w2; -#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) -#endif - - /* Do the lock/unlock of the reader mutex before starting the - * write txn. Otherwise other read txns could block writers. 
- */ - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) - return rc; - - if (env->me_txns) { - /* We must start the actual read txn after blocking writers */ - mdb_txn_reset0(txn, "reset-stage1"); - - /* Temporarily block writers until we snapshot the meta pages */ - LOCK_MUTEX_W(env); - - rc = mdb_txn_renew0(txn); - if (rc) { - UNLOCK_MUTEX_W(env); - goto leave; - } - } - - wsize = env->me_psize * 2; - ptr = env->me_map; - w2 = wsize; - while (w2 > 0) { - DO_WRITE(rc, fd, ptr, w2, len); - if (!rc) { - rc = ErrCode(); - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - w2 -= len; - continue; - } else { - /* Non-blocking or async handles are not supported */ - rc = EIO; - break; - } - } - if (env->me_txns) - UNLOCK_MUTEX_W(env); - - if (rc) - goto leave; - - w2 = txn->mt_next_pgno * env->me_psize; -#ifdef WIN32 - { - LARGE_INTEGER fsize; - GetFileSizeEx(env->me_fd, &fsize); - if (w2 > fsize.QuadPart) - w2 = fsize.QuadPart; - } -#else - { - struct stat st; - fstat(env->me_fd, &st); - if (w2 > (size_t)st.st_size) - w2 = st.st_size; - } -#endif - wsize = w2 - wsize; - while (wsize > 0) { - if (wsize > MAX_WRITE) - w2 = MAX_WRITE; - else - w2 = wsize; - DO_WRITE(rc, fd, ptr, w2, len); - if (!rc) { - rc = ErrCode(); - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - -leave: - mdb_txn_abort(txn); - return rc; -} - -int ESECT -mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) -{ - if (flags & MDB_CP_COMPACT) - return mdb_env_copyfd1(env, fd); - else - return mdb_env_copyfd0(env, fd); -} - -int ESECT -mdb_env_copyfd(MDB_env *env, HANDLE fd) -{ - return mdb_env_copyfd2(env, fd, 0); -} - -int ESECT -mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) -{ - int rc, len; - char *lpath; - HANDLE newfd = INVALID_HANDLE_VALUE; - - if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; - } else { - len = strlen(path); - len += sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); - } - - /* The destination path must exist, but the destination file must not. - * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. 
- */ -#ifdef _WIN32 - newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, - FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); -#else - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); -#endif - if (newfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } - - if (env->me_psize >= env->me_os_psize) { -#ifdef O_DIRECT - /* Set O_DIRECT if the file system supports it */ - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); -#endif -#ifdef F_NOCACHE /* __APPLE__ */ - rc = fcntl(newfd, F_NOCACHE, 1); - if (rc) { - rc = ErrCode(); - goto leave; - } -#endif - } - - rc = mdb_env_copyfd2(env, newfd, flags); - -leave: - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); - if (newfd != INVALID_HANDLE_VALUE) - if (close(newfd) < 0 && rc == MDB_SUCCESS) - rc = ErrCode(); - - return rc; -} - -int ESECT -mdb_env_copy(MDB_env *env, const char *path) -{ - return mdb_env_copy2(env, path, 0); -} - -int ESECT -mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) -{ - if ((flag & CHANGEABLE) != flag) - return EINVAL; - if (onoff) - env->me_flags |= flag; - else - env->me_flags &= ~flag; - return MDB_SUCCESS; -} - -int ESECT -mdb_env_get_flags(MDB_env *env, unsigned int *arg) -{ - if (!env || !arg) - return EINVAL; - - *arg = env->me_flags; - return MDB_SUCCESS; -} - -int ESECT -mdb_env_set_userctx(MDB_env *env, void *ctx) -{ - if (!env) - return EINVAL; - env->me_userctx = ctx; - return MDB_SUCCESS; -} - -void * ESECT -mdb_env_get_userctx(MDB_env *env) -{ - return env ? env->me_userctx : NULL; -} - -int ESECT -mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) -{ - if (!env) - return EINVAL; -#ifndef NDEBUG - env->me_assert_func = func; -#endif - return MDB_SUCCESS; -} - -int ESECT -mdb_env_get_path(MDB_env *env, const char **arg) -{ - if (!env || !arg) - return EINVAL; - - *arg = env->me_path; - return MDB_SUCCESS; -} - -int ESECT -mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) -{ - if (!env || !arg) - return EINVAL; - - *arg = env->me_fd; - return MDB_SUCCESS; -} - -/** Common code for #mdb_stat() and #mdb_env_stat(). - * @param[in] env the environment to operate in. - * @param[in] db the #MDB_db record containing the stats to return. - * @param[out] arg the address of an #MDB_stat structure to receive the stats. - * @return 0, this function always succeeds. - */ -static int ESECT -mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) -{ - arg->ms_psize = env->me_psize; - arg->ms_depth = db->md_depth; - arg->ms_branch_pages = db->md_branch_pages; - arg->ms_leaf_pages = db->md_leaf_pages; - arg->ms_overflow_pages = db->md_overflow_pages; - arg->ms_entries = db->md_entries; - - return MDB_SUCCESS; -} - -int ESECT -mdb_env_stat(MDB_env *env, MDB_stat *arg) -{ - int toggle; - - if (env == NULL || arg == NULL) - return EINVAL; - - toggle = mdb_env_pick_meta(env); - - return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg); -} - -int ESECT -mdb_env_info(MDB_env *env, MDB_envinfo *arg) -{ - int toggle; - - if (env == NULL || arg == NULL) - return EINVAL; - - toggle = mdb_env_pick_meta(env); - arg->me_mapaddr = env->me_metas[toggle]->mm_address; - arg->me_mapsize = env->me_mapsize; - arg->me_maxreaders = env->me_maxreaders; - - /* me_numreaders may be zero if this process never used any readers. Use - * the shared numreader count if it exists. - */ - arg->me_numreaders = env->me_txns ? 
env->me_txns->mti_numreaders : env->me_numreaders; - - arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; - arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; - return MDB_SUCCESS; -} - -/** Set the default comparison functions for a database. - * Called immediately after a database is opened to set the defaults. - * The user can then override them with #mdb_set_compare() or - * #mdb_set_dupsort(). - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - */ -static void -mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) -{ - uint16_t f = txn->mt_dbs[dbi].md_flags; - - txn->mt_dbxs[dbi].md_cmp = - (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : - (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn; - - txn->mt_dbxs[dbi].md_dcmp = - !(f & MDB_DUPSORT) ? 0 : - ((f & MDB_INTEGERDUP) - ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) - : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); -} - -int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) -{ - MDB_val key, data; - MDB_dbi i; - MDB_cursor mc; - MDB_db dummy; - int rc, dbflag, exact; - unsigned int unused = 0, seq; - size_t len; - - if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { - mdb_default_cmp(txn, FREE_DBI); - } - - if ((flags & VALID_FLAGS) != flags) - return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; - - /* main DB? */ - if (!name) { - *dbi = MAIN_DBI; - if (flags & PERSISTENT_FLAGS) { - uint16_t f2 = flags & PERSISTENT_FLAGS; - /* make sure flag changes get committed */ - if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { - txn->mt_dbs[MAIN_DBI].md_flags |= f2; - txn->mt_flags |= MDB_TXN_DIRTY; - } - } - mdb_default_cmp(txn, MAIN_DBI); - return MDB_SUCCESS; - } - - if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { - mdb_default_cmp(txn, MAIN_DBI); - } - - /* Is the DB already open? */ - len = strlen(name); - for (i=2; imt_numdbs; i++) { - if (!txn->mt_dbxs[i].md_name.mv_size) { - /* Remember this free slot */ - if (!unused) unused = i; - continue; - } - if (len == txn->mt_dbxs[i].md_name.mv_size && - !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { - *dbi = i; - return MDB_SUCCESS; - } - } - - /* If no free slot and max hit, fail */ - if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) - return MDB_DBS_FULL; - - /* Cannot mix named databases with some mainDB flags */ - if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) - return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; - - /* Find the DB info */ - dbflag = DB_NEW|DB_VALID; - exact = 0; - key.mv_size = len; - key.mv_data = (void *)name; - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); - if (rc == MDB_SUCCESS) { - /* make sure this is actually a DB */ - MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (!(node->mn_flags & F_SUBDATA)) - return MDB_INCOMPATIBLE; - } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { - /* Create if requested */ - data.mv_size = sizeof(MDB_db); - data.mv_data = &dummy; - memset(&dummy, 0, sizeof(dummy)); - dummy.md_root = P_INVALID; - dummy.md_flags = flags & PERSISTENT_FLAGS; - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); - dbflag |= DB_DIRTY; - } - - /* OK, got info, add to table */ - if (rc == MDB_SUCCESS) { - unsigned int slot = unused ? 
unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = strdup(name); - txn->mt_dbxs[slot].md_name.mv_size = len; - txn->mt_dbxs[slot].md_rel = NULL; - txn->mt_dbflags[slot] = dbflag; - /* txn-> and env-> are the same in read txns, use - * tmp variable to avoid undefined assignment - */ - seq = ++txn->mt_env->me_dbiseqs[slot]; - txn->mt_dbiseqs[slot] = seq; - - memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); - *dbi = slot; - mdb_default_cmp(txn, slot); - if (!unused) { - txn->mt_numdbs++; - } - } - - return rc; -} - -int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) -{ - if (!arg || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; - - if (txn->mt_dbflags[dbi] & DB_STALE) { - MDB_cursor mc; - MDB_xcursor mx; - /* Stale, must read the DB's root. cursor_init does it for us. */ - mdb_cursor_init(&mc, txn, dbi, &mx); - } - return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); -} - -void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) -{ - char *ptr; - if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) - return; - ptr = env->me_dbxs[dbi].md_name.mv_data; - /* If there was no name, this was already closed */ - if (ptr) { - env->me_dbxs[dbi].md_name.mv_data = NULL; - env->me_dbxs[dbi].md_name.mv_size = 0; - env->me_dbflags[dbi] = 0; - env->me_dbiseqs[dbi]++; - free(ptr); - } -} - -int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) -{ - /* We could return the flags for the FREE_DBI too but what's the point? */ - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; - return MDB_SUCCESS; -} - -/** Add all the DB's pages to the free list. - * @param[in] mc Cursor on the DB to free. - * @param[in] subs non-Zero to check for sub-DBs in this DB. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_drop0(MDB_cursor *mc, int subs) -{ - int rc; - - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (rc == MDB_SUCCESS) { - MDB_txn *txn = mc->mc_txn; - MDB_node *ni; - MDB_cursor mx; - unsigned int i; - - /* LEAF2 pages have no nodes, cannot have sub-DBs */ - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) - mdb_cursor_pop(mc); - - mdb_cursor_copy(mc, &mx); - while (mc->mc_snum > 0) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - unsigned n = NUMKEYS(mp); - if (IS_LEAF(mp)) { - for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t pg; - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(txn, pg, &omp, NULL); - if (rc != 0) - goto done; - mdb_cassert(mc, IS_OVERFLOW(omp)); - rc = mdb_midl_append_range(&txn->mt_free_pgs, - pg, omp->mp_pages); - if (rc) - goto done; - } else if (subs && (ni->mn_flags & F_SUBDATA)) { - mdb_xcursor_init1(mc, ni); - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (rc) - goto done; - } - } - } else { - if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) - goto done; - for (i=0; imt_free_pgs, pg); - } - } - if (!mc->mc_top) - break; - mc->mc_ki[mc->mc_top] = i; - rc = mdb_cursor_sibling(mc, 1); - if (rc) { - if (rc != MDB_NOTFOUND) - goto done; - /* no more siblings, go back to beginning - * of previous level. 
- */ - mdb_cursor_pop(mc); - mc->mc_ki[0] = 0; - for (i=1; imc_snum; i++) { - mc->mc_ki[i] = 0; - mc->mc_pg[i] = mx.mc_pg[i]; - } - } - } - /* free it */ - rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); -done: - if (rc) - txn->mt_flags |= MDB_TXN_ERROR; - } else if (rc == MDB_NOTFOUND) { - rc = MDB_SUCCESS; - } - return rc; -} - -int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) -{ - MDB_cursor *mc, *m2; - int rc; - - if ((unsigned)del > 1 || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - return EACCES; - - if (dbi > MAIN_DBI && TXN_DBI_CHANGED(txn, dbi)) - return MDB_BAD_DBI; - - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) - return rc; - - rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); - /* Invalidate the dropped DB's cursors */ - for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) - m2->mc_flags &= ~(C_INITIALIZED|C_EOF); - if (rc) - goto leave; - - /* Can't delete the main DB */ - if (del && dbi > MAIN_DBI) { - rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, 0); - if (!rc) { - txn->mt_dbflags[dbi] = DB_STALE; - mdb_dbi_close(txn->mt_env, dbi); - } else { - txn->mt_flags |= MDB_TXN_ERROR; - } - } else { - /* reset the DB record, mark it dirty */ - txn->mt_dbflags[dbi] |= DB_DIRTY; - txn->mt_dbs[dbi].md_depth = 0; - txn->mt_dbs[dbi].md_branch_pages = 0; - txn->mt_dbs[dbi].md_leaf_pages = 0; - txn->mt_dbs[dbi].md_overflow_pages = 0; - txn->mt_dbs[dbi].md_entries = 0; - txn->mt_dbs[dbi].md_root = P_INVALID; - - txn->mt_flags |= MDB_TXN_DIRTY; - } -leave: - mdb_cursor_close(mc); - return rc; -} - -int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) -{ - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - txn->mt_dbxs[dbi].md_cmp = cmp; - return MDB_SUCCESS; -} - -int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) -{ - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - txn->mt_dbxs[dbi].md_dcmp = cmp; - return MDB_SUCCESS; -} - -int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) -{ - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - txn->mt_dbxs[dbi].md_rel = rel; - return MDB_SUCCESS; -} - -int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) -{ - if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) - return EINVAL; - - txn->mt_dbxs[dbi].md_relctx = ctx; - return MDB_SUCCESS; -} - -int ESECT -mdb_env_get_maxkeysize(MDB_env *env) -{ - return ENV_MAXKEY(env); -} - -int ESECT -mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) -{ - unsigned int i, rdrs; - MDB_reader *mr; - char buf[64]; - int rc = 0, first = 1; - - if (!env || !func) - return -1; - if (!env->me_txns) { - return func("(no reader locks)\n", ctx); - } - rdrs = env->me_txns->mti_numreaders; - mr = env->me_txns->mti_readers; - for (i=0; i> 1; - cursor = base + pivot + 1; - val = pid - ids[cursor]; - - if( val < 0 ) { - n = pivot; - - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - - } else { - /* found, so it's a duplicate */ - return -1; - } - } - - if( val > 0 ) { - ++cursor; - } - ids[0]++; - for (n = ids[0]; n > cursor; n--) - ids[n] = ids[n-1]; - ids[n] = pid; - return 0; -} - -int ESECT -mdb_reader_check(MDB_env *env, int *dead) -{ - unsigned int i, j, rdrs; - MDB_reader *mr; - MDB_PID_T *pids, pid; - int count = 0; - - if (!env) - return EINVAL; - if (dead) - *dead = 0; - if (!env->me_txns) - return MDB_SUCCESS; - rdrs = env->me_txns->mti_numreaders; - pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); - if 
(!pids) - return ENOMEM; - pids[0] = 0; - mr = env->me_txns->mti_readers; - for (i=0; ime_pid) { - pid = mr[i].mr_pid; - if (mdb_pid_insert(pids, pid) == 0) { - if (!mdb_reader_pid(env, Pidcheck, pid)) { - LOCK_MUTEX_R(env); - /* Recheck, a new process may have reused pid */ - if (!mdb_reader_pid(env, Pidcheck, pid)) { - for (j=i; j. - * - * Copyright 2000-2014 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#include -#include -#include -#include -#include -#include "midl.h" - -/** @defgroup internal LMDB Internals - * @{ - */ -/** @defgroup idls ID List Management - * @{ - */ -#define CMP(x,y) ( (x) < (y) ? -1 : (x) > (y) ) - -unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) -{ - /* - * binary search of id in ids - * if found, returns position of id - * if not found, returns first position greater than id - */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = ids[0]; - - while( 0 < n ) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = CMP( ids[cursor], id ); - - if( val < 0 ) { - n = pivot; - - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - - } else { - return cursor; - } - } - - if( val > 0 ) { - ++cursor; - } - return cursor; -} - -#if 0 /* superseded by append/sort */ -int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) -{ - unsigned x, i; - - x = mdb_midl_search( ids, id ); - assert( x > 0 ); - - if( x < 1 ) { - /* internal error */ - return -2; - } - - if ( x <= ids[0] && ids[x] == id ) { - /* duplicate */ - assert(0); - return -1; - } - - if ( ++ids[0] >= MDB_IDL_DB_MAX ) { - /* no room */ - --ids[0]; - return -2; - - } else { - /* insert id */ - for (i=ids[0]; i>x; i--) - ids[i] = ids[i-1]; - ids[x] = id; - } - - return 0; -} -#endif - -MDB_IDL mdb_midl_alloc(int num) -{ - MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID)); - if (ids) { - *ids++ = num; - *ids = 0; - } - return ids; -} - -void mdb_midl_free(MDB_IDL ids) -{ - if (ids) - free(ids-1); -} - -int mdb_midl_shrink( MDB_IDL *idp ) -{ - MDB_IDL ids = *idp; - if (*(--ids) > MDB_IDL_UM_MAX && - (ids = realloc(ids, (MDB_IDL_UM_MAX+1) * sizeof(MDB_ID)))) - { - *ids++ = MDB_IDL_UM_MAX; - *idp = ids; - return 1; - } - return 0; -} - -static int mdb_midl_grow( MDB_IDL *idp, int num ) -{ - MDB_IDL idn = *idp-1; - /* grow it */ - idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); - if (!idn) - return ENOMEM; - *idn++ += num; - *idp = idn; - return 0; -} - -int mdb_midl_need( MDB_IDL *idp, unsigned num ) -{ - MDB_IDL ids = *idp; - num += ids[0]; - if (num > ids[-1]) { - num = (num + num/4 + (256 + 2)) & -256; - if (!(ids = realloc(ids-1, num * sizeof(MDB_ID)))) - return ENOMEM; - *ids++ = num - 2; - *idp = ids; - } - return 0; -} - -int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) -{ - MDB_IDL ids = *idp; - /* Too big? */ - if (ids[0] >= ids[-1]) { - if (mdb_midl_grow(idp, MDB_IDL_UM_MAX)) - return ENOMEM; - ids = *idp; - } - ids[0]++; - ids[ids[0]] = id; - return 0; -} - -int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ) -{ - MDB_IDL ids = *idp; - /* Too big? 
*/ - if (ids[0] + app[0] >= ids[-1]) { - if (mdb_midl_grow(idp, app[0])) - return ENOMEM; - ids = *idp; - } - memcpy(&ids[ids[0]+1], &app[1], app[0] * sizeof(MDB_ID)); - ids[0] += app[0]; - return 0; -} - -int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ) -{ - MDB_ID *ids = *idp, len = ids[0]; - /* Too big? */ - if (len + n > ids[-1]) { - if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX)) - return ENOMEM; - ids = *idp; - } - ids[0] = len + n; - ids += len; - while (n) - ids[n--] = id++; - return 0; -} - -void mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge ) -{ - MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i+j, total = k; - idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ - old_id = idl[j]; - while (i) { - merge_id = merge[i--]; - for (; old_id < merge_id; old_id = idl[--j]) - idl[k--] = old_id; - idl[k--] = merge_id; - } - idl[0] = total; -} - -/* Quicksort + Insertion sort for small arrays */ - -#define SMALL 8 -#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } - -void -mdb_midl_sort( MDB_IDL ids ) -{ - /* Max possible depth of int-indexed tree * 2 items/level */ - int istack[sizeof(int)*CHAR_BIT * 2]; - int i,j,k,l,ir,jstack; - MDB_ID a, itmp; - - ir = (int)ids[0]; - l = 1; - jstack = 0; - for(;;) { - if (ir - l < SMALL) { /* Insertion sort */ - for (j=l+1;j<=ir;j++) { - a = ids[j]; - for (i=j-1;i>=1;i--) { - if (ids[i] >= a) break; - ids[i+1] = ids[i]; - } - ids[i+1] = a; - } - if (jstack == 0) break; - ir = istack[jstack--]; - l = istack[jstack--]; - } else { - k = (l + ir) >> 1; /* Choose median of left, center, right */ - MIDL_SWAP(ids[k], ids[l+1]); - if (ids[l] < ids[ir]) { - MIDL_SWAP(ids[l], ids[ir]); - } - if (ids[l+1] < ids[ir]) { - MIDL_SWAP(ids[l+1], ids[ir]); - } - if (ids[l] < ids[l+1]) { - MIDL_SWAP(ids[l], ids[l+1]); - } - i = l+1; - j = ir; - a = ids[l+1]; - for(;;) { - do i++; while(ids[i] > a); - do j--; while(ids[j] < a); - if (j < i) break; - MIDL_SWAP(ids[i],ids[j]); - } - ids[l+1] = ids[j]; - ids[j] = a; - jstack += 2; - if (ir-i+1 >= j-l) { - istack[jstack] = ir; - istack[jstack-1] = i; - ir = j-1; - } else { - istack[jstack] = j-1; - istack[jstack-1] = l; - l = i; - } - } - } -} - -unsigned mdb_mid2l_search( MDB_ID2L ids, MDB_ID id ) -{ - /* - * binary search of id in ids - * if found, returns position of id - * if not found, returns first position greater than id - */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = (unsigned)ids[0].mid; - - while( 0 < n ) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = CMP( id, ids[cursor].mid ); - - if( val < 0 ) { - n = pivot; - - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - - } else { - return cursor; - } - } - - if( val > 0 ) { - ++cursor; - } - return cursor; -} - -int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ) -{ - unsigned x, i; - - x = mdb_mid2l_search( ids, id->mid ); - - if( x < 1 ) { - /* internal error */ - return -2; - } - - if ( x <= ids[0].mid && ids[x].mid == id->mid ) { - /* duplicate */ - return -1; - } - - if ( ids[0].mid >= MDB_IDL_UM_MAX ) { - /* too big */ - return -2; - - } else { - /* insert id */ - ids[0].mid++; - for (i=(unsigned)ids[0].mid; i>x; i--) - ids[i] = ids[i-1]; - ids[x] = *id; - } - - return 0; -} - -int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) -{ - /* Too big? 
*/ - if (ids[0].mid >= MDB_IDL_UM_MAX) { - return -2; - } - ids[0].mid++; - ids[ids[0].mid] = *id; - return 0; -} - -/** @} */ -/** @} */ diff --git a/vendor/github.com/szferi/gomdb/midl.h b/vendor/github.com/szferi/gomdb/midl.h deleted file mode 100644 index a7f25026ce53..000000000000 --- a/vendor/github.com/szferi/gomdb/midl.h +++ /dev/null @@ -1,186 +0,0 @@ -/** @file midl.h - * @brief LMDB ID List header file. - * - * This file was originally part of back-bdb but has been - * modified for use in libmdb. Most of the macros defined - * in this file are unused, just left over from the original. - * - * This file is only used internally in libmdb and its definitions - * are not exposed publicly. - */ -/* $OpenLDAP$ */ -/* This work is part of OpenLDAP Software . - * - * Copyright 2000-2014 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#ifndef _MDB_MIDL_H_ -#define _MDB_MIDL_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/** @defgroup internal LMDB Internals - * @{ - */ - -/** @defgroup idls ID List Management - * @{ - */ - /** A generic unsigned ID number. These were entryIDs in back-bdb. - * Preferably it should have the same size as a pointer. - */ -typedef size_t MDB_ID; - - /** An IDL is an ID List, a sorted array of IDs. The first - * element of the array is a counter for how many actual - * IDs are in the list. In the original back-bdb code, IDLs are - * sorted in ascending order. For libmdb IDLs are sorted in - * descending order. - */ -typedef MDB_ID *MDB_IDL; - -/* IDL sizes - likely should be even bigger - * limiting factors: sizeof(ID), thread stack size - */ -#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ -#define MDB_IDL_DB_SIZE (1< -#include -#include "lmdb.h" -*/ -import "C" - -import ( - "math" - "runtime" - "unsafe" -) - -// DBIOpen Database Flags -const ( - REVERSEKEY = C.MDB_REVERSEKEY // use reverse string keys - DUPSORT = C.MDB_DUPSORT // use sorted duplicates - INTEGERKEY = C.MDB_INTEGERKEY // numeric keys in native byte order. The keys must all be of the same size. - DUPFIXED = C.MDB_DUPFIXED // with DUPSORT, sorted dup items have fixed size - INTEGERDUP = C.MDB_INTEGERDUP // with DUPSORT, dups are numeric in native byte order - REVERSEDUP = C.MDB_REVERSEDUP // with DUPSORT, use reverse string dups - CREATE = C.MDB_CREATE // create DB if not already existing -) - -// put flags -const ( - NODUPDATA = C.MDB_NODUPDATA - NOOVERWRITE = C.MDB_NOOVERWRITE - RESERVE = C.MDB_RESERVE - APPEND = C.MDB_APPEND - APPENDDUP = C.MDB_APPENDDUP -) - -// Txn is Opaque structure for a transaction handle. -// All database operations require a transaction handle. -// Transactions may be read-only or read-write. 
-type Txn struct { - _txn *C.MDB_txn -} - -func (env *Env) BeginTxn(parent *Txn, flags uint) (*Txn, error) { - var _txn *C.MDB_txn - var ptxn *C.MDB_txn - if parent == nil { - ptxn = nil - } else { - ptxn = parent._txn - } - if flags&RDONLY == 0 { - runtime.LockOSThread() - } - ret := C.mdb_txn_begin(env._env, ptxn, C.uint(flags), &_txn) - if ret != SUCCESS { - runtime.UnlockOSThread() - return nil, errno(ret) - } - return &Txn{_txn}, nil -} - -func (txn *Txn) Commit() error { - ret := C.mdb_txn_commit(txn._txn) - runtime.UnlockOSThread() - // The transaction handle is freed if there was no error - if ret == C.MDB_SUCCESS { - txn._txn = nil - } - return errno(ret) -} - -func (txn *Txn) Abort() { - if txn._txn == nil { - return - } - C.mdb_txn_abort(txn._txn) - runtime.UnlockOSThread() - // The transaction handle is always freed. - txn._txn = nil -} - -func (txn *Txn) Reset() { - C.mdb_txn_reset(txn._txn) -} - -func (txn *Txn) Renew() error { - ret := C.mdb_txn_renew(txn._txn) - return errno(ret) -} - -func (txn *Txn) DBIOpen(name *string, flags uint) (DBI, error) { - var _dbi C.MDB_dbi - var cname *C.char - if name == nil { - cname = nil - } else { - cname = C.CString(*name) - defer C.free(unsafe.Pointer(cname)) - } - ret := C.mdb_dbi_open(txn._txn, cname, C.uint(flags), &_dbi) - if ret != SUCCESS { - return DBI(math.NaN()), errno(ret) - } - return DBI(_dbi), nil -} - -func (txn *Txn) Stat(dbi DBI) (*Stat, error) { - var _stat C.MDB_stat - ret := C.mdb_stat(txn._txn, C.MDB_dbi(dbi), &_stat) - if ret != SUCCESS { - return nil, errno(ret) - } - stat := Stat{PSize: uint(_stat.ms_psize), - Depth: uint(_stat.ms_depth), - BranchPages: uint64(_stat.ms_branch_pages), - LeafPages: uint64(_stat.ms_leaf_pages), - OverflowPages: uint64(_stat.ms_overflow_pages), - Entries: uint64(_stat.ms_entries)} - return &stat, nil -} - -func (txn *Txn) Drop(dbi DBI, del int) error { - ret := C.mdb_drop(txn._txn, C.MDB_dbi(dbi), C.int(del)) - return errno(ret) -} - -func (txn *Txn) Get(dbi DBI, key []byte) ([]byte, error) { - val, err := txn.GetVal(dbi, key) - if err != nil { - return nil, err - } - return val.Bytes(), nil -} - -func (txn *Txn) GetVal(dbi DBI, key []byte) (Val, error) { - ckey := Wrap(key) - var cval Val - ret := C.mdb_get(txn._txn, C.MDB_dbi(dbi), (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval)) - return cval, errno(ret) -} - -func (txn *Txn) Put(dbi DBI, key []byte, val []byte, flags uint) error { - ckey := Wrap(key) - cval := Wrap(val) - ret := C.mdb_put(txn._txn, C.MDB_dbi(dbi), (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval), C.uint(flags)) - return errno(ret) -} - -func (txn *Txn) Del(dbi DBI, key, val []byte) error { - ckey := Wrap(key) - if val == nil { - ret := C.mdb_del(txn._txn, C.MDB_dbi(dbi), (*C.MDB_val)(&ckey), nil) - return errno(ret) - } - cval := Wrap(val) - ret := C.mdb_del(txn._txn, C.MDB_dbi(dbi), (*C.MDB_val)(&ckey), (*C.MDB_val)(&cval)) - return errno(ret) -} - -type Cursor struct { - _cursor *C.MDB_cursor -} - -func (txn *Txn) CursorOpen(dbi DBI) (*Cursor, error) { - var _cursor *C.MDB_cursor - ret := C.mdb_cursor_open(txn._txn, C.MDB_dbi(dbi), &_cursor) - if ret != SUCCESS { - return nil, errno(ret) - } - return &Cursor{_cursor}, nil -} - -func (txn *Txn) CursorRenew(cursor *Cursor) error { - ret := C.mdb_cursor_renew(txn._txn, cursor._cursor) - return errno(ret) -} - -/* -type CmpFunc func(a, b []byte) int - -func (txn *Txn) SetCompare(dbi DBI, cmp CmpFunc) error { - f := func(a, b *C.MDB_val) C.int { - ga := C.GoBytes(a.mv_data, C.int(a.mv_size)) - gb := C.GoBytes(a.mv_data, 
C.int(a.mv_size)) - return C.int(cmp(ga, gb)) - } - ret := C.mdb_set_compare(txn._txn, C.MDB_dbi(dbi), *unsafe.Pointer(&f)) - return errno(ret) -} -*/ -// func (txn *Txn) SetDupSort(dbi DBI, comp *C.MDB_comp_func) error -// func (txn *Txn) SetRelFunc(dbi DBI, rel *C.MDB_rel_func) error -// func (txn *Txn) SetRelCtx(dbi DBI, void *) error diff --git a/vendor/github.com/szferi/gomdb/val.go b/vendor/github.com/szferi/gomdb/val.go deleted file mode 100644 index d8bfb346c692..000000000000 --- a/vendor/github.com/szferi/gomdb/val.go +++ /dev/null @@ -1,52 +0,0 @@ -package mdb - -/* -#cgo CFLAGS: -pthread -W -Wall -Wno-unused-parameter -Wbad-function-cast -O2 -g -#cgo CFLAGS: -I/usr/local - -#include -#include -#include "lmdb.h" -*/ -import "C" - -import ( - "reflect" - "unsafe" -) - -// MDB_val -type Val C.MDB_val - -// Create a Val that points to p's data. the Val's data must not be freed -// manually and C references must not survive the garbage collection of p (and -// the returned Val). -func Wrap(p []byte) Val { - if len(p) == 0 { - return Val(C.MDB_val{}) - } - return Val(C.MDB_val{ - mv_size: C.size_t(len(p)), - mv_data: unsafe.Pointer(&p[0]), - }) -} - -// If val is nil, a empty slice is retured. -func (val Val) Bytes() []byte { - return C.GoBytes(val.mv_data, C.int(val.mv_size)) -} - -// If val is nil, a empty slice is retured. -func (val Val) BytesNoCopy() []byte { - hdr := reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(val.mv_data)), - Len: int(val.mv_size), - Cap: int(val.mv_size), - } - return *(*[]byte)(unsafe.Pointer(&hdr)) -} - -// If val is nil, an empty string is returned. -func (val Val) String() string { - return C.GoStringN((*C.char)(val.mv_data), C.int(val.mv_size)) -} From 2847b414187093e493c393fe8fa47f8e0cf33104 Mon Sep 17 00:00:00 2001 From: Matti Ranta Date: Mon, 4 Feb 2019 10:27:29 -0500 Subject: [PATCH 3/3] update docs --- docs/content/doc/advanced/config-cheat-sheet.en-us.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md index c2d22f1d838f..8254316716bf 100644 --- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md +++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md @@ -250,7 +250,7 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`. ## Session (`session`) -- `PROVIDER`: **memory**: Session engine provider \[memory, file, redis, mysql\]. +- `PROVIDER`: **memory**: Session engine provider \[memory, file, redis, mysql, couchbase, memcache, nodb, postgres\]. - `PROVIDER_CONFIG`: **data/sessions**: For file, the root path; for others, the connection string. - `COOKIE_SECURE`: **false**: Enable this to force using HTTPS for all session access. - `COOKIE_NAME`: **i\_like\_gitea**: The name of the cookie used for the session ID.
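
For reference, a minimal `app.ini` sketch of the `[session]` section using one of the newly documented providers. This is only an illustration of how the keys above combine; the `postgres` connection string is a hypothetical example rather than a documented default, so adjust it to your own database:

```ini
[session]
; one of: memory, file, redis, mysql, couchbase, memcache, nodb, postgres
PROVIDER = postgres
; for the file provider this is the root path; for the other providers it is the connection string
PROVIDER_CONFIG = user=gitea password=secret host=127.0.0.1 port=5432 dbname=gitea sslmode=disable
COOKIE_SECURE = false
COOKIE_NAME = i_like_gitea
```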