From b558811487ff9b8120f2f341e9fff279a65fdde5 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 26 Aug 2019 12:11:56 +0200 Subject: [PATCH 1/6] bump hashicorp/memberlist v0.1.4 full diff: https://github.com/hashicorp/memberlist/compare/3d8438da9589e7b608a83ffac1ef8211486bcb7c...v0.1.4 - hashicorp/memberlist#158 Limit concurrent push/pull connections - hashicorp/memberlist#159 Prioritize alive message over other messages - hashicorp/memberlist#168 Add go.mod - hashicorp/memberlist#167 Various changes to improve the cpu impact of TransmitLimitedQueue in large clusters - hashicorp/memberlist#169 added back-off to accept loop to avoid a tight loop - hashicorp/memberlist#178 Avoid to take into account wrong versions of protocols in Vsn - hashicorp/memberlist#189 Allow a dead node's name to be taken by a new node Signed-off-by: Sebastiaan van Stijn --- vendor.conf | 2 +- .../github.com/hashicorp/memberlist/README.md | 4 +- .../hashicorp/memberlist/alive_delegate.go | 6 +- .../hashicorp/memberlist/broadcast.go | 5 + .../github.com/hashicorp/memberlist/config.go | 5 + vendor/github.com/hashicorp/memberlist/go.mod | 20 + .../hashicorp/memberlist/memberlist.go | 77 +++- vendor/github.com/hashicorp/memberlist/net.go | 92 +++- .../hashicorp/memberlist/net_transport.go | 23 + .../github.com/hashicorp/memberlist/queue.go | 415 ++++++++++++++---- .../github.com/hashicorp/memberlist/state.go | 85 +++- .../github.com/hashicorp/memberlist/util.go | 5 +- 12 files changed, 587 insertions(+), 152 deletions(-) create mode 100644 vendor/github.com/hashicorp/memberlist/go.mod diff --git a/vendor.conf b/vendor.conf index 52aaac27d3..d9bea5e2fa 100644 --- a/vendor.conf +++ b/vendor.conf @@ -24,7 +24,7 @@ github.com/hashicorp/consul 9a9cc9341bb487651a0399e3fc5e github.com/hashicorp/errwrap 8a6fb523712970c966eefc6b39ed2c5e74880354 # v1.0.0 github.com/hashicorp/go-msgpack 71c2886f5a673a35f909803f38ece5810165097b github.com/hashicorp/go-multierror 886a7fbe3eb1c874d46f623bfa70af45f425b3d1 # v1.0.0 -github.com/hashicorp/memberlist 3d8438da9589e7b608a83ffac1ef8211486bcb7c +github.com/hashicorp/memberlist e1138a6a4d8a6eaec6c919aeae5efbe4d69b1ece # v0.1.4 github.com/hashicorp/golang-lru 7f827b33c0f158ec5dfbba01bb0b14a4541fd81d # v0.5.3 github.com/sean-/seed e2103e2c35297fb7e17febb81e49b312087a2372 github.com/hashicorp/go-sockaddr c7188e74f6acae5a989bdc959aa779f8b9f42faf # v1.0.2 diff --git a/vendor/github.com/hashicorp/memberlist/README.md b/vendor/github.com/hashicorp/memberlist/README.md index 0adc075e81..f47fb81aa6 100644 --- a/vendor/github.com/hashicorp/memberlist/README.md +++ b/vendor/github.com/hashicorp/memberlist/README.md @@ -1,4 +1,4 @@ -# memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist) +# memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist) [![Build Status](https://travis-ci.org/hashicorp/memberlist.svg?branch=master)](https://travis-ci.org/hashicorp/memberlist) memberlist is a [Go](http://www.golang.org) library that manages cluster membership and member failure detection using a gossip based protocol. @@ -65,7 +65,7 @@ For complete documentation, see the associated [Godoc](http://godoc.org/github.c ## Protocol -memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf). 
However, we extend the protocol in a number of ways: +memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://ieeexplore.ieee.org/document/1028914/). However, we extend the protocol in a number of ways: * Several extensions are made to increase propagation speed and convergence rate. diff --git a/vendor/github.com/hashicorp/memberlist/alive_delegate.go b/vendor/github.com/hashicorp/memberlist/alive_delegate.go index 51a0ba9054..615f4a90a5 100644 --- a/vendor/github.com/hashicorp/memberlist/alive_delegate.go +++ b/vendor/github.com/hashicorp/memberlist/alive_delegate.go @@ -7,8 +7,8 @@ package memberlist // a node out and prevent it from being considered a peer // using application specific logic. type AliveDelegate interface { - // NotifyMerge is invoked when a merge could take place. - // Provides a list of the nodes known by the peer. If - // the return value is non-nil, the merge is canceled. + // NotifyAlive is invoked when a message about a live + // node is received from the network. Returning a non-nil + // error prevents the node from being considered a peer. NotifyAlive(peer *Node) error } diff --git a/vendor/github.com/hashicorp/memberlist/broadcast.go b/vendor/github.com/hashicorp/memberlist/broadcast.go index f7e85a119c..d07d41bb69 100644 --- a/vendor/github.com/hashicorp/memberlist/broadcast.go +++ b/vendor/github.com/hashicorp/memberlist/broadcast.go @@ -29,6 +29,11 @@ func (b *memberlistBroadcast) Invalidates(other Broadcast) bool { return b.node == mb.node } +// memberlist.NamedBroadcast optional interface +func (b *memberlistBroadcast) Name() string { + return b.node +} + func (b *memberlistBroadcast) Message() []byte { return b.msg } diff --git a/vendor/github.com/hashicorp/memberlist/config.go b/vendor/github.com/hashicorp/memberlist/config.go index c85b1657a2..c9cd176443 100644 --- a/vendor/github.com/hashicorp/memberlist/config.go +++ b/vendor/github.com/hashicorp/memberlist/config.go @@ -215,6 +215,11 @@ type Config struct { // This is a legacy name for backward compatibility but should really be // called PacketBufferSize now that we have generalized the transport. UDPBufferSize int + + // DeadNodeReclaimTime controls the time before a dead node's name can be + // reclaimed by one with a different address or port. By default, this is 0, + // meaning nodes cannot be reclaimed this way. + DeadNodeReclaimTime time.Duration } // DefaultLANConfig returns a sane set of configurations for Memberlist. 
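The new DeadNodeReclaimTime knob above only takes effect when set to a non-zero duration; the default of 0 preserves the old behavior of always rejecting a conflicting address for a known name. A minimal sketch of opting in via the public API — the five-minute value is purely illustrative, not a recommendation:

```go
package main

import (
	"log"
	"time"

	"github.com/hashicorp/memberlist"
)

func main() {
	conf := memberlist.DefaultLANConfig()

	// Allow a node that has been dead for at least five minutes to be
	// replaced by a node with the same name but a different address or
	// port (see the aliveNode changes later in this patch).
	conf.DeadNodeReclaimTime = 5 * time.Minute

	list, err := memberlist.Create(conf)
	if err != nil {
		log.Fatalf("failed to create memberlist: %v", err)
	}
	defer list.Shutdown()
}
```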
diff --git a/vendor/github.com/hashicorp/memberlist/go.mod b/vendor/github.com/hashicorp/memberlist/go.mod new file mode 100644 index 0000000000..0c025ff115 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/go.mod @@ -0,0 +1,20 @@ +module github.com/hashicorp/memberlist + +require ( + github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c + github.com/hashicorp/go-immutable-radix v1.0.0 // indirect + github.com/hashicorp/go-msgpack v0.5.3 + github.com/hashicorp/go-multierror v1.0.0 + github.com/hashicorp/go-sockaddr v1.0.0 + github.com/miekg/dns v1.0.14 + github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 + github.com/stretchr/testify v1.2.2 + golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3 // indirect + golang.org/x/net v0.0.0-20181023162649-9b4f9f5ad519 // indirect + golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 // indirect + golang.org/x/sys v0.0.0-20181026203630-95b1ffbd15a5 // indirect +) diff --git a/vendor/github.com/hashicorp/memberlist/memberlist.go b/vendor/github.com/hashicorp/memberlist/memberlist.go index e9084f9fd4..f289a12aed 100644 --- a/vendor/github.com/hashicorp/memberlist/memberlist.go +++ b/vendor/github.com/hashicorp/memberlist/memberlist.go @@ -15,6 +15,7 @@ multiple routes. package memberlist import ( + "container/list" "fmt" "log" "net" @@ -34,6 +35,7 @@ type Memberlist struct { sequenceNum uint32 // Local sequence number incarnation uint32 // Local incarnation number numNodes uint32 // Number of known nodes (estimate) + pushPullReq uint32 // Number of push/pull requests config *Config shutdown int32 // Used as an atomic boolean value @@ -45,7 +47,11 @@ type Memberlist struct { leaveLock sync.Mutex // Serializes calls to Leave transport Transport - handoff chan msgHandoff + + handoffCh chan struct{} + highPriorityMsgQueue *list.List + lowPriorityMsgQueue *list.List + msgQueueLock sync.Mutex nodeLock sync.RWMutex nodes []*nodeState // Known nodes @@ -66,6 +72,15 @@ type Memberlist struct { logger *log.Logger } +// BuildVsnArray creates the array of Vsn +func (conf *Config) BuildVsnArray() []uint8 { + return []uint8{ + ProtocolVersionMin, ProtocolVersionMax, conf.ProtocolVersion, + conf.DelegateProtocolMin, conf.DelegateProtocolMax, + conf.DelegateProtocolVersion, + } +} + // newMemberlist creates the network listeners. // Does not schedule execution of background maintenance. 
func newMemberlist(conf *Config) (*Memberlist, error) { @@ -160,17 +175,19 @@ func newMemberlist(conf *Config) (*Memberlist, error) { } m := &Memberlist{ - config: conf, - shutdownCh: make(chan struct{}), - leaveBroadcast: make(chan struct{}, 1), - transport: transport, - handoff: make(chan msgHandoff, conf.HandoffQueueDepth), - nodeMap: make(map[string]*nodeState), - nodeTimers: make(map[string]*suspicion), - awareness: newAwareness(conf.AwarenessMaxMultiplier), - ackHandlers: make(map[uint32]*ackHandler), - broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult}, - logger: logger, + config: conf, + shutdownCh: make(chan struct{}), + leaveBroadcast: make(chan struct{}, 1), + transport: transport, + handoffCh: make(chan struct{}, 1), + highPriorityMsgQueue: list.New(), + lowPriorityMsgQueue: list.New(), + nodeMap: make(map[string]*nodeState), + nodeTimers: make(map[string]*suspicion), + awareness: newAwareness(conf.AwarenessMaxMultiplier), + ackHandlers: make(map[uint32]*ackHandler), + broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult}, + logger: logger, } m.broadcasts.NumNodes = func() int { return m.estNumNodes() @@ -394,11 +411,7 @@ func (m *Memberlist) setAlive() error { Addr: addr, Port: uint16(port), Meta: meta, - Vsn: []uint8{ - ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, - m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, - m.config.DelegateProtocolVersion, - }, + Vsn: m.config.BuildVsnArray(), } m.aliveNode(&a, nil, true) return nil @@ -439,11 +452,7 @@ func (m *Memberlist) UpdateNode(timeout time.Duration) error { Addr: state.Addr, Port: state.Port, Meta: meta, - Vsn: []uint8{ - ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, - m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, - m.config.DelegateProtocolVersion, - }, + Vsn: m.config.BuildVsnArray(), } notifyCh := make(chan struct{}) m.aliveNode(&a, notifyCh, true) @@ -657,3 +666,27 @@ func (m *Memberlist) hasShutdown() bool { func (m *Memberlist) hasLeft() bool { return atomic.LoadInt32(&m.leave) == 1 } + +func (m *Memberlist) getNodeState(addr string) nodeStateType { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + + n := m.nodeMap[addr] + return n.State +} + +func (m *Memberlist) getNodeStateChange(addr string) time.Time { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + + n := m.nodeMap[addr] + return n.StateChange +} + +func (m *Memberlist) changeNode(addr string, f func(*nodeState)) { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + + n := m.nodeMap[addr] + f(n) +} diff --git a/vendor/github.com/hashicorp/memberlist/net.go b/vendor/github.com/hashicorp/memberlist/net.go index a4330c4d20..f6a0d45fed 100644 --- a/vendor/github.com/hashicorp/memberlist/net.go +++ b/vendor/github.com/hashicorp/memberlist/net.go @@ -8,9 +8,10 @@ import ( "hash/crc32" "io" "net" + "sync/atomic" "time" - "github.com/armon/go-metrics" + metrics "github.com/armon/go-metrics" "github.com/hashicorp/go-msgpack/codec" ) @@ -71,7 +72,8 @@ const ( compoundOverhead = 2 // Assumed overhead per entry in compoundHeader userMsgOverhead = 1 blockingWarning = 10 * time.Millisecond // Warn if a UDP packet takes this long to process - maxPushStateBytes = 10 * 1024 * 1024 + maxPushStateBytes = 20 * 1024 * 1024 + maxPushPullRequests = 128 // Maximum number of concurrent push/pull requests ) // ping request sent directly to node @@ -238,6 +240,16 @@ func (m *Memberlist) handleConn(conn net.Conn) { m.logger.Printf("[ERR] memberlist: Failed to receive user message: %s 
%s", err, LogConn(conn)) } case pushPullMsg: + // Increment counter of pending push/pulls + numConcurrent := atomic.AddUint32(&m.pushPullReq, 1) + defer atomic.AddUint32(&m.pushPullReq, ^uint32(0)) + + // Check if we have too many open push/pull requests + if numConcurrent >= maxPushPullRequests { + m.logger.Printf("[ERR] memberlist: Too many pending push/pull requests") + return + } + join, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to read remote state: %s %s", err, LogConn(conn)) @@ -357,10 +369,25 @@ func (m *Memberlist) handleCommand(buf []byte, from net.Addr, timestamp time.Tim case deadMsg: fallthrough case userMsg: + // Determine the message queue, prioritize alive + queue := m.lowPriorityMsgQueue + if msgType == aliveMsg { + queue = m.highPriorityMsgQueue + } + + // Check for overflow and append if not full + m.msgQueueLock.Lock() + if queue.Len() >= m.config.HandoffQueueDepth { + m.logger.Printf("[WARN] memberlist: handler queue full, dropping message (%d) %s", msgType, LogAddress(from)) + } else { + queue.PushBack(msgHandoff{msgType, buf, from}) + } + m.msgQueueLock.Unlock() + + // Notify of pending message select { - case m.handoff <- msgHandoff{msgType, buf, from}: + case m.handoffCh <- struct{}{}: default: - m.logger.Printf("[WARN] memberlist: handler queue full, dropping message (%d) %s", msgType, LogAddress(from)) } default: @@ -368,28 +395,51 @@ func (m *Memberlist) handleCommand(buf []byte, from net.Addr, timestamp time.Tim } } +// getNextMessage returns the next message to process in priority order, using LIFO +func (m *Memberlist) getNextMessage() (msgHandoff, bool) { + m.msgQueueLock.Lock() + defer m.msgQueueLock.Unlock() + + if el := m.highPriorityMsgQueue.Back(); el != nil { + m.highPriorityMsgQueue.Remove(el) + msg := el.Value.(msgHandoff) + return msg, true + } else if el := m.lowPriorityMsgQueue.Back(); el != nil { + m.lowPriorityMsgQueue.Remove(el) + msg := el.Value.(msgHandoff) + return msg, true + } + return msgHandoff{}, false +} + // packetHandler is a long running goroutine that processes messages received // over the packet interface, but is decoupled from the listener to avoid // blocking the listener which may cause ping/ack messages to be delayed. 
func (m *Memberlist) packetHandler() { for { select { - case msg := <-m.handoff: - msgType := msg.msgType - buf := msg.buf - from := msg.from - - switch msgType { - case suspectMsg: - m.handleSuspect(buf, from) - case aliveMsg: - m.handleAlive(buf, from) - case deadMsg: - m.handleDead(buf, from) - case userMsg: - m.handleUser(buf, from) - default: - m.logger.Printf("[ERR] memberlist: Message type (%d) not supported %s (packet handler)", msgType, LogAddress(from)) + case <-m.handoffCh: + for { + msg, ok := m.getNextMessage() + if !ok { + break + } + msgType := msg.msgType + buf := msg.buf + from := msg.from + + switch msgType { + case suspectMsg: + m.handleSuspect(buf, from) + case aliveMsg: + m.handleAlive(buf, from) + case deadMsg: + m.handleDead(buf, from) + case userMsg: + m.handleUser(buf, from) + default: + m.logger.Printf("[ERR] memberlist: Message type (%d) not supported %s (packet handler)", msgType, LogAddress(from)) + } } case <-m.shutdownCh: @@ -1094,7 +1144,7 @@ func (m *Memberlist) sendPingAndWaitForAck(addr string, ping ping, deadline time } if ack.SeqNo != ping.SeqNo { - return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d)", ack.SeqNo, ping.SeqNo, LogConn(conn)) + return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d)", ack.SeqNo, ping.SeqNo) } return true, nil diff --git a/vendor/github.com/hashicorp/memberlist/net_transport.go b/vendor/github.com/hashicorp/memberlist/net_transport.go index e7b88b01f6..4723127f54 100644 --- a/vendor/github.com/hashicorp/memberlist/net_transport.go +++ b/vendor/github.com/hashicorp/memberlist/net_transport.go @@ -221,6 +221,16 @@ func (t *NetTransport) Shutdown() error { // and hands them off to the stream channel. func (t *NetTransport) tcpListen(tcpLn *net.TCPListener) { defer t.wg.Done() + + // baseDelay is the initial delay after an AcceptTCP() error before attempting again + const baseDelay = 5 * time.Millisecond + + // maxDelay is the maximum delay after an AcceptTCP() error before attempting again. + // In the case that tcpListen() is error-looping, it will delay the shutdown check. + // Therefore, changes to maxDelay may have an effect on the latency of shutdown. + const maxDelay = 1 * time.Second + + var loopDelay time.Duration for { conn, err := tcpLn.AcceptTCP() if err != nil { @@ -228,9 +238,22 @@ func (t *NetTransport) tcpListen(tcpLn *net.TCPListener) { break } + if loopDelay == 0 { + loopDelay = baseDelay + } else { + loopDelay *= 2 + } + + if loopDelay > maxDelay { + loopDelay = maxDelay + } + t.logger.Printf("[ERR] memberlist: Error accepting TCP connection: %v", err) + time.Sleep(loopDelay) continue } + // No error, reset loop delay + loopDelay = 0 t.streamCh <- conn } diff --git a/vendor/github.com/hashicorp/memberlist/queue.go b/vendor/github.com/hashicorp/memberlist/queue.go index 994b90ff10..c970176e18 100644 --- a/vendor/github.com/hashicorp/memberlist/queue.go +++ b/vendor/github.com/hashicorp/memberlist/queue.go @@ -1,8 +1,10 @@ package memberlist import ( - "sort" + "math" "sync" + + "github.com/google/btree" ) // TransmitLimitedQueue is used to queue messages to broadcast to @@ -19,15 +21,93 @@ type TransmitLimitedQueue struct { // number of retransmissions attempted. RetransmitMult int - sync.Mutex - bcQueue limitedBroadcasts + mu sync.Mutex + tq *btree.BTree // stores *limitedBroadcast as btree.Item + tm map[string]*limitedBroadcast + idGen int64 } type limitedBroadcast struct { - transmits int // Number of transmissions attempted. 
+	transmits int   // btree-key[0]: Number of transmissions attempted.
+	msgLen    int64 // btree-key[1]: copied from len(b.Message())
+	id        int64 // btree-key[2]: unique incrementing id stamped at submission time
 	b Broadcast
+
+	name string // set if Broadcast is a NamedBroadcast
+}
+
+// Less tests whether the current item is less than the given argument.
+//
+// This must provide a strict weak ordering.
+// If !a.Less(b) && !b.Less(a), we treat this to mean a == b (i.e. we can only
+// hold one of either a or b in the tree).
+//
+// default ordering is
+// - [transmits=0, ..., transmits=inf]
+// - [transmits=0:len=999, ..., transmits=0:len=2, ...]
+// - [transmits=0:len=999,id=999, ..., transmits=0:len=999:id=1, ...]
+func (b *limitedBroadcast) Less(than btree.Item) bool {
+	o := than.(*limitedBroadcast)
+	if b.transmits < o.transmits {
+		return true
+	} else if b.transmits > o.transmits {
+		return false
+	}
+	if b.msgLen > o.msgLen {
+		return true
+	} else if b.msgLen < o.msgLen {
+		return false
+	}
+	return b.id > o.id
+}
+
+// for testing; emits in transmit order if reverse=false
+func (q *TransmitLimitedQueue) orderedView(reverse bool) []*limitedBroadcast {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	out := make([]*limitedBroadcast, 0, q.lenLocked())
+	q.walkReadOnlyLocked(reverse, func(cur *limitedBroadcast) bool {
+		out = append(out, cur)
+		return true
+	})
+
+	return out
+}
+
+// walkReadOnlyLocked calls f for each item in the queue traversing it in
+// natural order (by Less) when reverse=false and the opposite when true. You
+// must hold the mutex.
+//
+// This method panics if you attempt to mutate the item during traversal. The
+// underlying btree should also not be mutated during traversal.
+func (q *TransmitLimitedQueue) walkReadOnlyLocked(reverse bool, f func(*limitedBroadcast) bool) {
+	if q.lenLocked() == 0 {
+		return
+	}
+
+	iter := func(item btree.Item) bool {
+		cur := item.(*limitedBroadcast)
+
+		prevTransmits := cur.transmits
+		prevMsgLen := cur.msgLen
+		prevID := cur.id
+
+		keepGoing := f(cur)
+
+		if prevTransmits != cur.transmits || prevMsgLen != cur.msgLen || prevID != cur.id {
+			panic("edited queue while walking read only")
+		}
+
+		return keepGoing
+	}
+
+	if reverse {
+		q.tq.Descend(iter) // end with transmit 0
+	} else {
+		q.tq.Ascend(iter) // start with transmit 0
+	}
 }
-type limitedBroadcasts []*limitedBroadcast
 
 // Broadcast is something that can be broadcasted via gossip to
 // the memberlist cluster.
@@ -45,123 +125,298 @@ type Broadcast interface {
 	Finished()
 }
 
+// NamedBroadcast is an optional extension of the Broadcast interface that
+// gives each message a unique string name, which is used to optimize
+// invalidation: an older queued message with the same name is replaced
+// without scanning the whole queue.
+//
+// You should ensure that Invalidates() checks the same uniqueness as the
+// example below:
+//
+//	func (b *foo) Invalidates(other Broadcast) bool {
+//		nb, ok := other.(NamedBroadcast)
+//		if !ok {
+//			return false
+//		}
+//		return b.Name() == nb.Name()
+//	}
+//
+// Invalidates() isn't currently used for NamedBroadcasts, but that may change
+// in the future.
+type NamedBroadcast interface {
+	Broadcast
+	// The unique identity of this broadcast message.
+	Name() string
+}
+
+// UniqueBroadcast is an optional interface that indicates that each message is
+// intrinsically unique and there is no need to scan the broadcast queue for
+// duplicates.
+//
+// You should ensure that Invalidates() always returns false if implementing
+// this interface. Invalidates() isn't currently used for UniqueBroadcasts, but
+// that may change in the future.
+type UniqueBroadcast interface { + Broadcast + // UniqueBroadcast is just a marker method for this interface. + UniqueBroadcast() +} + // QueueBroadcast is used to enqueue a broadcast func (q *TransmitLimitedQueue) QueueBroadcast(b Broadcast) { - q.Lock() - defer q.Unlock() - - // Check if this message invalidates another - n := len(q.bcQueue) - for i := 0; i < n; i++ { - if b.Invalidates(q.bcQueue[i].b) { - q.bcQueue[i].b.Finished() - copy(q.bcQueue[i:], q.bcQueue[i+1:]) - q.bcQueue[n-1] = nil - q.bcQueue = q.bcQueue[:n-1] - n-- + q.queueBroadcast(b, 0) +} + +// lazyInit initializes internal data structures the first time they are +// needed. You must already hold the mutex. +func (q *TransmitLimitedQueue) lazyInit() { + if q.tq == nil { + q.tq = btree.New(32) + } + if q.tm == nil { + q.tm = make(map[string]*limitedBroadcast) + } +} + +// queueBroadcast is like QueueBroadcast but you can use a nonzero value for +// the initial transmit tier assigned to the message. This is meant to be used +// for unit testing. +func (q *TransmitLimitedQueue) queueBroadcast(b Broadcast, initialTransmits int) { + q.mu.Lock() + defer q.mu.Unlock() + + q.lazyInit() + + if q.idGen == math.MaxInt64 { + // it's super duper unlikely to wrap around within the retransmit limit + q.idGen = 1 + } else { + q.idGen++ + } + id := q.idGen + + lb := &limitedBroadcast{ + transmits: initialTransmits, + msgLen: int64(len(b.Message())), + id: id, + b: b, + } + unique := false + if nb, ok := b.(NamedBroadcast); ok { + lb.name = nb.Name() + } else if _, ok := b.(UniqueBroadcast); ok { + unique = true + } + + // Check if this message invalidates another. + if lb.name != "" { + if old, ok := q.tm[lb.name]; ok { + old.b.Finished() + q.deleteItem(old) + } + } else if !unique { + // Slow path, hopefully nothing hot hits this. + var remove []*limitedBroadcast + q.tq.Ascend(func(item btree.Item) bool { + cur := item.(*limitedBroadcast) + + // Special Broadcasts can only invalidate each other. + switch cur.b.(type) { + case NamedBroadcast: + // noop + case UniqueBroadcast: + // noop + default: + if b.Invalidates(cur.b) { + cur.b.Finished() + remove = append(remove, cur) + } + } + return true + }) + for _, cur := range remove { + q.deleteItem(cur) } } - // Append to the queue - q.bcQueue = append(q.bcQueue, &limitedBroadcast{0, b}) + // Append to the relevant queue. + q.addItem(lb) +} + +// deleteItem removes the given item from the overall datastructure. You +// must already hold the mutex. +func (q *TransmitLimitedQueue) deleteItem(cur *limitedBroadcast) { + _ = q.tq.Delete(cur) + if cur.name != "" { + delete(q.tm, cur.name) + } + + if q.tq.Len() == 0 { + // At idle there's no reason to let the id generator keep going + // indefinitely. + q.idGen = 0 + } +} + +// addItem adds the given item into the overall datastructure. You must already +// hold the mutex. +func (q *TransmitLimitedQueue) addItem(cur *limitedBroadcast) { + _ = q.tq.ReplaceOrInsert(cur) + if cur.name != "" { + q.tm[cur.name] = cur + } +} + +// getTransmitRange returns a pair of min/max values for transmit values +// represented by the current queue contents. Both values represent actual +// transmit values on the interval [0, len). You must already hold the mutex. 
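For a sense of how the interfaces above are meant to be consumed, here is a sketch of a NamedBroadcast against the v0.1.4 API; nodeUpdate is a hypothetical type, and keeping Invalidates consistent with Name lets QueueBroadcast take the O(1) map path rather than scanning the tree:

```go
package main

import (
	"fmt"

	"github.com/hashicorp/memberlist"
)

// nodeUpdate is a hypothetical broadcast carrying the latest state for one
// node; only the newest update per node name needs to stay queued.
type nodeUpdate struct {
	node string
	msg  []byte
}

// Name makes this a memberlist.NamedBroadcast: QueueBroadcast can then
// invalidate the previous update for the same node via a map lookup.
func (u *nodeUpdate) Name() string { return u.node }

// Invalidates mirrors the uniqueness that Name establishes, as the doc
// comment above recommends.
func (u *nodeUpdate) Invalidates(other memberlist.Broadcast) bool {
	nb, ok := other.(memberlist.NamedBroadcast)
	if !ok {
		return false
	}
	return u.Name() == nb.Name()
}

func (u *nodeUpdate) Message() []byte { return u.msg }

func (u *nodeUpdate) Finished() {}

func main() {
	q := &memberlist.TransmitLimitedQueue{
		NumNodes:       func() int { return 3 },
		RetransmitMult: 2,
	}
	q.QueueBroadcast(&nodeUpdate{node: "node-a", msg: []byte("v1")})
	q.QueueBroadcast(&nodeUpdate{node: "node-a", msg: []byte("v2")}) // replaces v1
	fmt.Println(q.NumQueued()) // 1
}
```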
+func (q *TransmitLimitedQueue) getTransmitRange() (minTransmit, maxTransmit int) { + if q.lenLocked() == 0 { + return 0, 0 + } + minItem, maxItem := q.tq.Min(), q.tq.Max() + if minItem == nil || maxItem == nil { + return 0, 0 + } + + min := minItem.(*limitedBroadcast).transmits + max := maxItem.(*limitedBroadcast).transmits + + return min, max } // GetBroadcasts is used to get a number of broadcasts, up to a byte limit // and applying a per-message overhead as provided. func (q *TransmitLimitedQueue) GetBroadcasts(overhead, limit int) [][]byte { - q.Lock() - defer q.Unlock() + q.mu.Lock() + defer q.mu.Unlock() // Fast path the default case - if len(q.bcQueue) == 0 { + if q.lenLocked() == 0 { return nil } transmitLimit := retransmitLimit(q.RetransmitMult, q.NumNodes()) - bytesUsed := 0 - var toSend [][]byte - - for i := len(q.bcQueue) - 1; i >= 0; i-- { - // Check if this is within our limits - b := q.bcQueue[i] - msg := b.b.Message() - if bytesUsed+overhead+len(msg) > limit { + + var ( + bytesUsed int + toSend [][]byte + reinsert []*limitedBroadcast + ) + + // Visit fresher items first, but only look at stuff that will fit. + // We'll go tier by tier, grabbing the largest items first. + minTr, maxTr := q.getTransmitRange() + for transmits := minTr; transmits <= maxTr; /*do not advance automatically*/ { + free := int64(limit - bytesUsed - overhead) + if free <= 0 { + break // bail out early + } + + // Search for the least element on a given tier (by transmit count) as + // defined in the limitedBroadcast.Less function that will fit into our + // remaining space. + greaterOrEqual := &limitedBroadcast{ + transmits: transmits, + msgLen: free, + id: math.MaxInt64, + } + lessThan := &limitedBroadcast{ + transmits: transmits + 1, + msgLen: math.MaxInt64, + id: math.MaxInt64, + } + var keep *limitedBroadcast + q.tq.AscendRange(greaterOrEqual, lessThan, func(item btree.Item) bool { + cur := item.(*limitedBroadcast) + // Check if this is within our limits + if int64(len(cur.b.Message())) > free { + // If this happens it's a bug in the datastructure or + // surrounding use doing something like having len(Message()) + // change over time. There's enough going on here that it's + // probably sane to just skip it and move on for now. + return true + } + keep = cur + return false + }) + if keep == nil { + // No more items of an appropriate size in the tier. + transmits++ continue } + msg := keep.b.Message() + // Add to slice to send bytesUsed += overhead + len(msg) toSend = append(toSend, msg) // Check if we should stop transmission - b.transmits++ - if b.transmits >= transmitLimit { - b.b.Finished() - n := len(q.bcQueue) - q.bcQueue[i], q.bcQueue[n-1] = q.bcQueue[n-1], nil - q.bcQueue = q.bcQueue[:n-1] + q.deleteItem(keep) + if keep.transmits+1 >= transmitLimit { + keep.b.Finished() + } else { + // We need to bump this item down to another transmit tier, but + // because it would be in the same direction that we're walking the + // tiers, we will have to delay the reinsertion until we are + // finished our search. Otherwise we'll possibly re-add the message + // when we ascend to the next tier. 
+ keep.transmits++ + reinsert = append(reinsert, keep) } } - // If we are sending anything, we need to re-sort to deal - // with adjusted transmit counts - if len(toSend) > 0 { - q.bcQueue.Sort() + for _, cur := range reinsert { + q.addItem(cur) } + return toSend } // NumQueued returns the number of queued messages func (q *TransmitLimitedQueue) NumQueued() int { - q.Lock() - defer q.Unlock() - return len(q.bcQueue) + q.mu.Lock() + defer q.mu.Unlock() + return q.lenLocked() } -// Reset clears all the queued messages -func (q *TransmitLimitedQueue) Reset() { - q.Lock() - defer q.Unlock() - for _, b := range q.bcQueue { - b.b.Finished() +// lenLocked returns the length of the overall queue datastructure. You must +// hold the mutex. +func (q *TransmitLimitedQueue) lenLocked() int { + if q.tq == nil { + return 0 } - q.bcQueue = nil + return q.tq.Len() +} + +// Reset clears all the queued messages. Should only be used for tests. +func (q *TransmitLimitedQueue) Reset() { + q.mu.Lock() + defer q.mu.Unlock() + + q.walkReadOnlyLocked(false, func(cur *limitedBroadcast) bool { + cur.b.Finished() + return true + }) + + q.tq = nil + q.tm = nil + q.idGen = 0 } // Prune will retain the maxRetain latest messages, and the rest // will be discarded. This can be used to prevent unbounded queue sizes func (q *TransmitLimitedQueue) Prune(maxRetain int) { - q.Lock() - defer q.Unlock() + q.mu.Lock() + defer q.mu.Unlock() // Do nothing if queue size is less than the limit - n := len(q.bcQueue) - if n < maxRetain { - return - } - - // Invalidate the messages we will be removing - for i := 0; i < n-maxRetain; i++ { - q.bcQueue[i].b.Finished() + for q.tq.Len() > maxRetain { + item := q.tq.Max() + if item == nil { + break + } + cur := item.(*limitedBroadcast) + cur.b.Finished() + q.deleteItem(cur) } - - // Move the messages, and retain only the last maxRetain - copy(q.bcQueue[0:], q.bcQueue[n-maxRetain:]) - q.bcQueue = q.bcQueue[:maxRetain] -} - -func (b limitedBroadcasts) Len() int { - return len(b) -} - -func (b limitedBroadcasts) Less(i, j int) bool { - return b[i].transmits < b[j].transmits -} - -func (b limitedBroadcasts) Swap(i, j int) { - b[i], b[j] = b[j], b[i] -} - -func (b limitedBroadcasts) Sort() { - sort.Sort(sort.Reverse(b)) } diff --git a/vendor/github.com/hashicorp/memberlist/state.go b/vendor/github.com/hashicorp/memberlist/state.go index f51692de0a..1af62943e8 100644 --- a/vendor/github.com/hashicorp/memberlist/state.go +++ b/vendor/github.com/hashicorp/memberlist/state.go @@ -9,7 +9,7 @@ import ( "sync/atomic" "time" - "github.com/armon/go-metrics" + metrics "github.com/armon/go-metrics" ) type nodeStateType int @@ -233,6 +233,15 @@ START: m.probeNode(&node) } +// probeNodeByAddr just safely calls probeNode given only the address of the node (for tests) +func (m *Memberlist) probeNodeByAddr(addr string) { + m.nodeLock.RLock() + n := m.nodeMap[addr] + m.nodeLock.RUnlock() + + m.probeNode(n) +} + // probeNode handles a single round of failure checking on a node. 
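Returning to the queue changes for a moment: the rewritten GetBroadcasts above keeps its original contract — callers pass a per-message overhead and a byte budget and get back whole messages, freshest transmit tier first. A usage sketch against the public API; the raw type, the 2-byte overhead, and the 1400-byte budget are illustrative only:

```go
package main

import (
	"fmt"

	"github.com/hashicorp/memberlist"
)

// raw is a trivial Broadcast that never invalidates anything.
type raw []byte

func (r raw) Invalidates(memberlist.Broadcast) bool { return false }
func (r raw) Message() []byte                       { return []byte(r) }
func (r raw) Finished()                             {}

func main() {
	q := &memberlist.TransmitLimitedQueue{
		NumNodes:       func() int { return 10 },
		RetransmitMult: 3,
	}
	q.QueueBroadcast(raw("small"))
	q.QueueBroadcast(raw("a considerably larger payload"))

	// Ask for up to 1400 bytes of messages, charging 2 bytes of framing
	// per message. Within a transmit tier the new code prefers the
	// largest message that still fits, so the budget is packed tightly.
	for _, msg := range q.GetBroadcasts(2, 1400) {
		fmt.Printf("would gossip %d bytes\n", len(msg))
	}
}
```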
func (m *Memberlist) probeNode(node *nodeState) { defer metrics.MeasureSince([]string{"memberlist", "probeNode"}, time.Now()) @@ -841,11 +850,26 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { return } + if len(a.Vsn) >= 3 { + pMin := a.Vsn[0] + pMax := a.Vsn[1] + pCur := a.Vsn[2] + if pMin == 0 || pMax == 0 || pMin > pMax { + m.logger.Printf("[WARN] memberlist: Ignoring an alive message for '%s' (%v:%d) because protocol version(s) are wrong: %d <= %d <= %d should be >0", a.Node, net.IP(a.Addr), a.Port, pMin, pCur, pMax) + return + } + } + // Invoke the Alive delegate if any. This can be used to filter out // alive messages based on custom logic. For example, using a cluster name. // Using a merge delegate is not enough, as it is possible for passive // cluster merging to still occur. if m.config.Alive != nil { + if len(a.Vsn) < 6 { + m.logger.Printf("[WARN] memberlist: ignoring alive message for '%s' (%v:%d) because Vsn is not present", + a.Node, net.IP(a.Addr), a.Port) + return + } node := &Node{ Name: a.Node, Addr: a.Addr, @@ -867,6 +891,7 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { // Check if we've never seen this node before, and if not, then // store this node in our node map. + var updatesNode bool if !ok { state = &nodeState{ Node: Node{ @@ -877,6 +902,14 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { }, State: stateDead, } + if len(a.Vsn) > 5 { + state.PMin = a.Vsn[0] + state.PMax = a.Vsn[1] + state.PCur = a.Vsn[2] + state.DMin = a.Vsn[3] + state.DMax = a.Vsn[4] + state.DCur = a.Vsn[5] + } // Add to map m.nodeMap[a.Node] = state @@ -894,29 +927,40 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { // Update numNodes after we've added a new node atomic.AddUint32(&m.numNodes, 1) - } - - // Check if this address is different than the existing node - if !bytes.Equal([]byte(state.Addr), a.Addr) || state.Port != a.Port { - m.logger.Printf("[ERR] memberlist: Conflicting address for %s. Mine: %v:%d Theirs: %v:%d", - state.Name, state.Addr, state.Port, net.IP(a.Addr), a.Port) - - // Inform the conflict delegate if provided - if m.config.Conflict != nil { - other := Node{ - Name: a.Node, - Addr: a.Addr, - Port: a.Port, - Meta: a.Meta, + } else { + // Check if this address is different than the existing node unless the old node is dead. + if !bytes.Equal([]byte(state.Addr), a.Addr) || state.Port != a.Port { + // If DeadNodeReclaimTime is configured, check if enough time has elapsed since the node died. + canReclaim := (m.config.DeadNodeReclaimTime > 0 && + time.Since(state.StateChange) > m.config.DeadNodeReclaimTime) + + // Allow the address to be updated if a dead node is being replaced. + if state.State == stateDead && canReclaim { + m.logger.Printf("[INFO] memberlist: Updating address for failed node %s from %v:%d to %v:%d", + state.Name, state.Addr, state.Port, net.IP(a.Addr), a.Port) + updatesNode = true + } else { + m.logger.Printf("[ERR] memberlist: Conflicting address for %s. 
Mine: %v:%d Theirs: %v:%d Old state: %v", + state.Name, state.Addr, state.Port, net.IP(a.Addr), a.Port, state.State) + + // Inform the conflict delegate if provided + if m.config.Conflict != nil { + other := Node{ + Name: a.Node, + Addr: a.Addr, + Port: a.Port, + Meta: a.Meta, + } + m.config.Conflict.NotifyConflict(&state.Node, &other) + } + return } - m.config.Conflict.NotifyConflict(&state.Node, &other) } - return } // Bail if the incarnation number is older, and this is not about us isLocalNode := state.Name == m.config.Name - if a.Incarnation <= state.Incarnation && !isLocalNode { + if a.Incarnation <= state.Incarnation && !isLocalNode && !updatesNode { return } @@ -956,9 +1000,8 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { bytes.Equal(a.Vsn, versions) { return } - m.refute(state, a.Incarnation) - m.logger.Printf("[WARN] memberlist: Refuting an alive message") + m.logger.Printf("[WARN] memberlist: Refuting an alive message for '%s' (%v:%d) meta:(%v VS %v), vsn:(%v VS %v)", a.Node, net.IP(a.Addr), a.Port, a.Meta, state.Meta, a.Vsn, versions) } else { m.encodeBroadcastNotify(a.Node, aliveMsg, a, notify) @@ -975,6 +1018,8 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { // Update the state and incarnation number state.Incarnation = a.Incarnation state.Meta = a.Meta + state.Addr = a.Addr + state.Port = a.Port if state.State != stateAlive { state.State = stateAlive state.StateChange = time.Now() diff --git a/vendor/github.com/hashicorp/memberlist/util.go b/vendor/github.com/hashicorp/memberlist/util.go index e2381a6986..1e582a8a1b 100644 --- a/vendor/github.com/hashicorp/memberlist/util.go +++ b/vendor/github.com/hashicorp/memberlist/util.go @@ -78,10 +78,9 @@ func retransmitLimit(retransmitMult, n int) int { // shuffleNodes randomly shuffles the input nodes using the Fisher-Yates shuffle func shuffleNodes(nodes []*nodeState) { n := len(nodes) - for i := n - 1; i > 0; i-- { - j := rand.Intn(i + 1) + rand.Shuffle(n, func(i, j int) { nodes[i], nodes[j] = nodes[j], nodes[i] - } + }) } // pushPushScale is used to scale the time interval at which push/pull From 0fc1700cecd6fb503dd1fba778a914764db0cb3e Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 26 Aug 2019 13:19:15 +0200 Subject: [PATCH 2/6] vendor github.com/google/btree v1.0.0 Signed-off-by: Sebastiaan van Stijn --- vendor.conf | 1 + vendor/github.com/google/btree/LICENSE | 202 +++++ vendor/github.com/google/btree/README.md | 12 + vendor/github.com/google/btree/btree.go | 890 +++++++++++++++++++++++ 4 files changed, 1105 insertions(+) create mode 100644 vendor/github.com/google/btree/LICENSE create mode 100644 vendor/github.com/google/btree/README.md create mode 100644 vendor/github.com/google/btree/btree.go diff --git a/vendor.conf b/vendor.conf index d9bea5e2fa..35bea74196 100644 --- a/vendor.conf +++ b/vendor.conf @@ -20,6 +20,7 @@ github.com/gogo/protobuf 5628607bb4c51c3157aacc3a50f0 github.com/godbus/dbus/v5 37bf87eef99d69c4f1d3528bd66e3a87dc201472 # v5.0.3 github.com/gorilla/mux 98cb6bf42e086f6af920b965c38cacc07402d51b # v1.8.0 +github.com/google/btree 4030bb1f1f0c35b30ca7009e9ebd06849dd45306 # v1.0.0 github.com/hashicorp/consul 9a9cc9341bb487651a0399e3fc5e1e8a42e62dd9 # v0.5.2 github.com/hashicorp/errwrap 8a6fb523712970c966eefc6b39ed2c5e74880354 # v1.0.0 github.com/hashicorp/go-msgpack 71c2886f5a673a35f909803f38ece5810165097b diff --git a/vendor/github.com/google/btree/LICENSE b/vendor/github.com/google/btree/LICENSE new file mode 100644 index 
0000000000..d645695673 --- /dev/null +++ b/vendor/github.com/google/btree/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/google/btree/README.md b/vendor/github.com/google/btree/README.md new file mode 100644 index 0000000000..6062a4dacd --- /dev/null +++ b/vendor/github.com/google/btree/README.md @@ -0,0 +1,12 @@ +# BTree implementation for Go + +![Travis CI Build Status](https://api.travis-ci.org/google/btree.svg?branch=master) + +This package provides an in-memory B-Tree implementation for Go, useful as +an ordered, mutable data structure. + +The API is based off of the wonderful +http://godoc.org/github.com/petar/GoLLRB/llrb, and is meant to allow btree to +act as a drop-in replacement for gollrb trees. + +See http://godoc.org/github.com/google/btree for documentation. diff --git a/vendor/github.com/google/btree/btree.go b/vendor/github.com/google/btree/btree.go new file mode 100644 index 0000000000..6ff062f9bb --- /dev/null +++ b/vendor/github.com/google/btree/btree.go @@ -0,0 +1,890 @@ +// Copyright 2014 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package btree implements in-memory B-Trees of arbitrary degree. +// +// btree implements an in-memory B-Tree for use as an ordered data structure. +// It is not meant for persistent storage solutions. +// +// It has a flatter structure than an equivalent red-black or other binary tree, +// which in some cases yields better memory usage and/or performance. +// See some discussion on the matter here: +// http://google-opensource.blogspot.com/2013/01/c-containers-that-save-memory-and-time.html +// Note, though, that this project is in no way related to the C++ B-Tree +// implementation written about there. +// +// Within this tree, each node contains a slice of items and a (possibly nil) +// slice of children. For basic numeric values or raw structs, this can cause +// efficiency differences when compared to equivalent C++ template code that +// stores values in arrays within the node: +// * Due to the overhead of storing values as interfaces (each +// value needs to be stored as the value itself, then 2 words for the +// interface pointing to that value and its type), resulting in higher +// memory use. +// * Since interfaces can point to values anywhere in memory, values are +// most likely not stored in contiguous blocks, resulting in a higher +// number of cache misses. +// These issues don't tend to matter, though, when working with strings or other +// heap-allocated structures, since C++-equivalent structures also must store +// pointers and also distribute their values across the heap. 
+// +// This implementation is designed to be a drop-in replacement to gollrb.LLRB +// trees, (http://github.com/petar/gollrb), an excellent and probably the most +// widely used ordered tree implementation in the Go ecosystem currently. +// Its functions, therefore, exactly mirror those of +// llrb.LLRB where possible. Unlike gollrb, though, we currently don't +// support storing multiple equivalent values. +package btree + +import ( + "fmt" + "io" + "sort" + "strings" + "sync" +) + +// Item represents a single object in the tree. +type Item interface { + // Less tests whether the current item is less than the given argument. + // + // This must provide a strict weak ordering. + // If !a.Less(b) && !b.Less(a), we treat this to mean a == b (i.e. we can only + // hold one of either a or b in the tree). + Less(than Item) bool +} + +const ( + DefaultFreeListSize = 32 +) + +var ( + nilItems = make(items, 16) + nilChildren = make(children, 16) +) + +// FreeList represents a free list of btree nodes. By default each +// BTree has its own FreeList, but multiple BTrees can share the same +// FreeList. +// Two Btrees using the same freelist are safe for concurrent write access. +type FreeList struct { + mu sync.Mutex + freelist []*node +} + +// NewFreeList creates a new free list. +// size is the maximum size of the returned free list. +func NewFreeList(size int) *FreeList { + return &FreeList{freelist: make([]*node, 0, size)} +} + +func (f *FreeList) newNode() (n *node) { + f.mu.Lock() + index := len(f.freelist) - 1 + if index < 0 { + f.mu.Unlock() + return new(node) + } + n = f.freelist[index] + f.freelist[index] = nil + f.freelist = f.freelist[:index] + f.mu.Unlock() + return +} + +// freeNode adds the given node to the list, returning true if it was added +// and false if it was discarded. +func (f *FreeList) freeNode(n *node) (out bool) { + f.mu.Lock() + if len(f.freelist) < cap(f.freelist) { + f.freelist = append(f.freelist, n) + out = true + } + f.mu.Unlock() + return +} + +// ItemIterator allows callers of Ascend* to iterate in-order over portions of +// the tree. When this function returns false, iteration will stop and the +// associated Ascend* function will immediately return. +type ItemIterator func(i Item) bool + +// New creates a new B-Tree with the given degree. +// +// New(2), for example, will create a 2-3-4 tree (each node contains 1-3 items +// and 2-4 children). +func New(degree int) *BTree { + return NewWithFreeList(degree, NewFreeList(DefaultFreeListSize)) +} + +// NewWithFreeList creates a new B-Tree that uses the given node free list. +func NewWithFreeList(degree int, f *FreeList) *BTree { + if degree <= 1 { + panic("bad degree") + } + return &BTree{ + degree: degree, + cow: ©OnWriteContext{freelist: f}, + } +} + +// items stores items in a node. +type items []Item + +// insertAt inserts a value into the given index, pushing all subsequent values +// forward. +func (s *items) insertAt(index int, item Item) { + *s = append(*s, nil) + if index < len(*s) { + copy((*s)[index+1:], (*s)[index:]) + } + (*s)[index] = item +} + +// removeAt removes a value at a given index, pulling all subsequent values +// back. +func (s *items) removeAt(index int) Item { + item := (*s)[index] + copy((*s)[index:], (*s)[index+1:]) + (*s)[len(*s)-1] = nil + *s = (*s)[:len(*s)-1] + return item +} + +// pop removes and returns the last element in the list. 
+func (s *items) pop() (out Item) { + index := len(*s) - 1 + out = (*s)[index] + (*s)[index] = nil + *s = (*s)[:index] + return +} + +// truncate truncates this instance at index so that it contains only the +// first index items. index must be less than or equal to length. +func (s *items) truncate(index int) { + var toClear items + *s, toClear = (*s)[:index], (*s)[index:] + for len(toClear) > 0 { + toClear = toClear[copy(toClear, nilItems):] + } +} + +// find returns the index where the given item should be inserted into this +// list. 'found' is true if the item already exists in the list at the given +// index. +func (s items) find(item Item) (index int, found bool) { + i := sort.Search(len(s), func(i int) bool { + return item.Less(s[i]) + }) + if i > 0 && !s[i-1].Less(item) { + return i - 1, true + } + return i, false +} + +// children stores child nodes in a node. +type children []*node + +// insertAt inserts a value into the given index, pushing all subsequent values +// forward. +func (s *children) insertAt(index int, n *node) { + *s = append(*s, nil) + if index < len(*s) { + copy((*s)[index+1:], (*s)[index:]) + } + (*s)[index] = n +} + +// removeAt removes a value at a given index, pulling all subsequent values +// back. +func (s *children) removeAt(index int) *node { + n := (*s)[index] + copy((*s)[index:], (*s)[index+1:]) + (*s)[len(*s)-1] = nil + *s = (*s)[:len(*s)-1] + return n +} + +// pop removes and returns the last element in the list. +func (s *children) pop() (out *node) { + index := len(*s) - 1 + out = (*s)[index] + (*s)[index] = nil + *s = (*s)[:index] + return +} + +// truncate truncates this instance at index so that it contains only the +// first index children. index must be less than or equal to length. +func (s *children) truncate(index int) { + var toClear children + *s, toClear = (*s)[:index], (*s)[index:] + for len(toClear) > 0 { + toClear = toClear[copy(toClear, nilChildren):] + } +} + +// node is an internal node in a tree. +// +// It must at all times maintain the invariant that either +// * len(children) == 0, len(items) unconstrained +// * len(children) == len(items) + 1 +type node struct { + items items + children children + cow *copyOnWriteContext +} + +func (n *node) mutableFor(cow *copyOnWriteContext) *node { + if n.cow == cow { + return n + } + out := cow.newNode() + if cap(out.items) >= len(n.items) { + out.items = out.items[:len(n.items)] + } else { + out.items = make(items, len(n.items), cap(n.items)) + } + copy(out.items, n.items) + // Copy children + if cap(out.children) >= len(n.children) { + out.children = out.children[:len(n.children)] + } else { + out.children = make(children, len(n.children), cap(n.children)) + } + copy(out.children, n.children) + return out +} + +func (n *node) mutableChild(i int) *node { + c := n.children[i].mutableFor(n.cow) + n.children[i] = c + return c +} + +// split splits the given node at the given index. The current node shrinks, +// and this function returns the item that existed at that index and a new node +// containing all items/children after it. +func (n *node) split(i int) (Item, *node) { + item := n.items[i] + next := n.cow.newNode() + next.items = append(next.items, n.items[i+1:]...) + n.items.truncate(i) + if len(n.children) > 0 { + next.children = append(next.children, n.children[i+1:]...) + n.children.truncate(i + 1) + } + return item, next +} + +// maybeSplitChild checks if a child should be split, and if so splits it. +// Returns whether or not a split occurred. 
+func (n *node) maybeSplitChild(i, maxItems int) bool { + if len(n.children[i].items) < maxItems { + return false + } + first := n.mutableChild(i) + item, second := first.split(maxItems / 2) + n.items.insertAt(i, item) + n.children.insertAt(i+1, second) + return true +} + +// insert inserts an item into the subtree rooted at this node, making sure +// no nodes in the subtree exceed maxItems items. Should an equivalent item +// be found/replaced by insert, it will be returned. +func (n *node) insert(item Item, maxItems int) Item { + i, found := n.items.find(item) + if found { + out := n.items[i] + n.items[i] = item + return out + } + if len(n.children) == 0 { + n.items.insertAt(i, item) + return nil + } + if n.maybeSplitChild(i, maxItems) { + inTree := n.items[i] + switch { + case item.Less(inTree): + // no change, we want first split node + case inTree.Less(item): + i++ // we want second split node + default: + out := n.items[i] + n.items[i] = item + return out + } + } + return n.mutableChild(i).insert(item, maxItems) +} + +// get finds the given key in the subtree and returns it. +func (n *node) get(key Item) Item { + i, found := n.items.find(key) + if found { + return n.items[i] + } else if len(n.children) > 0 { + return n.children[i].get(key) + } + return nil +} + +// min returns the first item in the subtree. +func min(n *node) Item { + if n == nil { + return nil + } + for len(n.children) > 0 { + n = n.children[0] + } + if len(n.items) == 0 { + return nil + } + return n.items[0] +} + +// max returns the last item in the subtree. +func max(n *node) Item { + if n == nil { + return nil + } + for len(n.children) > 0 { + n = n.children[len(n.children)-1] + } + if len(n.items) == 0 { + return nil + } + return n.items[len(n.items)-1] +} + +// toRemove details what item to remove in a node.remove call. +type toRemove int + +const ( + removeItem toRemove = iota // removes the given item + removeMin // removes smallest item in the subtree + removeMax // removes largest item in the subtree +) + +// remove removes an item from the subtree rooted at this node. +func (n *node) remove(item Item, minItems int, typ toRemove) Item { + var i int + var found bool + switch typ { + case removeMax: + if len(n.children) == 0 { + return n.items.pop() + } + i = len(n.items) + case removeMin: + if len(n.children) == 0 { + return n.items.removeAt(0) + } + i = 0 + case removeItem: + i, found = n.items.find(item) + if len(n.children) == 0 { + if found { + return n.items.removeAt(i) + } + return nil + } + default: + panic("invalid type") + } + // If we get to here, we have children. + if len(n.children[i].items) <= minItems { + return n.growChildAndRemove(i, item, minItems, typ) + } + child := n.mutableChild(i) + // Either we had enough items to begin with, or we've done some + // merging/stealing, because we've got enough now and we're ready to return + // stuff. + if found { + // The item exists at index 'i', and the child we've selected can give us a + // predecessor, since if we've gotten here it's got > minItems items in it. + out := n.items[i] + // We use our special-case 'remove' call with typ=removeMax to pull the + // predecessor of item i (the rightmost leaf of our immediate left child) + // and set it into where we pulled the item from. + n.items[i] = child.remove(nil, minItems, removeMax) + return out + } + // Final recursive call. Once we're here, we know that the item isn't in this + // node and that the child is big enough to remove from.
+ return child.remove(item, minItems, typ) +} + +// growChildAndRemove grows child 'i' to make sure it's possible to remove an +// item from it while keeping it at minItems, then calls remove to actually +// remove it. +// +// Most documentation says we have to do two sets of special casing: +// 1) item is in this node +// 2) item is in child +// In both cases, we need to handle the two subcases: +// A) node has enough values that it can spare one +// B) node doesn't have enough values +// For the latter, we have to check: +// a) left sibling has a node to spare +// b) right sibling has a node to spare +// c) we must merge +// To simplify our code here, we handle cases #1 and #2 the same: +// If a node doesn't have enough items, we make sure it does (using a,b,c). +// We then simply redo our remove call, and the second time (regardless of +// whether we're in case 1 or 2), we'll have enough items and can guarantee +// that we hit case A. +func (n *node) growChildAndRemove(i int, item Item, minItems int, typ toRemove) Item { + if i > 0 && len(n.children[i-1].items) > minItems { + // Steal from left child + child := n.mutableChild(i) + stealFrom := n.mutableChild(i - 1) + stolenItem := stealFrom.items.pop() + child.items.insertAt(0, n.items[i-1]) + n.items[i-1] = stolenItem + if len(stealFrom.children) > 0 { + child.children.insertAt(0, stealFrom.children.pop()) + } + } else if i < len(n.items) && len(n.children[i+1].items) > minItems { + // steal from right child + child := n.mutableChild(i) + stealFrom := n.mutableChild(i + 1) + stolenItem := stealFrom.items.removeAt(0) + child.items = append(child.items, n.items[i]) + n.items[i] = stolenItem + if len(stealFrom.children) > 0 { + child.children = append(child.children, stealFrom.children.removeAt(0)) + } + } else { + if i >= len(n.items) { + i-- + } + child := n.mutableChild(i) + // merge with right child + mergeItem := n.items.removeAt(i) + mergeChild := n.children.removeAt(i + 1) + child.items = append(child.items, mergeItem) + child.items = append(child.items, mergeChild.items...) + child.children = append(child.children, mergeChild.children...) + n.cow.freeNode(mergeChild) + } + return n.remove(item, minItems, typ) +} + +type direction int + +const ( + descend = direction(-1) + ascend = direction(+1) +) + +// iterate provides a simple method for iterating over elements in the tree. +// +// When ascending, the 'start' should be less than 'stop' and when descending, +// the 'start' should be greater than 'stop'. Setting 'includeStart' to true +// will force the iterator to include the first item when it equals 'start', +// thus creating a "greaterOrEqual" or "lessThanEqual" query rather than just a +// "greaterThan" or "lessThan" query.
+func (n *node) iterate(dir direction, start, stop Item, includeStart bool, hit bool, iter ItemIterator) (bool, bool) { + var ok, found bool + var index int + switch dir { + case ascend: + if start != nil { + index, _ = n.items.find(start) + } + for i := index; i < len(n.items); i++ { + if len(n.children) > 0 { + if hit, ok = n.children[i].iterate(dir, start, stop, includeStart, hit, iter); !ok { + return hit, false + } + } + if !includeStart && !hit && start != nil && !start.Less(n.items[i]) { + hit = true + continue + } + hit = true + if stop != nil && !n.items[i].Less(stop) { + return hit, false + } + if !iter(n.items[i]) { + return hit, false + } + } + if len(n.children) > 0 { + if hit, ok = n.children[len(n.children)-1].iterate(dir, start, stop, includeStart, hit, iter); !ok { + return hit, false + } + } + case descend: + if start != nil { + index, found = n.items.find(start) + if !found { + index = index - 1 + } + } else { + index = len(n.items) - 1 + } + for i := index; i >= 0; i-- { + if start != nil && !n.items[i].Less(start) { + if !includeStart || hit || start.Less(n.items[i]) { + continue + } + } + if len(n.children) > 0 { + if hit, ok = n.children[i+1].iterate(dir, start, stop, includeStart, hit, iter); !ok { + return hit, false + } + } + if stop != nil && !stop.Less(n.items[i]) { + return hit, false // continue + } + hit = true + if !iter(n.items[i]) { + return hit, false + } + } + if len(n.children) > 0 { + if hit, ok = n.children[0].iterate(dir, start, stop, includeStart, hit, iter); !ok { + return hit, false + } + } + } + return hit, true +} + +// Used for testing/debugging purposes. +func (n *node) print(w io.Writer, level int) { + fmt.Fprintf(w, "%sNODE:%v\n", strings.Repeat(" ", level), n.items) + for _, c := range n.children { + c.print(w, level+1) + } +} + +// BTree is an implementation of a B-Tree. +// +// BTree stores Item instances in an ordered structure, allowing easy insertion, +// removal, and iteration. +// +// Write operations are not safe for concurrent mutation by multiple +// goroutines, but Read operations are. +type BTree struct { + degree int + length int + root *node + cow *copyOnWriteContext +} + +// copyOnWriteContext pointers determine node ownership... a tree with a write +// context equivalent to a node's write context is allowed to modify that node. +// A tree whose write context does not match a node's is not allowed to modify +// it, and must create a new, writable copy (IE: it's a Clone). +// +// When doing any write operation, we maintain the invariant that the current +// node's context is equal to the context of the tree that requested the write. +// We do this by, before we descend into any node, creating a copy with the +// correct context if the contexts don't match. +// +// Since the node we're currently visiting on any write has the requesting +// tree's context, that node is modifiable in place. Children of that node may +// not share context, but before we descend into them, we'll make a mutable +// copy. +type copyOnWriteContext struct { + freelist *FreeList +} + +// Clone clones the btree, lazily. Clone should not be called concurrently, +// but the original tree (t) and the new tree (t2) can be used concurrently +// once the Clone call completes. +// +// The internal tree structure of t is marked read-only and shared between t and +// t2. Writes to both t and t2 use copy-on-write logic, creating new nodes +// whenever one of t's original nodes would have been modified. Read operations +// should have no performance degradation.
Write operations for both t and t2 +// will initially experience minor slow-downs caused by additional allocs and +// copies due to the aforementioned copy-on-write logic, but should converge to +// the original performance characteristics of the original tree. +func (t *BTree) Clone() (t2 *BTree) { + // Create two entirely new copy-on-write contexts. + // This operation effectively creates three trees: + // the original, shared nodes (old b.cow) + // the new b.cow nodes + // the new out.cow nodes + cow1, cow2 := *t.cow, *t.cow + out := *t + t.cow = &cow1 + out.cow = &cow2 + return &out +} + +// maxItems returns the max number of items to allow per node. +func (t *BTree) maxItems() int { + return t.degree*2 - 1 +} + +// minItems returns the min number of items to allow per node (ignored for the +// root node). +func (t *BTree) minItems() int { + return t.degree - 1 +} + +func (c *copyOnWriteContext) newNode() (n *node) { + n = c.freelist.newNode() + n.cow = c + return +} + +type freeType int + +const ( + ftFreelistFull freeType = iota // node was freed (available for GC, not stored in freelist) + ftStored // node was stored in the freelist for later use + ftNotOwned // node was ignored by COW, since it's owned by another one +) + +// freeNode frees a node within a given COW context, if it's owned by that +// context. It returns what happened to the node (see freeType const +// documentation). +func (c *copyOnWriteContext) freeNode(n *node) freeType { + if n.cow == c { + // clear to allow GC + n.items.truncate(0) + n.children.truncate(0) + n.cow = nil + if c.freelist.freeNode(n) { + return ftStored + } else { + return ftFreelistFull + } + } else { + return ftNotOwned + } +} + +// ReplaceOrInsert adds the given item to the tree. If an item in the tree +// already equals the given one, it is removed from the tree and returned. +// Otherwise, nil is returned. +// +// nil cannot be added to the tree (will panic). +func (t *BTree) ReplaceOrInsert(item Item) Item { + if item == nil { + panic("nil item being added to BTree") + } + if t.root == nil { + t.root = t.cow.newNode() + t.root.items = append(t.root.items, item) + t.length++ + return nil + } else { + t.root = t.root.mutableFor(t.cow) + if len(t.root.items) >= t.maxItems() { + item2, second := t.root.split(t.maxItems() / 2) + oldroot := t.root + t.root = t.cow.newNode() + t.root.items = append(t.root.items, item2) + t.root.children = append(t.root.children, oldroot, second) + } + } + out := t.root.insert(item, t.maxItems()) + if out == nil { + t.length++ + } + return out +} + +// Delete removes an item equal to the passed in item from the tree, returning +// it. If no such item exists, returns nil. +func (t *BTree) Delete(item Item) Item { + return t.deleteItem(item, removeItem) +} + +// DeleteMin removes the smallest item in the tree and returns it. +// If no such item exists, returns nil. +func (t *BTree) DeleteMin() Item { + return t.deleteItem(nil, removeMin) +} + +// DeleteMax removes the largest item in the tree and returns it. +// If no such item exists, returns nil. 
+func (t *BTree) DeleteMax() Item { + return t.deleteItem(nil, removeMax) +} + +func (t *BTree) deleteItem(item Item, typ toRemove) Item { + if t.root == nil || len(t.root.items) == 0 { + return nil + } + t.root = t.root.mutableFor(t.cow) + out := t.root.remove(item, t.minItems(), typ) + if len(t.root.items) == 0 && len(t.root.children) > 0 { + oldroot := t.root + t.root = t.root.children[0] + t.cow.freeNode(oldroot) + } + if out != nil { + t.length-- + } + return out +} + +// AscendRange calls the iterator for every value in the tree within the range +// [greaterOrEqual, lessThan), until iterator returns false. +func (t *BTree) AscendRange(greaterOrEqual, lessThan Item, iterator ItemIterator) { + if t.root == nil { + return + } + t.root.iterate(ascend, greaterOrEqual, lessThan, true, false, iterator) +} + +// AscendLessThan calls the iterator for every value in the tree within the range +// [first, pivot), until iterator returns false. +func (t *BTree) AscendLessThan(pivot Item, iterator ItemIterator) { + if t.root == nil { + return + } + t.root.iterate(ascend, nil, pivot, false, false, iterator) +} + +// AscendGreaterOrEqual calls the iterator for every value in the tree within +// the range [pivot, last], until iterator returns false. +func (t *BTree) AscendGreaterOrEqual(pivot Item, iterator ItemIterator) { + if t.root == nil { + return + } + t.root.iterate(ascend, pivot, nil, true, false, iterator) +} + +// Ascend calls the iterator for every value in the tree within the range +// [first, last], until iterator returns false. +func (t *BTree) Ascend(iterator ItemIterator) { + if t.root == nil { + return + } + t.root.iterate(ascend, nil, nil, false, false, iterator) +} + +// DescendRange calls the iterator for every value in the tree within the range +// [lessOrEqual, greaterThan), until iterator returns false. +func (t *BTree) DescendRange(lessOrEqual, greaterThan Item, iterator ItemIterator) { + if t.root == nil { + return + } + t.root.iterate(descend, lessOrEqual, greaterThan, true, false, iterator) +} + +// DescendLessOrEqual calls the iterator for every value in the tree within the range +// [pivot, first], until iterator returns false. +func (t *BTree) DescendLessOrEqual(pivot Item, iterator ItemIterator) { + if t.root == nil { + return + } + t.root.iterate(descend, pivot, nil, true, false, iterator) +} + +// DescendGreaterThan calls the iterator for every value in the tree within +// the range (pivot, last], until iterator returns false. +func (t *BTree) DescendGreaterThan(pivot Item, iterator ItemIterator) { + if t.root == nil { + return + } + t.root.iterate(descend, nil, pivot, false, false, iterator) +} + +// Descend calls the iterator for every value in the tree within the range +// [last, first], until iterator returns false. +func (t *BTree) Descend(iterator ItemIterator) { + if t.root == nil { + return + } + t.root.iterate(descend, nil, nil, false, false, iterator) +} + +// Get looks for the key item in the tree, returning it. It returns nil if +// unable to find that item. +func (t *BTree) Get(key Item) Item { + if t.root == nil { + return nil + } + return t.root.get(key) +} + +// Min returns the smallest item in the tree, or nil if the tree is empty. +func (t *BTree) Min() Item { + return min(t.root) +} + +// Max returns the largest item in the tree, or nil if the tree is empty. +func (t *BTree) Max() Item { + return max(t.root) +} + +// Has returns true if the given key is in the tree. 
+func (t *BTree) Has(key Item) bool { + return t.Get(key) != nil +} + +// Len returns the number of items currently in the tree. +func (t *BTree) Len() int { + return t.length +} + +// Clear removes all items from the btree. If addNodesToFreelist is true, +// t's nodes are added to its freelist as part of this call, until the freelist +// is full. Otherwise, the root node is simply dereferenced and the subtree +// left to Go's normal GC processes. +// +// This can be much faster +// than calling Delete on all elements, because that requires finding/removing +// each element in the tree and updating the tree accordingly. It also is +// somewhat faster than creating a new tree to replace the old one, because +// nodes from the old tree are reclaimed into the freelist for use by the new +// one, instead of being lost to the garbage collector. +// +// This call takes: +// O(1): when addNodesToFreelist is false, this is a single operation. +// O(1): when the freelist is already full, it breaks out immediately +// O(freelist size): when the freelist is empty and the nodes are all owned +// by this tree, nodes are added to the freelist until full. +// O(tree size): when all nodes are owned by another tree, all nodes are +// iterated over looking for nodes to add to the freelist, and due to +// ownership, none are. +func (t *BTree) Clear(addNodesToFreelist bool) { + if t.root != nil && addNodesToFreelist { + t.root.reset(t.cow) + } + t.root, t.length = nil, 0 +} + +// reset returns a subtree to the freelist. It breaks out immediately if the +// freelist is full, since the only benefit of iterating is to fill that +// freelist up. Returns true if parent reset call should continue. +func (n *node) reset(c *copyOnWriteContext) bool { + for _, child := range n.children { + if !child.reset(c) { + return false + } + } + return c.freeNode(n) != ftFreelistFull +} + +// Int implements the Item interface for integers. +type Int int + +// Less returns true if int(a) < int(b). 
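// Editor's illustrative sketch, not part of the vendored patch: exercising the
// btree API documented above (New, ReplaceOrInsert, Has, AscendRange, Clone,
// Delete, Len) with the package's Int helper type (declared above, its Less
// method just below). A minimal, self-contained example under those assumptions:
//
//	package main
//
//	import (
//		"fmt"
//
//		"github.com/google/btree"
//	)
//
//	func main() {
//		tr := btree.New(32) // degree 32: nodes hold up to 63 items
//		for i := 0; i < 10; i++ {
//			tr.ReplaceOrInsert(btree.Int(i)) // returns the displaced item, if any
//		}
//		fmt.Println(tr.Has(btree.Int(3))) // true
//
//		// AscendRange is half-open: [3, 7) visits 3, 4, 5, 6 in order.
//		tr.AscendRange(btree.Int(3), btree.Int(7), func(i btree.Item) bool {
//			fmt.Println(i)
//			return true // returning false stops the iteration early
//		})
//
//		// Clone is a lazy copy-on-write snapshot; mutating the clone
//		// leaves the original untouched.
//		clone := tr.Clone()
//		clone.Delete(btree.Int(3))
//		fmt.Println(tr.Len(), clone.Len()) // 10 9
//	}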
+func (a Int) Less(b Item) bool { + return a < b.(Int) +} From 099cf66cc61c76295952ee6a5f4412549f938b16 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 26 Aug 2019 12:17:44 +0200 Subject: [PATCH 3/6] bump hashicorp/go-msgpack v0.5.5 full diff: https://github.com/hashicorp/go-msgpack/compare/71c2886f5a673a35f909803f38ece5810165097b...v0.5.5 - hashicorp/go-msgpack#3 Add go.mod - hashicorp/go-msgpack#7 Do not attempt to set unsettable types - hashicorp/go-msgpack#8 codec: do not dereference pointers/interfaces for omitempty support - backport of https://github.com/hashicorp/go-msgpack/commit/006e1534301cb75b848ee452ab5d3ba8c6a70784 - fixes https://github.com/ugorji/go/issues/67 "omitempty" fails on pointers to bools Signed-off-by: Sebastiaan van Stijn --- vendor.conf | 2 +- .../github.com/hashicorp/go-msgpack/codec/decode.go | 2 +- .../github.com/hashicorp/go-msgpack/codec/helper.go | 7 +++++++ .../hashicorp/go-msgpack/codec/helper_internal.go | 13 +++++++++---- vendor/github.com/hashicorp/go-msgpack/go.mod | 1 + 5 files changed, 19 insertions(+), 6 deletions(-) create mode 100644 vendor/github.com/hashicorp/go-msgpack/go.mod diff --git a/vendor.conf b/vendor.conf index 35bea74196..758ec50e4f 100644 --- a/vendor.conf +++ b/vendor.conf @@ -23,7 +23,7 @@ github.com/gorilla/mux 98cb6bf42e086f6af920b965c38c github.com/google/btree 4030bb1f1f0c35b30ca7009e9ebd06849dd45306 # v1.0.0 github.com/hashicorp/consul 9a9cc9341bb487651a0399e3fc5e1e8a42e62dd9 # v0.5.2 github.com/hashicorp/errwrap 8a6fb523712970c966eefc6b39ed2c5e74880354 # v1.0.0 -github.com/hashicorp/go-msgpack 71c2886f5a673a35f909803f38ece5810165097b +github.com/hashicorp/go-msgpack ad60660ecf9c5a1eae0ca32182ed72bab5807961 # v0.5.5 github.com/hashicorp/go-multierror 886a7fbe3eb1c874d46f623bfa70af45f425b3d1 # v1.0.0 github.com/hashicorp/memberlist e1138a6a4d8a6eaec6c919aeae5efbe4d69b1ece # v0.1.4 github.com/hashicorp/golang-lru 7f827b33c0f158ec5dfbba01bb0b14a4541fd81d # v0.5.3 diff --git a/vendor/github.com/hashicorp/go-msgpack/codec/decode.go b/vendor/github.com/hashicorp/go-msgpack/codec/decode.go index 87bef2b935..851b54ac7e 100644 --- a/vendor/github.com/hashicorp/go-msgpack/codec/decode.go +++ b/vendor/github.com/hashicorp/go-msgpack/codec/decode.go @@ -527,7 +527,7 @@ func (f *decFnInfo) kMap(rv reflect.Value) { } } rvv := rv.MapIndex(rvk) - if !rvv.IsValid() { + if !rvv.IsValid() || !rvv.CanSet() { rvv = reflect.New(vtype).Elem() } diff --git a/vendor/github.com/hashicorp/go-msgpack/codec/helper.go b/vendor/github.com/hashicorp/go-msgpack/codec/helper.go index e6dc0563f0..7da3955edc 100644 --- a/vendor/github.com/hashicorp/go-msgpack/codec/helper.go +++ b/vendor/github.com/hashicorp/go-msgpack/codec/helper.go @@ -45,6 +45,13 @@ const ( // for debugging, set this to false, to catch panic traces. // Note that this will always cause rpc tests to fail, since they need io.EOF sent via panic. recoverPanicToErr = true + + // if checkStructForEmptyValue, check struct fields to see if they hold an empty value. + // This could be an expensive call, so possibly disable it.
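// Editor's illustrative sketch, not part of the vendored patch: the observable
// effect of hashicorp/go-msgpack#8. With derefForIsEmptyValue left false (see
// the constants here), an omitempty pointer field is judged by the pointer
// itself rather than the value it points to, so a non-nil pointer to false is
// still encoded (the failure mode of ugorji/go#67). The `codec` struct tag is
// the package's own convention; the type and field names are invented for
// illustration.
//
//	type Flags struct {
//		Enabled *bool `codec:"enabled,omitempty"`
//	}
//
//	f := false
//	var buf bytes.Buffer
//	enc := codec.NewEncoder(&buf, &codec.MsgpackHandle{})
//	_ = enc.Encode(Flags{Enabled: &f}) // "enabled" key is kept: the pointer is non-nil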
+ checkStructForEmptyValue = false + + // if derefForIsEmptyValue, deref pointers and interfaces when checking isEmptyValue + derefForIsEmptyValue = false ) type charEncoding uint8 diff --git a/vendor/github.com/hashicorp/go-msgpack/codec/helper_internal.go b/vendor/github.com/hashicorp/go-msgpack/codec/helper_internal.go index 58417da958..93f12854f2 100644 --- a/vendor/github.com/hashicorp/go-msgpack/codec/helper_internal.go +++ b/vendor/github.com/hashicorp/go-msgpack/codec/helper_internal.go @@ -33,8 +33,10 @@ func panicValToErr(panicVal interface{}, err *error) { return } -func isEmptyValueDeref(v reflect.Value, deref bool) bool { +func hIsEmptyValue(v reflect.Value, deref, checkStruct bool) bool { switch v.Kind() { + case reflect.Invalid: + return true case reflect.Array, reflect.Map, reflect.Slice, reflect.String: return v.Len() == 0 case reflect.Bool: @@ -50,18 +52,21 @@ func isEmptyValueDeref(v reflect.Value, deref bool) bool { if v.IsNil() { return true } - return isEmptyValueDeref(v.Elem(), deref) + return hIsEmptyValue(v.Elem(), deref, checkStruct) } else { return v.IsNil() } case reflect.Struct: + if !checkStruct { + return false + } // return true if all fields are empty. else return false. // we cannot use equality check, because some fields may be maps/slices/etc // and consequently the structs are not comparable. // return v.Interface() == reflect.Zero(v.Type()).Interface() for i, n := 0, v.NumField(); i < n; i++ { - if !isEmptyValueDeref(v.Field(i), deref) { + if !hIsEmptyValue(v.Field(i), deref, checkStruct) { return false } } @@ -71,7 +76,7 @@ func isEmptyValueDeref(v reflect.Value, deref bool) bool { } func isEmptyValue(v reflect.Value) bool { - return isEmptyValueDeref(v, true) + return hIsEmptyValue(v, derefForIsEmptyValue, checkStructForEmptyValue) } func debugf(format string, args ...interface{}) { diff --git a/vendor/github.com/hashicorp/go-msgpack/go.mod b/vendor/github.com/hashicorp/go-msgpack/go.mod new file mode 100644 index 0000000000..2c92e7fd22 --- /dev/null +++ b/vendor/github.com/hashicorp/go-msgpack/go.mod @@ -0,0 +1 @@ +module github.com/hashicorp/go-msgpack From bb2364fb9e52bb55ff5f810567edfdefd34aeb1e Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 26 Aug 2019 12:30:12 +0200 Subject: [PATCH 4/6] bump hashicorp/serf v0.8.3 Changelog: https://github.com/hashicorp/serf/blob/v0.8.3/CHANGELOG.md full diff https://github.com/hashicorp/serf/compare/598c54895cc5a7b1a24a398d635e8c0ea0959870...v0.8.3 We were previously 47 commits ahead of v0.7.0 (v0.7.0-47-g598c548), 60 commits behind v0.8.0 - https://github.com/hashicorp/serf/compare/v0.7.0...598c54895cc5a7b1a24a398d635e8c0ea0959870 - https://github.com/hashicorp/serf/compare/598c54895cc5a7b1a24a398d635e8c0ea0959870...v0.8.0 Signed-off-by: Sebastiaan van Stijn --- vendor.conf | 2 +- vendor/github.com/hashicorp/serf/README.md | 27 +- .../hashicorp/serf/coordinate/client.go | 69 +++- .../hashicorp/serf/coordinate/config.go | 2 +- .../hashicorp/serf/coordinate/coordinate.go | 30 +- vendor/github.com/hashicorp/serf/go.mod | 17 + .../hashicorp/serf/serf/broadcast.go | 3 + .../github.com/hashicorp/serf/serf/config.go | 56 ++- .../hashicorp/serf/serf/delegate.go | 36 +- .../github.com/hashicorp/serf/serf/event.go | 88 +++-- .../hashicorp/serf/serf/internal_query.go | 81 +++- .../hashicorp/serf/serf/keymanager.go | 41 +- .../hashicorp/serf/serf/messages.go | 46 ++- .../hashicorp/serf/serf/ping_delegate.go | 47 +-- .../github.com/hashicorp/serf/serf/query.go | 111 +++++- 
vendor/github.com/hashicorp/serf/serf/serf.go | 351 ++++++++++++------ .../hashicorp/serf/serf/snapshot.go | 273 +++++++++----- 17 files changed, 938 insertions(+), 342 deletions(-) create mode 100644 vendor/github.com/hashicorp/serf/go.mod diff --git a/vendor.conf b/vendor.conf index 758ec50e4f..c644cf49cd 100644 --- a/vendor.conf +++ b/vendor.conf @@ -29,7 +29,7 @@ github.com/hashicorp/memberlist e1138a6a4d8a6eaec6c919aeae5e github.com/hashicorp/golang-lru 7f827b33c0f158ec5dfbba01bb0b14a4541fd81d # v0.5.3 github.com/sean-/seed e2103e2c35297fb7e17febb81e49b312087a2372 github.com/hashicorp/go-sockaddr c7188e74f6acae5a989bdc959aa779f8b9f42faf # v1.0.2 -github.com/hashicorp/serf 598c54895cc5a7b1a24a398d635e8c0ea0959870 +github.com/hashicorp/serf 15cfd05de3dffb3664aa37b06e91f970b825e380 # v0.8.3 github.com/miekg/dns 6c0c4e6581f8e173cc562c8b3363ab984e4ae071 # v1.1.27 github.com/opencontainers/runtime-spec 4d89ac9fbff6c455f46a5bb59c6b1bb7184a5e43 # v1.0.3-0.20200728170252-4d89ac9fbff6 github.com/samuel/go-zookeeper d0e0d8e11f318e000a8cc434616d69e329edc374 diff --git a/vendor/github.com/hashicorp/serf/README.md b/vendor/github.com/hashicorp/serf/README.md index ad8210d432..36e5b77feb 100644 --- a/vendor/github.com/hashicorp/serf/README.md +++ b/vendor/github.com/hashicorp/serf/README.md @@ -1,7 +1,7 @@ -# Serf +# Serf [![Build Status](https://travis-ci.org/hashicorp/serf.png)](https://travis-ci.org/hashicorp/serf) [![Join the chat at https://gitter.im/hashicorp-serf/Lobby](https://badges.gitter.im/hashicorp-serf/Lobby.svg)](https://gitter.im/hashicorp-serf/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -* Website: https://www.serfdom.io -* IRC: `#serfdom` on Freenode +* Website: https://www.serf.io +* Chat: [Gitter](https://gitter.im/hashicorp-serf/Lobby) * Mailing list: [Google Groups](https://groups.google.com/group/serfdom/) Serf is a decentralized solution for service discovery and orchestration @@ -28,8 +28,9 @@ Here are some example use cases of Serf, though there are many others: ## Quick Start -First, [download a pre-built Serf binary](https://www.serfdom.io/downloads.html) -for your operating system or [compile Serf yourself](#developing-serf). +First, [download a pre-built Serf binary](https://www.serf.io/downloads.html) +for your operating system, [compile Serf yourself](#developing-serf), or install +using `go get -u github.com/hashicorp/serf/cmd/serf`. Next, let's start a couple Serf agents. Agents run until they're told to quit and handle the communication of maintenance tasks of Serf. In a real Serf @@ -87,12 +88,12 @@ cluster of the node failure. Full, comprehensive documentation is viewable on the Serf website: -https://www.serfdom.io/docs +https://www.serf.io/docs ## Developing Serf If you wish to work on Serf itself, you'll first need [Go](https://golang.org) -installed (version 1.2+ is _required_). Make sure you have Go properly +installed (version 1.8+ is _required_). Make sure you have Go properly [installed](https://golang.org/doc/install), including setting up your [GOPATH](https://golang.org/doc/code.html#GOPATH). @@ -106,9 +107,15 @@ $ bin/serf ... ``` -*note: `make` will also place a copy of the executable under $GOPATH/bin* +*NOTE: `make` will also place a copy of the executable under `$GOPATH/bin/`* -You can run tests by typing `make test`. +Serf is first and foremost a library with a command-line interface, `serf`. The +Serf library is independent of the command line agent, `serf`. 
The `serf` +binary is located under `cmd/serf` and can be installed stand alone by issuing +the command `go get -u github.com/hashicorp/serf/cmd/serf`. Applications using +the Serf library should only need to include `github.com/hashicorp/serf`. + +Tests can be run by typing `make test`. If you make any changes to the code, run `make format` in order to automatically -format the code according to Go standards. +format the code according to Go [standards](https://golang.org/doc/effective_go.html#formatting). diff --git a/vendor/github.com/hashicorp/serf/coordinate/client.go b/vendor/github.com/hashicorp/serf/coordinate/client.go index 613bfff89e..3582ee4dae 100644 --- a/vendor/github.com/hashicorp/serf/coordinate/client.go +++ b/vendor/github.com/hashicorp/serf/coordinate/client.go @@ -6,6 +6,8 @@ import ( "sort" "sync" "time" + + "github.com/armon/go-metrics" ) // Client manages the estimated network coordinate for a given node, and adjusts @@ -34,10 +36,20 @@ type Client struct { // value to determine how many samples we keep, per node. latencyFilterSamples map[string][]float64 + // stats is used to record events that occur when updating coordinates. + stats ClientStats + // mutex enables safe concurrent access to the client. mutex sync.RWMutex } +// ClientStats is used to record events that occur when updating coordinates. +type ClientStats struct { + // Resets is incremented any time we reset our local coordinate because + // our calculations have resulted in an invalid state. + Resets int +} + // NewClient creates a new Client and verifies the configuration is valid. func NewClient(config *Config) (*Client, error) { if !(config.Dimensionality > 0) { @@ -63,11 +75,16 @@ func (c *Client) GetCoordinate() *Coordinate { } // SetCoordinate forces the client's coordinate to a known state. -func (c *Client) SetCoordinate(coord *Coordinate) { +func (c *Client) SetCoordinate(coord *Coordinate) error { c.mutex.Lock() defer c.mutex.Unlock() + if err := c.checkCoordinate(coord); err != nil { + return err + } + c.coord = coord.Clone() + return nil } // ForgetNode removes any client state for the given node. @@ -78,6 +95,29 @@ func (c *Client) ForgetNode(node string) { delete(c.latencyFilterSamples, node) } +// Stats returns a copy of stats for the client. +func (c *Client) Stats() ClientStats { + c.mutex.Lock() + defer c.mutex.Unlock() + + return c.stats +} + +// checkCoordinate returns an error if the coordinate isn't compatible with +// this client, or if the coordinate itself isn't valid. This assumes the mutex +// has been locked already. +func (c *Client) checkCoordinate(coord *Coordinate) error { + if !c.coord.IsCompatibleWith(coord) { + return fmt.Errorf("dimensions aren't compatible") + } + + if !coord.IsValid() { + return fmt.Errorf("coordinate is invalid") + } + + return nil +} + // latencyFilter applies a simple moving median filter with a new sample for // a node. This assumes that the mutex has been locked already. func (c *Client) latencyFilter(node string, rttSeconds float64) float64 { @@ -159,15 +199,38 @@ func (c *Client) updateGravity() { // Update takes other, a coordinate for another node, and rtt, a round trip // time observation for a ping to that node, and updates the estimated position of // the client's coordinate. Returns the updated coordinate. 
-func (c *Client) Update(node string, other *Coordinate, rtt time.Duration) *Coordinate { +func (c *Client) Update(node string, other *Coordinate, rtt time.Duration) (*Coordinate, error) { c.mutex.Lock() defer c.mutex.Unlock() + if err := c.checkCoordinate(other); err != nil { + return nil, err + } + + // The code down below can handle zero RTTs, which we have seen in + // https://github.com/hashicorp/consul/issues/3789, presumably in + // environments with coarse-grained monotonic clocks (we are still + // trying to pin this down). In any event, this is ok from a code PoV + // so we don't need to alert operators with spammy messages. We did + // add a counter so this is still observable, though. + const maxRTT = 10 * time.Second + if rtt < 0 || rtt > maxRTT { + return nil, fmt.Errorf("round trip time not in valid range, duration %v is not a positive value less than %v ", rtt, maxRTT) + } + if rtt == 0 { + metrics.IncrCounter([]string{"serf", "coordinate", "zero-rtt"}, 1) + } + rttSeconds := c.latencyFilter(node, rtt.Seconds()) c.updateVivaldi(other, rttSeconds) c.updateAdjustment(other, rttSeconds) c.updateGravity() - return c.coord.Clone() + if !c.coord.IsValid() { + c.stats.Resets++ + c.coord = NewCoordinate(c.config) + } + + return c.coord.Clone(), nil } // DistanceTo returns the estimated RTT from the client's coordinate to other, the diff --git a/vendor/github.com/hashicorp/serf/coordinate/config.go b/vendor/github.com/hashicorp/serf/coordinate/config.go index a5b3aadfe4..b85a8ab7b0 100644 --- a/vendor/github.com/hashicorp/serf/coordinate/config.go +++ b/vendor/github.com/hashicorp/serf/coordinate/config.go @@ -16,7 +16,7 @@ package coordinate type Config struct { // The dimensionality of the coordinate system. As discussed in [2], more // dimensions improves the accuracy of the estimates up to a point. Per [2] - // we chose 4 dimensions plus a non-Euclidean height. + // we chose 8 dimensions plus a non-Euclidean height. Dimensionality uint // VivaldiErrorMax is the default error value when a node hasn't yet made diff --git a/vendor/github.com/hashicorp/serf/coordinate/coordinate.go b/vendor/github.com/hashicorp/serf/coordinate/coordinate.go index c9194e048b..fbe792c90d 100644 --- a/vendor/github.com/hashicorp/serf/coordinate/coordinate.go +++ b/vendor/github.com/hashicorp/serf/coordinate/coordinate.go @@ -72,6 +72,26 @@ func (c *Coordinate) Clone() *Coordinate { } } +// componentIsValid returns false if a floating point value is a NaN or an +// infinity. +func componentIsValid(f float64) bool { + return !math.IsInf(f, 0) && !math.IsNaN(f) +} + +// IsValid returns false if any component of a coordinate isn't valid, per the +// componentIsValid() helper above. +func (c *Coordinate) IsValid() bool { + for i := range c.Vec { + if !componentIsValid(c.Vec[i]) { + return false + } + } + + return componentIsValid(c.Error) && + componentIsValid(c.Adjustment) && + componentIsValid(c.Height) +} + // IsCompatibleWith checks to see if the two coordinates are compatible // dimensionally. If this returns true then you are guaranteed to not get // any runtime errors operating on them. @@ -122,7 +142,7 @@ func (c *Coordinate) rawDistanceTo(other *Coordinate) float64 { // already been checked to be compatible. 
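// Editor's illustrative sketch, not part of the vendored patch: handling the
// new (coordinate, error) return of Client.Update, which now rejects
// incompatible or invalid coordinates and negative or implausibly large RTTs
// instead of silently applying them. Names follow this diff; `other` and `rtt`
// stand in for a peer's coordinate and a measured round trip time.
//
//	client, err := coordinate.NewClient(coordinate.DefaultConfig())
//	if err != nil {
//		log.Fatal(err)
//	}
//	if _, err := client.Update("peer-node", other, rtt); err != nil {
//		// e.g. mismatched dimensions, NaN/Inf components, rtt < 0 or > 10s
//		log.Printf("coordinate update rejected: %v", err)
//	} else {
//		log.Printf("estimated RTT to peer: %v", client.DistanceTo(other))
//	}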
func add(vec1 []float64, vec2 []float64) []float64 { ret := make([]float64, len(vec1)) - for i, _ := range ret { + for i := range ret { ret[i] = vec1[i] + vec2[i] } return ret @@ -132,7 +152,7 @@ func add(vec1 []float64, vec2 []float64) []float64 { // dimensions have already been checked to be compatible. func diff(vec1 []float64, vec2 []float64) []float64 { ret := make([]float64, len(vec1)) - for i, _ := range ret { + for i := range ret { ret[i] = vec1[i] - vec2[i] } return ret @@ -141,7 +161,7 @@ func diff(vec1 []float64, vec2 []float64) []float64 { // mul returns vec multiplied by a scalar factor. func mul(vec []float64, factor float64) []float64 { ret := make([]float64, len(vec)) - for i, _ := range vec { + for i := range vec { ret[i] = vec[i] * factor } return ret @@ -150,7 +170,7 @@ func mul(vec []float64, factor float64) []float64 { // magnitude computes the magnitude of the vec. func magnitude(vec []float64) float64 { sum := 0.0 - for i, _ := range vec { + for i := range vec { sum += vec[i] * vec[i] } return math.Sqrt(sum) @@ -168,7 +188,7 @@ func unitVectorAt(vec1 []float64, vec2 []float64) ([]float64, float64) { } // Otherwise, just return a random unit vector. - for i, _ := range ret { + for i := range ret { ret[i] = rand.Float64() - 0.5 } if mag := magnitude(ret); mag > zeroThreshold { diff --git a/vendor/github.com/hashicorp/serf/go.mod b/vendor/github.com/hashicorp/serf/go.mod new file mode 100644 index 0000000000..8ca7a1153c --- /dev/null +++ b/vendor/github.com/hashicorp/serf/go.mod @@ -0,0 +1,17 @@ +module github.com/hashicorp/serf + +require ( + github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e + github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da + github.com/hashicorp/go-msgpack v0.5.3 + github.com/hashicorp/go-syslog v1.0.0 + github.com/hashicorp/go-uuid v1.0.1 // indirect + github.com/hashicorp/logutils v1.0.0 + github.com/hashicorp/mdns v1.0.0 + github.com/hashicorp/memberlist v0.1.3 + github.com/mitchellh/cli v1.0.0 + github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee + github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f + github.com/stretchr/testify v1.3.0 // indirect + golang.org/x/net v0.0.0-20181201002055-351d144fa1fc // indirect +) diff --git a/vendor/github.com/hashicorp/serf/serf/broadcast.go b/vendor/github.com/hashicorp/serf/serf/broadcast.go index d20728f3f4..751cf184b2 100644 --- a/vendor/github.com/hashicorp/serf/serf/broadcast.go +++ b/vendor/github.com/hashicorp/serf/serf/broadcast.go @@ -16,6 +16,9 @@ func (b *broadcast) Invalidates(other memberlist.Broadcast) bool { return false } +// implements memberlist.UniqueBroadcast +func (b *broadcast) UniqueBroadcast() {} + func (b *broadcast) Message() []byte { return b.msg } diff --git a/vendor/github.com/hashicorp/serf/serf/config.go b/vendor/github.com/hashicorp/serf/serf/config.go index 87cba9f7ca..0de4247c5b 100644 --- a/vendor/github.com/hashicorp/serf/serf/config.go +++ b/vendor/github.com/hashicorp/serf/serf/config.go @@ -2,6 +2,7 @@ package serf import ( "io" + "log" "os" "time" @@ -15,6 +16,7 @@ var ProtocolVersionMap map[uint8]uint8 func init() { ProtocolVersionMap = map[uint8]uint8{ + 5: 2, 4: 2, 3: 2, 2: 2, @@ -53,6 +55,13 @@ type Config struct { // set, a timeout of 5 seconds will be set. BroadcastTimeout time.Duration + // LeavePropagateDelay is for our leave (node dead) message to propagate + // through the cluster. 
In particular, we want to stay up long enough to + // service any probes from other nodes before they learn about us + // leaving and stop probing. Otherwise, we risk getting node failures as + // we leave. + LeavePropagateDelay time.Duration + // The settings below relate to Serf's event coalescence feature. Serf // is able to coalesce multiple events into single events in order to // reduce the amount of noise that is sent along the EventCh. For example @@ -103,6 +112,17 @@ type Config struct { ReconnectTimeout time.Duration TombstoneTimeout time.Duration + // FlapTimeout is the amount of time less than which we consider a node + // being failed and rejoining looks like a flap for telemetry purposes. + // This should be set less than a typical reboot time, but large enough + // to see actual events, given our expected detection times for a failed + // node. + FlapTimeout time.Duration + + // QueueCheckInterval is the interval at which we check the message + // queue to apply the warning and max depth. + QueueCheckInterval time.Duration + // QueueDepthWarning is used to generate warning message if the // number of queued messages to broadcast exceeds this number. This // is to provide the user feedback if events are being triggered @@ -114,12 +134,18 @@ type Config struct { // prevent an unbounded growth of memory utilization MaxQueueDepth int - // RecentIntentBuffer is used to set the size of recent join and leave intent - // messages that will be buffered. This is used to guard against - // the case where Serf broadcasts an intent that arrives before the - // Memberlist event. It is important that this not be too small to avoid - // continuous rebroadcasting of dead events. - RecentIntentBuffer int + // MinQueueDepth, if >0 will enforce a lower limit for dropping messages + // and then the max will be max(MinQueueDepth, 2*SizeOfCluster). This + // defaults to 0 which disables this dynamic sizing feature. If this is + // >0 then MaxQueueDepth will be ignored. + MinQueueDepth int + + // RecentIntentTimeout is used to determine how long we store recent + // join and leave intents. This is used to guard against the case where + // Serf broadcasts an intent that arrives before the Memberlist event. + // It is important that this not be too short to avoid continuous + // rebroadcasting of dead events. + RecentIntentTimeout time.Duration // EventBuffer is used to control how many events are buffered. // This is used to prevent re-delivery of events to a client. The buffer @@ -175,6 +201,12 @@ type Config struct { // logs will go to stderr. LogOutput io.Writer + // Logger is a custom logger which you provide. If Logger is set, it will use + // this for the internal logger. If Logger is not set, it will fall back to the + // behavior for using LogOutput. You cannot specify both LogOutput and Logger + // at the same time. + Logger *log.Logger + // SnapshotPath if provided is used to snapshot live nodes as well // as lamport clock values. When Serf is started with a snapshot, // it will attempt to join all the previously known nodes until one @@ -210,6 +242,10 @@ type Config struct { // Merge can be optionally provided to intercept a cluster merge // and conditionally abort the merge. Merge MergeDelegate + + // UserEventSizeLimit is maximum byte size limit of user event `name` + `payload` in bytes. + // It's optimal to be relatively small, since it's going to be gossiped through the cluster. 
+ UserEventSizeLimit int } // Init allocates the subdata structures @@ -230,22 +266,26 @@ func DefaultConfig() *Config { return &Config{ NodeName: hostname, BroadcastTimeout: 5 * time.Second, + LeavePropagateDelay: 1 * time.Second, EventBuffer: 512, QueryBuffer: 512, LogOutput: os.Stderr, - ProtocolVersion: ProtocolVersionMax, + ProtocolVersion: 4, ReapInterval: 15 * time.Second, - RecentIntentBuffer: 128, + RecentIntentTimeout: 5 * time.Minute, ReconnectInterval: 30 * time.Second, ReconnectTimeout: 24 * time.Hour, + QueueCheckInterval: 30 * time.Second, QueueDepthWarning: 128, MaxQueueDepth: 4096, TombstoneTimeout: 24 * time.Hour, + FlapTimeout: 60 * time.Second, MemberlistConfig: memberlist.DefaultLANConfig(), QueryTimeoutMult: 16, QueryResponseSizeLimit: 1024, QuerySizeLimit: 1024, EnableNameConflictResolution: true, DisableCoordinates: false, + UserEventSizeLimit: 512, } } diff --git a/vendor/github.com/hashicorp/serf/serf/delegate.go b/vendor/github.com/hashicorp/serf/serf/delegate.go index d19ca3090f..567c7fe4ab 100644 --- a/vendor/github.com/hashicorp/serf/serf/delegate.go +++ b/vendor/github.com/hashicorp/serf/serf/delegate.go @@ -1,9 +1,12 @@ package serf import ( + "bytes" "fmt" "github.com/armon/go-metrics" + "github.com/hashicorp/go-msgpack/codec" + "github.com/hashicorp/memberlist" ) // delegate is the memberlist.Delegate implementation that Serf uses. @@ -11,6 +14,8 @@ type delegate struct { serf *Serf } +var _ memberlist.Delegate = &delegate{} + func (d *delegate) NodeMeta(limit int) []byte { roleBytes := d.serf.encodeTags(d.serf.config.Tags) if len(roleBytes) > limit { @@ -83,6 +88,25 @@ func (d *delegate) NotifyMsg(buf []byte) { d.serf.logger.Printf("[DEBUG] serf: messageQueryResponseType: %v", resp.From) d.serf.handleQueryResponse(&resp) + case messageRelayType: + var header relayHeader + var handle codec.MsgpackHandle + reader := bytes.NewReader(buf[1:]) + decoder := codec.NewDecoder(reader, &handle) + if err := decoder.Decode(&header); err != nil { + d.serf.logger.Printf("[ERR] serf: Error decoding relay header: %s", err) + break + } + + // The remaining contents are the message itself, so forward that + raw := make([]byte, reader.Len()) + reader.Read(raw) + d.serf.logger.Printf("[DEBUG] serf: Relaying response to addr: %s", header.DestAddr.String()) + if err := d.serf.memberlist.SendTo(&header.DestAddr, raw); err != nil { + d.serf.logger.Printf("[ERR] serf: Error forwarding message to %s: %s", header.DestAddr.String(), err) + break + } + default: d.serf.logger.Printf("[WARN] serf: Received message of unknown type: %d", t) } @@ -202,13 +226,16 @@ func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) { d.serf.queryClock.Witness(pp.QueryLTime - 1) } - // Process the left nodes first to avoid the LTimes from being increment - // in the wrong order + // Process the left nodes first to avoid the LTimes from incrementing + // in the wrong order. Note that we don't have the actual Lamport time + // for the leave message, so we go one past the join time, since the + // leave must have been accepted after that to get onto the left members + // list. If we didn't do this then the message would not get processed. 
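// Editor's illustrative sketch, not part of the vendored patch: tuning the
// configuration knobs added in this bump, starting from the defaults shown
// above. Field names match this diff; the values are illustrative only.
//
//	cfg := serf.DefaultConfig()
//	cfg.NodeName = "node-1"
//	cfg.LeavePropagateDelay = 3 * time.Second // linger so peers observe our leave
//	cfg.QueueCheckInterval = 10 * time.Second // enforce queue limits more often
//	cfg.MinQueueDepth = 1024                  // dynamic cap: max(1024, 2*cluster size)
//	cfg.UserEventSizeLimit = 256              // cap user event name+payload bytes
//
//	s, err := serf.Create(cfg)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer s.Shutdown()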
leftMap := make(map[string]struct{}, len(pp.LeftMembers)) leave := messageLeave{} for _, name := range pp.LeftMembers { leftMap[name] = struct{}{} - leave.LTime = pp.StatusLTimes[name] + leave.LTime = pp.StatusLTimes[name] + 1 leave.Node = name d.serf.handleNodeLeaveIntent(&leave) } @@ -230,7 +257,8 @@ func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) { // If we are doing a join, and eventJoinIgnore is set // then we set the eventMinTime to the EventLTime. This // prevents any of the incoming events from being processed - if isJoin && d.serf.eventJoinIgnore { + eventJoinIgnore := d.serf.eventJoinIgnore.Load().(bool) + if isJoin && eventJoinIgnore { d.serf.eventLock.Lock() if pp.EventLTime > d.serf.eventMinTime { d.serf.eventMinTime = pp.EventLTime diff --git a/vendor/github.com/hashicorp/serf/serf/event.go b/vendor/github.com/hashicorp/serf/serf/event.go index 8337e95ead..859a09e56e 100644 --- a/vendor/github.com/hashicorp/serf/serf/event.go +++ b/vendor/github.com/hashicorp/serf/serf/event.go @@ -95,18 +95,19 @@ func (u UserEvent) String() string { return fmt.Sprintf("user-event: %s", u.Name) } -// Query is the struct used EventQuery type events +// Query is the struct used by EventQuery type events type Query struct { LTime LamportTime Name string Payload []byte - serf *Serf - id uint32 // ID is not exported, since it may change - addr []byte // Address to respond to - port uint16 // Port to respond to - deadline time.Time // Must respond by this deadline - respLock sync.Mutex + serf *Serf + id uint32 // ID is not exported, since it may change + addr []byte // Address to respond to + port uint16 // Port to respond to + deadline time.Time // Must respond by this deadline + relayFactor uint8 // Number of duplicate responses to relay back to sender + respLock sync.Mutex } func (q *Query) EventType() EventType { @@ -122,47 +123,74 @@ func (q *Query) Deadline() time.Time { return q.deadline } -// Respond is used to send a response to the user query -func (q *Query) Respond(buf []byte) error { +func (q *Query) createResponse(buf []byte) messageQueryResponse { + // Create response + return messageQueryResponse{ + LTime: q.LTime, + ID: q.id, + From: q.serf.config.NodeName, + Payload: buf, + } +} + +// Check response size +func (q *Query) checkResponseSize(resp []byte) error { + if len(resp) > q.serf.config.QueryResponseSizeLimit { + return fmt.Errorf("response exceeds limit of %d bytes", q.serf.config.QueryResponseSizeLimit) + } + return nil +} + +func (q *Query) respondWithMessageAndResponse(raw []byte, resp messageQueryResponse) error { + // Check the size limit + if err := q.checkResponseSize(raw); err != nil { + return err + } + q.respLock.Lock() defer q.respLock.Unlock() // Check if we've already responded if q.deadline.IsZero() { - return fmt.Errorf("Response already sent") + return fmt.Errorf("response already sent") } // Ensure we aren't past our response deadline if time.Now().After(q.deadline) { - return fmt.Errorf("Response is past the deadline") + return fmt.Errorf("response is past the deadline") } - // Create response - resp := messageQueryResponse{ - LTime: q.LTime, - ID: q.id, - From: q.serf.config.NodeName, - Payload: buf, + // Send the response directly to the originator + addr := net.UDPAddr{IP: q.addr, Port: int(q.port)} + if err := q.serf.memberlist.SendTo(&addr, raw); err != nil { + return err } - // Format the response - raw, err := encodeMessage(messageQueryResponseType, &resp) - if err != nil { - return fmt.Errorf("Failed to format response: %v", err) + // 
Relay the response through up to relayFactor other nodes + if err := q.serf.relayResponse(q.relayFactor, addr, &resp); err != nil { + return err } - // Check the size limit - if len(raw) > q.serf.config.QueryResponseSizeLimit { - return fmt.Errorf("response exceeds limit of %d bytes", q.serf.config.QueryResponseSizeLimit) + // Clear the deadline, responses sent + q.deadline = time.Time{} + + return nil +} + +// Respond is used to send a response to the user query +func (q *Query) Respond(buf []byte) error { + // Create response + resp := q.createResponse(buf) + + // Encode response + raw, err := encodeMessage(messageQueryResponseType, resp) + if err != nil { + return fmt.Errorf("failed to format response: %v", err) } - // Send the response - addr := net.UDPAddr{IP: q.addr, Port: int(q.port)} - if err := q.serf.memberlist.SendTo(&addr, raw); err != nil { - return err + if err := q.respondWithMessageAndResponse(raw, resp); err != nil { + return fmt.Errorf("failed to respond to key query: %v", err) } - // Clera the deadline, response sent - q.deadline = time.Time{} return nil } diff --git a/vendor/github.com/hashicorp/serf/serf/internal_query.go b/vendor/github.com/hashicorp/serf/serf/internal_query.go index 128b2cf214..a74ebf705b 100644 --- a/vendor/github.com/hashicorp/serf/serf/internal_query.go +++ b/vendor/github.com/hashicorp/serf/serf/internal_query.go @@ -2,6 +2,7 @@ package serf import ( "encoding/base64" + "fmt" "log" "strings" ) @@ -28,6 +29,13 @@ const ( // listKeysQuery is used to list all known keys in the cluster listKeysQuery = "list-keys" + + // minEncodedKeyLength is used to compute the max number of keys in a list key + // response. eg 1024/25 = 40. a message with max size of 1024 bytes cannot + // contain more than 40 keys. There is a test + // (TestSerfQueries_estimateMaxKeysInListKeyResponse) which does the + // computation and in case of changes, the value can be adjusted. + minEncodedKeyLength = 25 ) // internalQueryName is used to generate a query name for an internal query @@ -149,17 +157,62 @@ func (s *serfQueries) handleConflict(q *Query) { } } -// sendKeyResponse handles responding to key-related queries. 
-func (s *serfQueries) sendKeyResponse(q *Query, resp *nodeKeyResponse) { - buf, err := encodeMessage(messageKeyResponseType, resp) - if err != nil { - s.logger.Printf("[ERR] serf: Failed to encode key response: %v", err) - return +func (s *serfQueries) keyListResponseWithCorrectSize(q *Query, resp *nodeKeyResponse) ([]byte, messageQueryResponse, error) { + maxListKeys := q.serf.config.QueryResponseSizeLimit / minEncodedKeyLength + actual := len(resp.Keys) + for i := maxListKeys; i >= 0; i-- { + buf, err := encodeMessage(messageKeyResponseType, resp) + if err != nil { + return nil, messageQueryResponse{}, err + } + + // Create response + qresp := q.createResponse(buf) + + // Encode response + raw, err := encodeMessage(messageQueryResponseType, qresp) + if err != nil { + return nil, messageQueryResponse{}, err + } + + // Check the size limit + if err = q.checkResponseSize(raw); err != nil { + resp.Keys = resp.Keys[0:i] + resp.Message = fmt.Sprintf("truncated key list response, showing first %d of %d keys", i, actual) + continue + } + + if actual > i { + s.logger.Printf("[WARN] serf: %s", resp.Message) + } + return raw, qresp, nil } + return nil, messageQueryResponse{}, fmt.Errorf("Failed to truncate response so that it fits into message") +} - if err := q.Respond(buf); err != nil { - s.logger.Printf("[ERR] serf: Failed to respond to key query: %v", err) - return +// sendKeyResponse handles responding to key-related queries. +func (s *serfQueries) sendKeyResponse(q *Query, resp *nodeKeyResponse) { + switch q.Name { + case internalQueryName(listKeysQuery): + raw, qresp, err := s.keyListResponseWithCorrectSize(q, resp) + if err != nil { + s.logger.Printf("[ERR] serf: %v", err) + return + } + if err := q.respondWithMessageAndResponse(raw, qresp); err != nil { + s.logger.Printf("[ERR] serf: Failed to respond to key query: %v", err) + return + } + default: + buf, err := encodeMessage(messageKeyResponseType, resp) + if err != nil { + s.logger.Printf("[ERR] serf: Failed to encode key response: %v", err) + return + } + if err := q.Respond(buf); err != nil { + s.logger.Printf("[ERR] serf: Failed to respond to key query: %v", err) + return + } } } @@ -192,10 +245,12 @@ func (s *serfQueries) handleInstallKey(q *Query) { goto SEND } - if err := s.serf.writeKeyringFile(); err != nil { - response.Message = err.Error() - s.logger.Printf("[ERR] serf: Failed to write keyring file: %s", err) - goto SEND + if s.serf.config.KeyringFile != "" { + if err := s.serf.writeKeyringFile(); err != nil { + response.Message = err.Error() + s.logger.Printf("[ERR] serf: Failed to write keyring file: %s", err) + goto SEND + } } response.Result = true diff --git a/vendor/github.com/hashicorp/serf/serf/keymanager.go b/vendor/github.com/hashicorp/serf/serf/keymanager.go index 72a319449d..11aaff2aa0 100644 --- a/vendor/github.com/hashicorp/serf/serf/keymanager.go +++ b/vendor/github.com/hashicorp/serf/serf/keymanager.go @@ -33,6 +33,13 @@ type KeyResponse struct { Keys map[string]int } +// KeyRequestOptions is used to contain optional parameters for a keyring operation +type KeyRequestOptions struct { + // RelayFactor is the number of duplicate query responses to send by relaying through + // other nodes, for redundancy + RelayFactor uint8 +} + // streamKeyResp takes care of reading responses from a channel and composing // them into a KeyResponse. It will update a KeyResponse *in place* and // therefore has nothing to return. 
@@ -61,6 +68,11 @@ func (k *KeyManager) streamKeyResp(resp *KeyResponse, ch <-chan NodeResponse) { resp.NumErr++ } + if nodeResponse.Result && len(nodeResponse.Message) > 0 { + resp.Messages[r.From] = nodeResponse.Message + k.serf.logger.Println("[WARN] serf:", nodeResponse.Message) + } + // Currently only used for key list queries, this adds keys to a counter // and increments them for each node response which contains them. for _, key := range nodeResponse.Keys { @@ -83,7 +95,7 @@ func (k *KeyManager) streamKeyResp(resp *KeyResponse, ch <-chan NodeResponse) { // handleKeyRequest performs query broadcasting to all members for any type of // key operation and manages gathering responses and packing them up into a // KeyResponse for uniform response handling. -func (k *KeyManager) handleKeyRequest(key, query string) (*KeyResponse, error) { +func (k *KeyManager) handleKeyRequest(key, query string, opts *KeyRequestOptions) (*KeyResponse, error) { resp := &KeyResponse{ Messages: make(map[string]string), Keys: make(map[string]int), @@ -103,6 +115,9 @@ func (k *KeyManager) handleKeyRequest(key, query string) (*KeyResponse, error) { } qParam := k.serf.DefaultQueryParams() + if opts != nil { + qParam.RelayFactor = opts.RelayFactor + } queryResp, err := k.serf.Query(qName, req, qParam) if err != nil { return resp, err @@ -127,30 +142,42 @@ func (k *KeyManager) handleKeyRequest(key, query string) (*KeyResponse, error) { // responses from each of them, returning a list of messages from each node // and any applicable error conditions. func (k *KeyManager) InstallKey(key string) (*KeyResponse, error) { + return k.InstallKeyWithOptions(key, nil) +} + +func (k *KeyManager) InstallKeyWithOptions(key string, opts *KeyRequestOptions) (*KeyResponse, error) { k.l.Lock() defer k.l.Unlock() - return k.handleKeyRequest(key, installKeyQuery) + return k.handleKeyRequest(key, installKeyQuery, opts) } // UseKey handles broadcasting a primary key change to all members in the // cluster, and gathering any response messages. If successful, there should // be an empty KeyResponse returned. func (k *KeyManager) UseKey(key string) (*KeyResponse, error) { + return k.UseKeyWithOptions(key, nil) +} + +func (k *KeyManager) UseKeyWithOptions(key string, opts *KeyRequestOptions) (*KeyResponse, error) { k.l.Lock() defer k.l.Unlock() - return k.handleKeyRequest(key, useKeyQuery) + return k.handleKeyRequest(key, useKeyQuery, opts) } // RemoveKey handles broadcasting a key to the cluster for removal. Each member // will receive this event, and if they have the key in their keyring, remove // it. If any errors are encountered, RemoveKey will collect and relay them. func (k *KeyManager) RemoveKey(key string) (*KeyResponse, error) { + return k.RemoveKeyWithOptions(key, nil) +} + +func (k *KeyManager) RemoveKeyWithOptions(key string, opts *KeyRequestOptions) (*KeyResponse, error) { k.l.Lock() defer k.l.Unlock() - return k.handleKeyRequest(key, removeKeyQuery) + return k.handleKeyRequest(key, removeKeyQuery, opts) } // ListKeys is used to collect installed keys from members in a Serf cluster @@ -159,8 +186,12 @@ func (k *KeyManager) RemoveKey(key string) (*KeyResponse, error) { // Since having multiple keys installed can cause performance penalties in some // cases, it's important to verify this information and remove unneeded keys. 
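// Editor's illustrative sketch, not part of the vendored patch: driving the
// new *WithOptions keyring entry points with a relay factor. Assumes an
// existing *serf.Serf instance `s`; the base64 key is a placeholder.
//
//	km := s.KeyManager()
//	opts := &serf.KeyRequestOptions{RelayFactor: 2} // send 2 redundant relayed responses
//	if _, err := km.InstallKeyWithOptions("T9jncgl9mbLus+baTTa7q7nPSUrXwbDi2dhbtqir37s=", opts); err != nil {
//		log.Fatalf("install failed: %v", err)
//	}
//	resp, err := km.ListKeysWithOptions(opts)
//	if err != nil {
//		log.Fatalf("list failed: %v", err)
//	}
//	for key, count := range resp.Keys { // Keys counts the nodes reporting each key
//		log.Printf("key %s present on %d members", key, count)
//	}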
func (k *KeyManager) ListKeys() (*KeyResponse, error) { + return k.ListKeysWithOptions(nil) +} + +func (k *KeyManager) ListKeysWithOptions(opts *KeyRequestOptions) (*KeyResponse, error) { k.l.RLock() defer k.l.RUnlock() - return k.handleKeyRequest("", listKeysQuery) + return k.handleKeyRequest("", listKeysQuery, opts) } diff --git a/vendor/github.com/hashicorp/serf/serf/messages.go b/vendor/github.com/hashicorp/serf/serf/messages.go index c90c964509..20df5b8e83 100644 --- a/vendor/github.com/hashicorp/serf/serf/messages.go +++ b/vendor/github.com/hashicorp/serf/serf/messages.go @@ -2,8 +2,10 @@ package serf import ( "bytes" - "github.com/hashicorp/go-msgpack/codec" + "net" "time" + + "github.com/hashicorp/go-msgpack/codec" ) // messageType are the types of gossip messages Serf will send along @@ -20,6 +22,7 @@ const ( messageConflictResponseType messageKeyRequestType messageKeyResponseType + messageRelayType ) const ( @@ -75,15 +78,16 @@ type messageUserEvent struct { // messageQuery is used for query events type messageQuery struct { - LTime LamportTime // Event lamport time - ID uint32 // Query ID, randomly generated - Addr []byte // Source address, used for a direct reply - Port uint16 // Source port, used for a direct reply - Filters [][]byte // Potential query filters - Flags uint32 // Used to provide various flags - Timeout time.Duration // Maximum time between delivery and response - Name string // Query name - Payload []byte // Query payload + LTime LamportTime // Event lamport time + ID uint32 // Query ID, randomly generated + Addr []byte // Source address, used for a direct reply + Port uint16 // Source port, used for a direct reply + Filters [][]byte // Potential query filters + Flags uint32 // Used to provide various flags + RelayFactor uint8 // Used to set the number of duplicate relayed responses + Timeout time.Duration // Maximum time between delivery and response + Name string // Query name + Payload []byte // Query payload } // Ack checks if the ack flag is set @@ -136,6 +140,28 @@ func encodeMessage(t messageType, msg interface{}) ([]byte, error) { return buf.Bytes(), err } +// relayHeader is used to store the end destination of a relayed message +type relayHeader struct { + DestAddr net.UDPAddr +} + +// encodeRelayMessage wraps a message in the messageRelayType, adding the length and +// address of the end recipient to the front of the message +func encodeRelayMessage(t messageType, addr net.UDPAddr, msg interface{}) ([]byte, error) { + buf := bytes.NewBuffer(nil) + handle := codec.MsgpackHandle{} + encoder := codec.NewEncoder(buf, &handle) + + buf.WriteByte(uint8(messageRelayType)) + if err := encoder.Encode(relayHeader{DestAddr: addr}); err != nil { + return nil, err + } + + buf.WriteByte(uint8(t)) + err := encoder.Encode(msg) + return buf.Bytes(), err +} + func encodeFilter(f filterType, filt interface{}) ([]byte, error) { buf := bytes.NewBuffer(nil) buf.WriteByte(uint8(f)) diff --git a/vendor/github.com/hashicorp/serf/serf/ping_delegate.go b/vendor/github.com/hashicorp/serf/serf/ping_delegate.go index a482685a20..98032c5bea 100644 --- a/vendor/github.com/hashicorp/serf/serf/ping_delegate.go +++ b/vendor/github.com/hashicorp/serf/serf/ping_delegate.go @@ -2,7 +2,6 @@ package serf import ( "bytes" - "log" "time" "github.com/armon/go-metrics" @@ -37,7 +36,7 @@ func (p *pingDelegate) AckPayload() []byte { // The rest of the message is the serialized coordinate. 
enc := codec.NewEncoder(&buf, &codec.MsgpackHandle{}) if err := enc.Encode(p.serf.coordClient.GetCoordinate()); err != nil { - log.Printf("[ERR] serf: Failed to encode coordinate: %v\n", err) + p.serf.logger.Printf("[ERR] serf: Failed to encode coordinate: %v\n", err) } return buf.Bytes() } @@ -52,7 +51,7 @@ func (p *pingDelegate) NotifyPingComplete(other *memberlist.Node, rtt time.Durat // Verify ping version in the header. version := payload[0] if version != PingVersion { - log.Printf("[ERR] serf: Unsupported ping version: %v", version) + p.serf.logger.Printf("[ERR] serf: Unsupported ping version: %v", version) return } @@ -61,29 +60,31 @@ func (p *pingDelegate) NotifyPingComplete(other *memberlist.Node, rtt time.Durat dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) var coord coordinate.Coordinate if err := dec.Decode(&coord); err != nil { - log.Printf("[ERR] serf: Failed to decode coordinate from ping: %v", err) + p.serf.logger.Printf("[ERR] serf: Failed to decode coordinate from ping: %v", err) + return } - // Apply the update. Since this is a coordinate coming from some place - // else we harden this and look for dimensionality problems proactively. + // Apply the update. before := p.serf.coordClient.GetCoordinate() - if before.IsCompatibleWith(&coord) { - after := p.serf.coordClient.Update(other.Name, &coord, rtt) + after, err := p.serf.coordClient.Update(other.Name, &coord, rtt) + if err != nil { + metrics.IncrCounter([]string{"serf", "coordinate", "rejected"}, 1) + p.serf.logger.Printf("[TRACE] serf: Rejected coordinate from %s: %v\n", + other.Name, err) + return + } - // Publish some metrics to give us an idea of how much we are - // adjusting each time we update. - d := float32(before.DistanceTo(after).Seconds() * 1.0e3) - metrics.AddSample([]string{"serf", "coordinate", "adjustment-ms"}, d) + // Publish some metrics to give us an idea of how much we are + // adjusting each time we update. + d := float32(before.DistanceTo(after).Seconds() * 1.0e3) + metrics.AddSample([]string{"serf", "coordinate", "adjustment-ms"}, d) - // Cache the coordinate for the other node, and add our own - // to the cache as well since it just got updated. This lets - // users call GetCachedCoordinate with our node name, which is - // more friendly. - p.serf.coordCacheLock.Lock() - p.serf.coordCache[other.Name] = &coord - p.serf.coordCache[p.serf.config.NodeName] = p.serf.coordClient.GetCoordinate() - p.serf.coordCacheLock.Unlock() - } else { - log.Printf("[ERR] serf: Rejected bad coordinate: %v\n", coord) - } + // Cache the coordinate for the other node, and add our own + // to the cache as well since it just got updated. This lets + // users call GetCachedCoordinate with our node name, which is + // more friendly. + p.serf.coordCacheLock.Lock() + p.serf.coordCache[other.Name] = &coord + p.serf.coordCache[p.serf.config.NodeName] = p.serf.coordClient.GetCoordinate() + p.serf.coordCacheLock.Unlock() } diff --git a/vendor/github.com/hashicorp/serf/serf/query.go b/vendor/github.com/hashicorp/serf/serf/query.go index f29a3b3c54..0bdbb35538 100644 --- a/vendor/github.com/hashicorp/serf/serf/query.go +++ b/vendor/github.com/hashicorp/serf/serf/query.go @@ -1,7 +1,11 @@ package serf import ( + "errors" + "fmt" "math" + "math/rand" + "net" "regexp" "sync" "time" @@ -24,6 +28,10 @@ type QueryParam struct { // send an ack. RequestAck bool + // RelayFactor controls the number of duplicate responses to relay + // back to the sender through other nodes for redundancy. 
+ RelayFactor uint8 + // The timeout limits how long the query is left open. If not provided, // then a default timeout is used based on the configuration of Serf Timeout time.Duration @@ -93,6 +101,10 @@ type QueryResponse struct { // respCh is used to send a response from a node respCh chan NodeResponse + // acks/responses are used to track the nodes that have sent an ack/response + acks map[string]struct{} + responses map[string]struct{} + closed bool closeLock sync.Mutex } @@ -100,13 +112,15 @@ type QueryResponse struct { // newQueryResponse is used to construct a new query response func newQueryResponse(n int, q *messageQuery) *QueryResponse { resp := &QueryResponse{ - deadline: time.Now().Add(q.Timeout), - id: q.ID, - lTime: q.LTime, - respCh: make(chan NodeResponse, n), + deadline: time.Now().Add(q.Timeout), + id: q.ID, + lTime: q.LTime, + respCh: make(chan NodeResponse, n), + responses: make(map[string]struct{}), } if q.Ack() { resp.ackCh = make(chan string, n) + resp.acks = make(map[string]struct{}) } return resp } @@ -135,6 +149,8 @@ func (r *QueryResponse) Deadline() time.Time { // Finished returns if the query is finished running func (r *QueryResponse) Finished() bool { + r.closeLock.Lock() + defer r.closeLock.Unlock() return r.closed || time.Now().After(r.deadline) } @@ -151,6 +167,22 @@ func (r *QueryResponse) ResponseCh() <-chan NodeResponse { return r.respCh } +// sendResponse sends a response on the response channel ensuring the channel is not closed. +func (r *QueryResponse) sendResponse(nr NodeResponse) error { + r.closeLock.Lock() + defer r.closeLock.Unlock() + if r.closed { + return nil + } + select { + case r.respCh <- nr: + r.responses[nr.From] = struct{}{} + default: + return errors.New("serf: Failed to deliver query response, dropping") + } + return nil +} + // NodeResponse is used to represent a single response from a node type NodeResponse struct { From string @@ -208,3 +240,74 @@ func (s *Serf) shouldProcessQuery(filters [][]byte) bool { } return true } + +// relayResponse will relay a copy of the given response to up to relayFactor +// other members. +func (s *Serf) relayResponse(relayFactor uint8, addr net.UDPAddr, resp *messageQueryResponse) error { + if relayFactor == 0 { + return nil + } + + // Needs to be worth it; we need to have at least relayFactor *other* + // nodes. If you have a tiny cluster then the relayFactor shouldn't + // be needed. + members := s.Members() + if len(members) < int(relayFactor)+1 { + return nil + } + + // Prep the relay message, which is a wrapped version of the original. + raw, err := encodeRelayMessage(messageQueryResponseType, addr, &resp) + if err != nil { + return fmt.Errorf("failed to format relayed response: %v", err) + } + if len(raw) > s.config.QueryResponseSizeLimit { + return fmt.Errorf("relayed response exceeds limit of %d bytes", s.config.QueryResponseSizeLimit) + } + + // Relay to a random set of peers. 
+ localName := s.LocalMember().Name + relayMembers := kRandomMembers(int(relayFactor), members, func(m Member) bool { + return m.Status != StatusAlive || m.ProtocolMax < 5 || m.Name == localName + }) + for _, m := range relayMembers { + relayAddr := net.UDPAddr{IP: m.Addr, Port: int(m.Port)} + if err := s.memberlist.SendTo(&relayAddr, raw); err != nil { + return fmt.Errorf("failed to send relay response: %v", err) + } + } + return nil +} + +// kRandomMembers selects up to k members from a given list, optionally +// filtering by the given filterFunc +func kRandomMembers(k int, members []Member, filterFunc func(Member) bool) []Member { + n := len(members) + kMembers := make([]Member, 0, k) +OUTER: + // Probe up to 3*n times, with large n this is not necessary + // since k << n, but with small n we want search to be + // exhaustive + for i := 0; i < 3*n && len(kMembers) < k; i++ { + // Get random member + idx := rand.Intn(n) + member := members[idx] + + // Give the filter a shot at it. + if filterFunc != nil && filterFunc(member) { + continue OUTER + } + + // Check if we have this member already + for j := 0; j < len(kMembers); j++ { + if member.Name == kMembers[j].Name { + continue OUTER + } + } + + // Append the member + kMembers = append(kMembers, member) + } + + return kMembers +} diff --git a/vendor/github.com/hashicorp/serf/serf/serf.go b/vendor/github.com/hashicorp/serf/serf/serf.go index 613b915dc4..08f1f7b991 100644 --- a/vendor/github.com/hashicorp/serf/serf/serf.go +++ b/vendor/github.com/hashicorp/serf/serf/serf.go @@ -10,8 +10,10 @@ import ( "log" "math/rand" "net" + "os" "strconv" "sync" + "sync/atomic" "time" "github.com/armon/go-metrics" @@ -25,7 +27,7 @@ import ( // version to memberlist below. const ( ProtocolVersionMin uint8 = 2 - ProtocolVersionMax = 4 + ProtocolVersionMax = 5 ) const ( @@ -65,16 +67,15 @@ type Serf struct { memberLock sync.RWMutex members map[string]*memberState - // Circular buffers for recent intents, used - // in case we get the intent before the relevant event - recentLeave []nodeIntent - recentLeaveIndex int - recentJoin []nodeIntent - recentJoinIndex int + // recentIntents the lamport time and type of intent for a given node in + // case we get an intent before the relevant memberlist event. This is + // indexed by node, and always store the latest lamport time / intent + // we've seen. The memberLock protects this structure. + recentIntents map[string]nodeIntent eventBroadcasts *memberlist.TransmitLimitedQueue eventBuffer []*userEvents - eventJoinIgnore bool + eventJoinIgnore atomic.Value eventMinTime LamportTime eventLock sync.RWMutex @@ -179,10 +180,18 @@ type memberState struct { leaveTime time.Time // wall clock time of leave } -// nodeIntent is used to buffer intents for out-of-order deliveries +// nodeIntent is used to buffer intents for out-of-order deliveries. type nodeIntent struct { + // Type is the intent being tracked. Only messageJoinType and + // messageLeaveType are tracked. + Type messageType + + // WallTime is the wall clock time we saw this intent in order to + // expire it from the buffer. + WallTime time.Time + + // LTime is the Lamport time, used for cluster-wide ordering of events. 
LTime LamportTime - Node string } // userEvent is used to buffer events to prevent re-delivery @@ -214,8 +223,8 @@ type queries struct { } const ( - UserEventSizeLimit = 512 // Maximum byte size for event name and payload - snapshotSizeLimit = 128 * 1024 // Maximum 128 KB snapshot + snapshotSizeLimit = 128 * 1024 // Maximum 128 KB snapshot + UserEventSizeLimit = 9 * 1024 // Maximum 9KB for event name and payload ) // Create creates a new Serf instance, starting all the background tasks @@ -233,14 +242,28 @@ func Create(conf *Config) (*Serf, error) { conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) } + if conf.UserEventSizeLimit > UserEventSizeLimit { + return nil, fmt.Errorf("user event size limit exceeds limit of %d bytes", UserEventSizeLimit) + } + + logger := conf.Logger + if logger == nil { + logOutput := conf.LogOutput + if logOutput == nil { + logOutput = os.Stderr + } + logger = log.New(logOutput, "", log.LstdFlags) + } + serf := &Serf{ config: conf, - logger: log.New(conf.LogOutput, "", log.LstdFlags), + logger: logger, members: make(map[string]*memberState), queryResponse: make(map[LamportTime]*QueryResponse), shutdownCh: make(chan struct{}), state: SerfAlive, } + serf.eventJoinIgnore.Store(false) // Check that the meta data length is okay if len(serf.encodeTags(conf.Tags)) > memberlist.MetaMaxSize { @@ -295,7 +318,6 @@ func Create(conf *Config) (*Serf, error) { conf.RejoinAfterLeave, serf.logger, &serf.clock, - serf.coordClient, conf.EventCh, serf.shutdownCh) if err != nil { @@ -321,27 +343,20 @@ func Create(conf *Config) (*Serf, error) { // Setup the various broadcast queues, which we use to send our own // custom broadcasts along the gossip channel. serf.broadcasts = &memberlist.TransmitLimitedQueue{ - NumNodes: func() int { - return len(serf.members) - }, + NumNodes: serf.NumNodes, RetransmitMult: conf.MemberlistConfig.RetransmitMult, } serf.eventBroadcasts = &memberlist.TransmitLimitedQueue{ - NumNodes: func() int { - return len(serf.members) - }, + NumNodes: serf.NumNodes, RetransmitMult: conf.MemberlistConfig.RetransmitMult, } serf.queryBroadcasts = &memberlist.TransmitLimitedQueue{ - NumNodes: func() int { - return len(serf.members) - }, + NumNodes: serf.NumNodes, RetransmitMult: conf.MemberlistConfig.RetransmitMult, } // Create the buffer for recent intents - serf.recentJoin = make([]nodeIntent, conf.RecentIntentBuffer) - serf.recentLeave = make([]nodeIntent, conf.RecentIntentBuffer) + serf.recentIntents = make(map[string]nodeIntent) // Create a buffer for events and queries serf.eventBuffer = make([]*userEvents, conf.EventBuffer) @@ -426,14 +441,25 @@ func (s *Serf) KeyManager() *KeyManager { } // UserEvent is used to broadcast a custom user event with a given -// name and payload. The events must be fairly small, and if the -// size limit is exceeded and error will be returned. If coalesce is enabled, -// nodes are allowed to coalesce this event. Coalescing is only available -// starting in v0.2 +// name and payload. If the configured size limit is exceeded and error will be returned. +// If coalesce is enabled, nodes are allowed to coalesce this event. 
+// Coalescing is only available starting in v0.2 func (s *Serf) UserEvent(name string, payload []byte, coalesce bool) error { - // Check the size limit - if len(name)+len(payload) > UserEventSizeLimit { - return fmt.Errorf("user event exceeds limit of %d bytes", UserEventSizeLimit) + payloadSizeBeforeEncoding := len(name)+len(payload) + + // Check size before encoding to prevent needless encoding and return early if it's over the specified limit. + if payloadSizeBeforeEncoding > s.config.UserEventSizeLimit { + return fmt.Errorf( + "user event exceeds configured limit of %d bytes before encoding", + s.config.UserEventSizeLimit, + ) + } + + if payloadSizeBeforeEncoding > UserEventSizeLimit { + return fmt.Errorf( + "user event exceeds sane limit of %d bytes before encoding", + UserEventSizeLimit, + ) } // Create a message @@ -443,16 +469,34 @@ func (s *Serf) UserEvent(name string, payload []byte, coalesce bool) error { Payload: payload, CC: coalesce, } - s.eventClock.Increment() - - // Process update locally - s.handleUserEvent(&msg) // Start broadcasting the event raw, err := encodeMessage(messageUserEventType, &msg) if err != nil { return err } + + // Check the size after encoding to be sure again that + // we're not attempting to send over the specified size limit. + if len(raw) > s.config.UserEventSizeLimit { + return fmt.Errorf( + "encoded user event exceeds configured limit of %d bytes after encoding", + s.config.UserEventSizeLimit, + ) + } + + if len(raw) > UserEventSizeLimit { + return fmt.Errorf( + "encoded user event exceeds sane limit of %d bytes before encoding", + UserEventSizeLimit, + ) + } + + s.eventClock.Increment() + + // Process update locally + s.handleUserEvent(&msg) + s.eventBroadcasts.QueueBroadcast(&broadcast{ msg: raw, }) @@ -493,15 +537,16 @@ func (s *Serf) Query(name string, payload []byte, params *QueryParam) (*QueryRes // Create a message q := messageQuery{ - LTime: s.queryClock.Time(), - ID: uint32(rand.Int31()), - Addr: local.Addr, - Port: local.Port, - Filters: filters, - Flags: flags, - Timeout: params.Timeout, - Name: name, - Payload: payload, + LTime: s.queryClock.Time(), + ID: uint32(rand.Int31()), + Addr: local.Addr, + Port: local.Port, + Filters: filters, + Flags: flags, + RelayFactor: params.RelayFactor, + Timeout: params.Timeout, + Name: name, + Payload: payload, } // Encode the query @@ -582,9 +627,9 @@ func (s *Serf) Join(existing []string, ignoreOld bool) (int, error) { // Ignore any events from a potential join. This is safe since we hold // the joinLock and nobody else can be doing a Join if ignoreOld { - s.eventJoinIgnore = true + s.eventJoinIgnore.Store(true) defer func() { - s.eventJoinIgnore = false + s.eventJoinIgnore.Store(false) }() } @@ -679,6 +724,13 @@ func (s *Serf) Leave() error { return err } + // Wait for the leave to propagate through the cluster. The broadcast + // timeout is how long we wait for the message to go out from our own + // queue, but this wait is for that message to propagate through the + // cluster. In particular, we want to stay up long enough to service + // any probes from other nodes before they learn about us leaving. 
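The UserEvent change above checks the size twice: once on the raw name and payload to fail fast, and again on the encoded message, since framing adds overhead. A minimal sketch of that two-phase check; JSON and a caller-supplied limit stand in for serf's msgpack encoding and its config field.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// hardLimit mirrors the package-level UserEventSizeLimit ceiling.
const hardLimit = 9 * 1024

// checkEventSize fails fast on oversized input before encoding, then
// re-checks the encoded form, as UserEvent now does. configuredLimit
// stands in for s.config.UserEventSizeLimit.
func checkEventSize(name string, payload []byte, configuredLimit int) ([]byte, error) {
	if len(name)+len(payload) > configuredLimit {
		return nil, fmt.Errorf("user event exceeds configured limit of %d bytes before encoding", configuredLimit)
	}
	raw, err := json.Marshal(map[string]interface{}{"name": name, "payload": payload})
	if err != nil {
		return nil, err
	}
	if len(raw) > configuredLimit || len(raw) > hardLimit {
		return nil, fmt.Errorf("encoded user event exceeds limit of %d bytes after encoding", configuredLimit)
	}
	return raw, nil
}

func main() {
	raw, err := checkEventSize("deploy", []byte("v1.2.3"), 512)
	fmt.Println(len(raw), err)
}
```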
+ time.Sleep(s.config.LeavePropagateDelay) + // Transition to Left only if we not already shutdown s.stateLock.Lock() if s.state != SerfShutdown { @@ -785,13 +837,15 @@ func (s *Serf) Shutdown() error { s.logger.Printf("[WARN] serf: Shutdown without a Leave") } + // Wait to close the shutdown channel until after we've shut down the + // memberlist and its associated network resources, since the shutdown + // channel signals that we are cleaned up outside of Serf. s.state = SerfShutdown - close(s.shutdownCh) - err := s.memberlist.Shutdown() if err != nil { return err } + close(s.shutdownCh) // Wait for the snapshoter to finish if we have one if s.snapshotter != nil { @@ -855,22 +909,25 @@ func (s *Serf) handleNodeJoin(n *memberlist.Node) { }, } - // Check if we have a join intent and use the LTime - if join := recentIntent(s.recentJoin, n.Name); join != nil { - member.statusLTime = join.LTime + // Check if we have a join or leave intent. The intent buffer + // will only hold one event for this node, so the more recent + // one will take effect. + if join, ok := recentIntent(s.recentIntents, n.Name, messageJoinType); ok { + member.statusLTime = join } - - // Check if we have a leave intent - if leave := recentIntent(s.recentLeave, n.Name); leave != nil { - if leave.LTime > member.statusLTime { - member.Status = StatusLeaving - member.statusLTime = leave.LTime - } + if leave, ok := recentIntent(s.recentIntents, n.Name, messageLeaveType); ok { + member.Status = StatusLeaving + member.statusLTime = leave } s.members[n.Name] = member } else { oldStatus = member.Status + deadTime := time.Now().Sub(member.leaveTime) + if oldStatus == StatusFailed && deadTime < s.config.FlapTimeout { + metrics.IncrCounter([]string{"serf", "member", "flap"}, 1) + } + member.Status = StatusAlive member.leaveTime = time.Time{} member.Addr = net.IP(n.Addr) @@ -1011,18 +1068,8 @@ func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool { member, ok := s.members[leaveMsg.Node] if !ok { - // If we've already seen this message don't rebroadcast - if recentIntent(s.recentLeave, leaveMsg.Node) != nil { - return false - } - - // We don't know this member so store it in a buffer for now - s.recentLeave[s.recentLeaveIndex] = nodeIntent{ - LTime: leaveMsg.LTime, - Node: leaveMsg.Node, - } - s.recentLeaveIndex = (s.recentLeaveIndex + 1) % len(s.recentLeave) - return true + // Rebroadcast only if this was an update we hadn't seen before. + return upsertIntent(s.recentIntents, leaveMsg.Node, messageLeaveType, leaveMsg.LTime, time.Now) } // If the message is old, then it is irrelevant and we can skip it @@ -1082,15 +1129,8 @@ func (s *Serf) handleNodeJoinIntent(joinMsg *messageJoin) bool { member, ok := s.members[joinMsg.Node] if !ok { - // If we've already seen this message don't rebroadcast - if recentIntent(s.recentJoin, joinMsg.Node) != nil { - return false - } - - // We don't know this member so store it in a buffer for now - s.recentJoin[s.recentJoinIndex] = nodeIntent{LTime: joinMsg.LTime, Node: joinMsg.Node} - s.recentJoinIndex = (s.recentJoinIndex + 1) % len(s.recentJoin) - return true + // Rebroadcast only if this was an update we hadn't seen before. 
+ return upsertIntent(s.recentIntents, joinMsg.Node, messageJoinType, joinMsg.LTime, time.Now) } // Check if this time is newer than what we have @@ -1245,19 +1285,23 @@ func (s *Serf) handleQuery(query *messageQuery) bool { if err := s.memberlist.SendTo(&addr, raw); err != nil { s.logger.Printf("[ERR] serf: failed to send ack: %v", err) } + if err := s.relayResponse(query.RelayFactor, addr, &ack); err != nil { + s.logger.Printf("[ERR] serf: failed to relay ack: %v", err) + } } } if s.config.EventCh != nil { s.config.EventCh <- &Query{ - LTime: query.LTime, - Name: query.Name, - Payload: query.Payload, - serf: s, - id: query.ID, - addr: query.Addr, - port: query.Port, - deadline: time.Now().Add(query.Timeout), + LTime: query.LTime, + Name: query.Name, + Payload: query.Payload, + serf: s, + id: query.ID, + addr: query.Addr, + port: query.Port, + deadline: time.Now().Add(query.Timeout), + relayFactor: query.RelayFactor, } } return rebroadcast @@ -1290,25 +1334,37 @@ func (s *Serf) handleQueryResponse(resp *messageQueryResponse) { // Process each type of response if resp.Ack() { + // Exit early if this is a duplicate ack + if _, ok := query.acks[resp.From]; ok { + metrics.IncrCounter([]string{"serf", "query_duplicate_acks"}, 1) + return + } + metrics.IncrCounter([]string{"serf", "query_acks"}, 1) select { case query.ackCh <- resp.From: + query.acks[resp.From] = struct{}{} default: - s.logger.Printf("[WARN] serf: Failed to delivery query ack, dropping") + s.logger.Printf("[WARN] serf: Failed to deliver query ack, dropping") } } else { + // Exit early if this is a duplicate response + if _, ok := query.responses[resp.From]; ok { + metrics.IncrCounter([]string{"serf", "query_duplicate_responses"}, 1) + return + } + metrics.IncrCounter([]string{"serf", "query_responses"}, 1) - select { - case query.respCh <- NodeResponse{From: resp.From, Payload: resp.Payload}: - default: - s.logger.Printf("[WARN] serf: Failed to delivery query response, dropping") + err := query.sendResponse(NodeResponse{From: resp.From, Payload: resp.Payload}) + if err != nil { + s.logger.Printf("[WARN] %v", err) } } } // handleNodeConflict is invoked when a join detects a conflict over a name. // This means two different nodes (IP/Port) are claiming the same name. Memberlist -// will reject the "new" node mapping, but we can still be notified +// will reject the "new" node mapping, but we can still be notified. func (s *Serf) handleNodeConflict(existing, other *memberlist.Node) { // Log a basic warning if the node is not us... if existing.Name != s.config.NodeName { @@ -1361,7 +1417,7 @@ func (s *Serf) resolveNodeConflict() { // Update the counters responses++ - if bytes.Equal(member.Addr, local.Addr) && member.Port == local.Port { + if member.Addr.Equal(local.Addr) && member.Port == local.Port { matching++ } } @@ -1382,14 +1438,17 @@ func (s *Serf) resolveNodeConflict() { } } -// handleReap periodically reaps the list of failed and left members. +// handleReap periodically reaps the list of failed and left members, as well +// as old buffered intents. 
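Since relayed responses mean the same node can now answer more than once, handleQueryResponse above tracks responders in the new acks/responses maps and drops duplicates. A small self-contained sketch of that first-writer-wins dedupe, with a plain counter standing in for the metrics calls:

```go
package main

import "fmt"

// dedupe mirrors how handleQueryResponse drops duplicate acks and
// responses that arrive via relays: the first response from a node
// wins, later copies only bump a counter. seen stands in for the
// acks/responses maps on QueryResponse.
type dedupe struct {
	seen       map[string]struct{}
	duplicates int
}

func (d *dedupe) accept(from string) bool {
	if _, ok := d.seen[from]; ok {
		d.duplicates++ // serf increments a metrics counter here
		return false
	}
	d.seen[from] = struct{}{}
	return true
}

func main() {
	d := &dedupe{seen: make(map[string]struct{})}
	for _, from := range []string{"node-a", "node-b", "node-a"} {
		fmt.Println(from, d.accept(from))
	}
	fmt.Println("duplicates:", d.duplicates)
}
```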
func (s *Serf) handleReap() { for { select { case <-time.After(s.config.ReapInterval): s.memberLock.Lock() - s.failedMembers = s.reap(s.failedMembers, s.config.ReconnectTimeout) - s.leftMembers = s.reap(s.leftMembers, s.config.TombstoneTimeout) + now := time.Now() + s.failedMembers = s.reap(s.failedMembers, now, s.config.ReconnectTimeout) + s.leftMembers = s.reap(s.leftMembers, now, s.config.TombstoneTimeout) + reapIntents(s.recentIntents, now, s.config.RecentIntentTimeout) s.memberLock.Unlock() case <-s.shutdownCh: return @@ -1413,8 +1472,7 @@ func (s *Serf) handleReconnect() { // reap is called with a list of old members and a timeout, and removes // members that have exceeded the timeout. The members are removed from // both the old list and the members itself. Locking is left to the caller. -func (s *Serf) reap(old []*memberState, timeout time.Duration) []*memberState { - now := time.Now() +func (s *Serf) reap(old []*memberState, now time.Time, timeout time.Duration) []*memberState { n := len(old) for i := 0; i < n; i++ { m := old[i] @@ -1485,7 +1543,7 @@ func (s *Serf) reconnect() { } // Select a random member to try and join - idx := int(rand.Uint32() % uint32(n)) + idx := rand.Int31n(int32(n)) mem := s.failedMembers[idx] s.memberLock.RUnlock() @@ -1497,21 +1555,37 @@ func (s *Serf) reconnect() { s.memberlist.Join([]string{addr.String()}) } +// getQueueMax will get the maximum queue depth, which might be dynamic depending +// on how Serf is configured. +func (s *Serf) getQueueMax() int { + max := s.config.MaxQueueDepth + if s.config.MinQueueDepth > 0 { + s.memberLock.RLock() + max = 2 * len(s.members) + s.memberLock.RUnlock() + + if max < s.config.MinQueueDepth { + max = s.config.MinQueueDepth + } + } + return max +} + // checkQueueDepth periodically checks the size of a queue to see if // it is too large func (s *Serf) checkQueueDepth(name string, queue *memberlist.TransmitLimitedQueue) { for { select { - case <-time.After(time.Second): + case <-time.After(s.config.QueueCheckInterval): numq := queue.NumQueued() metrics.AddSample([]string{"serf", "queue", name}, float32(numq)) if numq >= s.config.QueueDepthWarning { s.logger.Printf("[WARN] serf: %s queue depth: %d", name, numq) } - if numq > s.config.MaxQueueDepth { + if max := s.getQueueMax(); numq > max { s.logger.Printf("[WARN] serf: %s queue depth (%d) exceeds limit (%d), dropping messages!", - name, numq, s.config.MaxQueueDepth) - queue.Prune(s.config.MaxQueueDepth) + name, numq, max) + queue.Prune(max) } case <-s.shutdownCh: return @@ -1533,24 +1607,46 @@ func removeOldMember(old []*memberState, name string) []*memberState { return old } -// recentIntent checks the recent intent buffer for a matching -// entry for a given node, and either returns the message or nil -func recentIntent(recent []nodeIntent, node string) (intent *nodeIntent) { - for i := 0; i < len(recent); i++ { - // Break fast if we hit a zero entry - if recent[i].LTime == 0 { - break +// reapIntents clears out any intents that are older than the timeout. Make sure +// the memberLock is held when passing in the Serf instance's recentIntents +// member. 
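getQueueMax above makes the queue ceiling dynamic: with MinQueueDepth set, the limit scales as twice the member count but never drops below that floor. The same arithmetic as a standalone sketch; the parameter names stand in for the serf config fields.

```go
package main

import "fmt"

// queueMax mirrors getQueueMax: when a minimum depth is configured,
// the limit scales with cluster size (2x members) subject to the
// floor; otherwise the fixed maximum applies.
func queueMax(members, maxQueueDepth, minQueueDepth int) int {
	max := maxQueueDepth
	if minQueueDepth > 0 {
		max = 2 * members
		if max < minQueueDepth {
			max = minQueueDepth
		}
	}
	return max
}

func main() {
	fmt.Println(queueMax(3, 4096, 1024))   // small cluster: floor wins -> 1024
	fmt.Println(queueMax(900, 4096, 1024)) // large cluster: 2*members -> 1800
}
```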
+func reapIntents(intents map[string]nodeIntent, now time.Time, timeout time.Duration) { + for node, intent := range intents { + if now.Sub(intent.WallTime) > timeout { + delete(intents, node) } + } +} - // Check for a node match - if recent[i].Node == node { - // Take the most recent entry - if intent == nil || recent[i].LTime > intent.LTime { - intent = &recent[i] - } +// upsertIntent will update an existing intent with the supplied Lamport time, +// or create a new entry. This will return true if a new entry was added. The +// stamper is used to capture the wall clock time for expiring these buffered +// intents. Make sure the memberLock is held when passing in the Serf instance's +// recentIntents member. +func upsertIntent(intents map[string]nodeIntent, node string, itype messageType, + ltime LamportTime, stamper func() time.Time) bool { + if intent, ok := intents[node]; !ok || ltime > intent.LTime { + intents[node] = nodeIntent{ + Type: itype, + WallTime: stamper(), + LTime: ltime, } + return true + } + + return false +} + +// recentIntent checks the recent intent buffer for a matching entry for a given +// node, and returns the Lamport time, if an intent is present, indicated by the +// returned boolean. Make sure the memberLock is held for read when passing in +// the Serf instance's recentIntents member. +func recentIntent(intents map[string]nodeIntent, node string, itype messageType) (LamportTime, bool) { + if intent, ok := intents[node]; ok && intent.Type == itype { + return intent.LTime, true } - return + + return LamportTime(0), false } // handleRejoin attempts to reconnect to previously known alive nodes @@ -1613,10 +1709,18 @@ func (s *Serf) Stats() map[string]string { toString := func(v uint64) string { return strconv.FormatUint(v, 10) } + s.memberLock.RLock() + members := toString(uint64(len(s.members))) + failed := toString(uint64(len(s.failedMembers))) + left := toString(uint64(len(s.leftMembers))) + health_score := toString(uint64(s.memberlist.GetHealthScore())) + + s.memberLock.RUnlock() stats := map[string]string{ - "members": toString(uint64(len(s.members))), - "failed": toString(uint64(len(s.failedMembers))), - "left": toString(uint64(len(s.leftMembers))), + "members": members, + "failed": failed, + "left": left, + "health_score": health_score, "member_time": toString(uint64(s.clock.Time())), "event_time": toString(uint64(s.eventClock.Time())), "query_time": toString(uint64(s.queryClock.Time())), @@ -1625,6 +1729,9 @@ func (s *Serf) Stats() map[string]string { "query_queue": toString(uint64(s.queryBroadcasts.NumQueued())), "encrypted": fmt.Sprintf("%v", s.EncryptionEnabled()), } + if !s.config.DisableCoordinates { + stats["coordinate_resets"] = toString(uint64(s.coordClient.Stats().Resets)) + } return stats } diff --git a/vendor/github.com/hashicorp/serf/serf/snapshot.go b/vendor/github.com/hashicorp/serf/serf/snapshot.go index 44f8a5175a..d2eda0ea23 100644 --- a/vendor/github.com/hashicorp/serf/serf/snapshot.go +++ b/vendor/github.com/hashicorp/serf/serf/snapshot.go @@ -2,7 +2,6 @@ package serf import ( "bufio" - "encoding/json" "fmt" "log" "math/rand" @@ -13,7 +12,6 @@ import ( "time" "github.com/armon/go-metrics" - "github.com/hashicorp/serf/coordinate" ) /* @@ -27,34 +25,59 @@ nodes to re-join, as well as restore our clock values to avoid replaying old events. 
*/ -const flushInterval = 500 * time.Millisecond -const clockUpdateInterval = 500 * time.Millisecond -const coordinateUpdateInterval = 60 * time.Second -const tmpExt = ".compact" +const ( + // flushInterval is how often we force a flush of the snapshot file + flushInterval = 500 * time.Millisecond + + // clockUpdateInterval is how often we fetch the current lamport time of the cluster and write to the snapshot file + clockUpdateInterval = 500 * time.Millisecond + + // tmpExt is the extention we use for the temporary file during compaction + tmpExt = ".compact" + + // snapshotErrorRecoveryInterval is how often we attempt to recover from + // errors writing to the snapshot file. + snapshotErrorRecoveryInterval = 30 * time.Second + + // eventChSize is the size of the event buffers between Serf and the + // consuming application. If this is exhausted we will block Serf and Memberlist. + eventChSize = 2048 + + // shutdownFlushTimeout is the time limit to write pending events to the snapshot during a shutdown + shutdownFlushTimeout = 250 * time.Millisecond + + // snapshotBytesPerNode is an estimated bytes per node to snapshot + snapshotBytesPerNode = 128 + + // snapshotCompactionThreshold is the threshold we apply to + // the snapshot size estimate (nodes * bytes per node) before compacting. + snapshotCompactionThreshold = 2 +) // Snapshotter is responsible for ingesting events and persisting // them to disk, and providing a recovery mechanism at start time. type Snapshotter struct { - aliveNodes map[string]string - clock *LamportClock - coordClient *coordinate.Client - fh *os.File - buffered *bufio.Writer - inCh <-chan Event - lastFlush time.Time - lastClock LamportTime - lastEventClock LamportTime - lastQueryClock LamportTime - leaveCh chan struct{} - leaving bool - logger *log.Logger - maxSize int64 - path string - offset int64 - outCh chan<- Event - rejoinAfterLeave bool - shutdownCh <-chan struct{} - waitCh chan struct{} + aliveNodes map[string]string + clock *LamportClock + fh *os.File + buffered *bufio.Writer + inCh <-chan Event + streamCh chan Event + lastFlush time.Time + lastClock LamportTime + lastEventClock LamportTime + lastQueryClock LamportTime + leaveCh chan struct{} + leaving bool + logger *log.Logger + minCompactSize int64 + path string + offset int64 + outCh chan<- Event + rejoinAfterLeave bool + shutdownCh <-chan struct{} + waitCh chan struct{} + lastAttemptedCompaction time.Time } // PreviousNode is used to represent the previously known alive nodes @@ -74,17 +97,17 @@ func (p PreviousNode) String() string { // Setting rejoinAfterLeave makes leave not clear the state, and can be used // if you intend to rejoin the same cluster after a leave. 
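The new snapshot constants above feed a size estimate that decides when to compact: alive nodes times an estimated bytes-per-node, doubled by the compaction threshold, and floored at the configured minimum (see snapshotMaxSize later in this hunk). A standalone sketch of that estimate:

```go
package main

import "fmt"

const (
	snapshotBytesPerNode        = 128 // estimated bytes per node, from the diff
	snapshotCompactionThreshold = 2   // multiplier applied to the estimate
)

// maxSize mirrors snapshotMaxSize: estimate the snapshot's steady-state
// size from the number of alive nodes, apply the compaction multiplier,
// and never compact below the configured floor.
func maxSize(aliveNodes int, minCompactSize int64) int64 {
	threshold := int64(aliveNodes) * snapshotBytesPerNode * snapshotCompactionThreshold
	if threshold < minCompactSize {
		threshold = minCompactSize
	}
	return threshold
}

func main() {
	fmt.Println(maxSize(10, 128*1024))    // small cluster: floor wins
	fmt.Println(maxSize(10000, 128*1024)) // large cluster: estimate wins
}
```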
func NewSnapshotter(path string, - maxSize int, + minCompactSize int, rejoinAfterLeave bool, logger *log.Logger, clock *LamportClock, - coordClient *coordinate.Client, outCh chan<- Event, shutdownCh <-chan struct{}) (chan<- Event, *Snapshotter, error) { - inCh := make(chan Event, 1024) + inCh := make(chan Event, eventChSize) + streamCh := make(chan Event, eventChSize) // Try to open the file - fh, err := os.OpenFile(path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0755) + fh, err := os.OpenFile(path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0644) if err != nil { return nil, nil, fmt.Errorf("failed to open snapshot: %v", err) } @@ -101,16 +124,16 @@ func NewSnapshotter(path string, snap := &Snapshotter{ aliveNodes: make(map[string]string), clock: clock, - coordClient: coordClient, fh: fh, buffered: bufio.NewWriter(fh), inCh: inCh, + streamCh: streamCh, lastClock: 0, lastEventClock: 0, lastQueryClock: 0, leaveCh: make(chan struct{}), logger: logger, - maxSize: int64(maxSize), + minCompactSize: int64(minCompactSize), path: path, offset: offset, outCh: outCh, @@ -126,6 +149,7 @@ func NewSnapshotter(path string, } // Start handling new commands + go snap.teeStream() go snap.stream() return inCh, snap, nil } @@ -175,13 +199,68 @@ func (s *Snapshotter) Leave() { } } +// teeStream is a long running routine that is used to copy events +// to the output channel and the internal event handler. +func (s *Snapshotter) teeStream() { + flushEvent := func(e Event) { + // Forward to the internal stream, do not block + select { + case s.streamCh <- e: + default: + } + + // Forward the event immediately, do not block + if s.outCh != nil { + select { + case s.outCh <- e: + default: + } + } + } + +OUTER: + for { + select { + case e := <-s.inCh: + flushEvent(e) + case <-s.shutdownCh: + break OUTER + } + } + + // Drain any remaining events before exiting + for { + select { + case e := <-s.inCh: + flushEvent(e) + default: + return + } + } +} + // stream is a long running routine that is used to handle events func (s *Snapshotter) stream() { clockTicker := time.NewTicker(clockUpdateInterval) defer clockTicker.Stop() - coordinateTicker := time.NewTicker(coordinateUpdateInterval) - defer coordinateTicker.Stop() + // flushEvent is used to handle writing out an event + flushEvent := func(e Event) { + // Stop recording events after a leave is issued + if s.leaving { + return + } + switch typed := e.(type) { + case MemberEvent: + s.processMemberEvent(typed) + case UserEvent: + s.processUserEvent(typed) + case *Query: + s.processQuery(typed) + default: + s.logger.Printf("[ERR] serf: Unknown event to snapshot: %#v", e) + } + } for { select { @@ -200,34 +279,32 @@ func (s *Snapshotter) stream() { s.logger.Printf("[ERR] serf: failed to sync leave to snapshot: %v", err) } - case e := <-s.inCh: - // Forward the event immediately - if s.outCh != nil { - s.outCh <- e - } - - // Stop recording events after a leave is issued - if s.leaving { - continue - } - switch typed := e.(type) { - case MemberEvent: - s.processMemberEvent(typed) - case UserEvent: - s.processUserEvent(typed) - case *Query: - s.processQuery(typed) - default: - s.logger.Printf("[ERR] serf: Unknown event to snapshot: %#v", e) - } + case e := <-s.streamCh: + flushEvent(e) case <-clockTicker.C: s.updateClock() - case <-coordinateTicker.C: - s.updateCoordinate() - case <-s.shutdownCh: + // Setup a timeout + flushTimeout := time.After(shutdownFlushTimeout) + + // Snapshot the clock + s.updateClock() + + // Clear out the buffers + FLUSH: + for { + select { + case e := 
<-s.streamCh: + flushEvent(e) + case <-flushTimeout: + break FLUSH + default: + break FLUSH + } + } + if err := s.buffered.Flush(); err != nil { s.logger.Printf("[ERR] serf: failed to flush snapshot: %v", err) } @@ -273,20 +350,6 @@ func (s *Snapshotter) updateClock() { } } -// updateCoordinate is called periodically to write out the current local -// coordinate. It's safe to call this if coordinates aren't enabled (nil -// client) and it will be a no-op. -func (s *Snapshotter) updateCoordinate() { - if s.coordClient != nil { - encoded, err := json.Marshal(s.coordClient.GetCoordinate()) - if err != nil { - s.logger.Printf("[ERR] serf: Failed to encode coordinate: %v", err) - } else { - s.tryAppend(fmt.Sprintf("coordinate: %s\n", encoded)) - } - } -} - // processUserEvent is used to handle a single user event func (s *Snapshotter) processUserEvent(e UserEvent) { // Ignore old clocks @@ -311,6 +374,17 @@ func (s *Snapshotter) processQuery(q *Query) { func (s *Snapshotter) tryAppend(l string) { if err := s.appendLine(l); err != nil { s.logger.Printf("[ERR] serf: Failed to update snapshot: %v", err) + now := time.Now() + if now.Sub(s.lastAttemptedCompaction) > snapshotErrorRecoveryInterval { + s.lastAttemptedCompaction = now + s.logger.Printf("[INFO] serf: Attempting compaction to recover from error...") + err = s.compact() + if err != nil { + s.logger.Printf("[ERR] serf: Compaction failed, will reattempt after %v: %v", snapshotErrorRecoveryInterval, err) + } else { + s.logger.Printf("[INFO] serf: Finished compaction, successfully recovered from error state") + } + } } } @@ -334,12 +408,25 @@ func (s *Snapshotter) appendLine(l string) error { // Check if a compaction is necessary s.offset += int64(n) - if s.offset > s.maxSize { + if s.offset > s.snapshotMaxSize() { return s.compact() } return nil } +// snapshotMaxSize computes the maximum size and is used to force periodic compaction. +func (s *Snapshotter) snapshotMaxSize() int64 { + nodes := int64(len(s.aliveNodes)) + estSize := nodes * snapshotBytesPerNode + threshold := estSize * snapshotCompactionThreshold + + // Apply a minimum threshold to avoid frequent compaction + if threshold < s.minCompactSize { + threshold = s.minCompactSize + } + return threshold +} + // Compact is used to compact the snapshot once it is too large func (s *Snapshotter) compact() error { defer metrics.MeasureSince([]string{"serf", "snapshot", "compact"}, time.Now()) @@ -391,30 +478,22 @@ func (s *Snapshotter) compact() error { } offset += int64(n) - // Write out the coordinate. - if s.coordClient != nil { - encoded, err := json.Marshal(s.coordClient.GetCoordinate()) - if err != nil { - fh.Close() - return err - } - - line = fmt.Sprintf("coordinate: %s\n", encoded) - n, err = buf.WriteString(line) - if err != nil { - fh.Close() - return err - } - offset += int64(n) - } - // Flush the new snapshot err = buf.Flush() - fh.Close() + if err != nil { return fmt.Errorf("failed to flush new snapshot: %v", err) } + err = fh.Sync() + + if err != nil { + fh.Close() + return fmt.Errorf("failed to fsync new snapshot: %v", err) + } + + fh.Close() + // We now need to swap the old snapshot file with the new snapshot. // Turns out, Windows won't let us rename the files if we have // open handles to them or if the destination already exists. 
This @@ -520,19 +599,7 @@ func (s *Snapshotter) replay() error { s.lastQueryClock = LamportTime(timeInt) } else if strings.HasPrefix(line, "coordinate: ") { - if s.coordClient == nil { - s.logger.Printf("[WARN] serf: Ignoring snapshot coordinates since they are disabled") - continue - } - - coordStr := strings.TrimPrefix(line, "coordinate: ") - var coord coordinate.Coordinate - err := json.Unmarshal([]byte(coordStr), &coord) - if err != nil { - s.logger.Printf("[WARN] serf: Failed to decode coordinate: %v", err) - continue - } - s.coordClient.SetCoordinate(&coord) + continue // Ignores any coordinate persistence from old snapshots, serf should re-converge } else if line == "leave" { // Ignore a leave if we plan on re-joining if s.rejoinAfterLeave { From 615dd4ac8ac2fc9c411af81bfbf25ad1eeb5fae0 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 26 Aug 2019 12:36:31 +0200 Subject: [PATCH 5/6] bump armon/go-metrics ec5e00d3c878b2a97bbe0884ef45ffd1b4f669f5 full diff: https://github.com/armon/go-metrics/compare/eb0af217e5e9747e41dd5303755356b62d28e3ec...ec5e00d3c878b2a97bbe0884ef45ffd1b4f669f5 Signed-off-by: Sebastiaan van Stijn --- vendor.conf | 2 +- vendor/github.com/armon/go-metrics/README.md | 87 ++++++--- vendor/github.com/armon/go-metrics/go.mod | 16 ++ vendor/github.com/armon/go-metrics/inmem.go | 171 +++++++++++++--- .../armon/go-metrics/inmem_endpoint.go | 131 +++++++++++++ .../armon/go-metrics/inmem_signal.go | 33 +++- vendor/github.com/armon/go-metrics/metrics.go | 183 +++++++++++++++++- vendor/github.com/armon/go-metrics/sink.go | 77 +++++++- vendor/github.com/armon/go-metrics/start.go | 68 +++++-- vendor/github.com/armon/go-metrics/statsd.go | 30 +++ .../github.com/armon/go-metrics/statsite.go | 30 +++ 11 files changed, 728 insertions(+), 100 deletions(-) create mode 100644 vendor/github.com/armon/go-metrics/go.mod create mode 100644 vendor/github.com/armon/go-metrics/inmem_endpoint.go mode change 100755 => 100644 vendor/github.com/armon/go-metrics/metrics.go mode change 100755 => 100644 vendor/github.com/armon/go-metrics/sink.go mode change 100755 => 100644 vendor/github.com/armon/go-metrics/start.go mode change 100755 => 100644 vendor/github.com/armon/go-metrics/statsite.go diff --git a/vendor.conf b/vendor.conf index c644cf49cd..7c69849eb9 100644 --- a/vendor.conf +++ b/vendor.conf @@ -3,7 +3,7 @@ github.com/BurntSushi/toml 3012a1dbe2e4bd1391d42b32f057 github.com/containerd/cgroups 318312a373405e5e91134d8063d04d59768a1bff github.com/Microsoft/go-winio 6c72808b55902eae4c5943626030429ff20f3b63 # v0.4.14 github.com/Microsoft/hcsshim 9dcb42f100215f8d375b4a9265e5bba009217a85 # moby branch -github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec +github.com/armon/go-metrics ec5e00d3c878b2a97bbe0884ef45ffd1b4f669f5 github.com/armon/go-radix e39d623f12e8e41c7b5529e9a9dd67a1e2261f80 github.com/coreos/etcd d57e8b8d97adfc4a6c224fe116714bf1a1f3beb9 # v3.3.12 github.com/coreos/go-semver 8ab6407b697782a06568d4b7f1db25550ec2e4c6 # v0.2.0 diff --git a/vendor/github.com/armon/go-metrics/README.md b/vendor/github.com/armon/go-metrics/README.md index d9f46e85ba..aa73348c08 100644 --- a/vendor/github.com/armon/go-metrics/README.md +++ b/vendor/github.com/armon/go-metrics/README.md @@ -4,14 +4,17 @@ go-metrics This library provides a `metrics` package which can be used to instrument code, expose application metrics, and profile runtime performance in a flexible manner. 
+Current API: [![GoDoc](https://godoc.org/github.com/armon/go-metrics?status.svg)](https://godoc.org/github.com/armon/go-metrics) + Sinks -===== +----- The `metrics` package makes use of a `MetricSink` interface to support delivery to any type of backend. Currently the following sinks are provided: -* StatsiteSink : Sinks to a statsite instance (TCP) -* StatsdSink: Sinks to a statsd / statsite instance (UDP) +* StatsiteSink : Sinks to a [statsite](https://github.com/armon/statsite/) instance (TCP) +* StatsdSink: Sinks to a [StatsD](https://github.com/etsy/statsd/) / statsite instance (UDP) +* PrometheusSink: Sinks to a [Prometheus](http://prometheus.io/) metrics endpoint (exposed via HTTP for scrapes) * InmemSink : Provides in-memory aggregation, can be used to export stats * FanoutSink : Sinks to multiple sinks. Enables writing to multiple statsite instances for example. * BlackholeSink : Sinks to nowhere @@ -20,49 +23,69 @@ In addition to the sinks, the `InmemSignal` can be used to catch a signal, and dump a formatted output of recent metrics. For example, when a process gets a SIGUSR1, it can dump to stderr recent performance metrics for debugging. +Labels +------ + +Most metrics do have an equivalent ending with `WithLabels`, such methods +allow to push metrics with labels and use some features of underlying Sinks +(ex: translated into Prometheus labels). + +Since some of these labels may increase greatly cardinality of metrics, the +library allow to filter labels using a blacklist/whitelist filtering system +which is global to all metrics. + +* If `Config.AllowedLabels` is not nil, then only labels specified in this value will be sent to underlying Sink, otherwise, all labels are sent by default. +* If `Config.BlockedLabels` is not nil, any label specified in this value will not be sent to underlying Sinks. + +By default, both `Config.AllowedLabels` and `Config.BlockedLabels` are nil, meaning that +no tags are filetered at all, but it allow to a user to globally block some tags with high +cardinality at application level. 
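A hedged example of the label filtering just described, using the Config.AllowedLabels and Config.BlockedLabels fields the README names; the metric and label names here are illustrative:

```go
package main

import (
	"time"

	"github.com/armon/go-metrics"
)

func main() {
	conf := metrics.DefaultConfig("service-name")
	conf.AllowedLabels = []string{"datacenter"}
	conf.BlockedLabels = []string{"request_id"} // high cardinality, drop globally

	inm := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.NewGlobal(conf, inm)

	metrics.IncrCounterWithLabels([]string{"requests"}, 1,
		[]metrics.Label{
			{Name: "datacenter", Value: "dc1"},    // kept
			{Name: "request_id", Value: "abc123"}, // filtered out
		})
}
```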
+ Examples -======== +-------- Here is an example of using the package: - func SlowMethod() { - // Profiling the runtime of a method - defer metrics.MeasureSince([]string{"SlowMethod"}, time.Now()) - } +```go +func SlowMethod() { + // Profiling the runtime of a method + defer metrics.MeasureSince([]string{"SlowMethod"}, time.Now()) +} - // Configure a statsite sink as the global metrics sink - sink, _ := metrics.NewStatsiteSink("statsite:8125") - metrics.NewGlobal(metrics.DefaultConfig("service-name"), sink) +// Configure a statsite sink as the global metrics sink +sink, _ := metrics.NewStatsiteSink("statsite:8125") +metrics.NewGlobal(metrics.DefaultConfig("service-name"), sink) - // Emit a Key/Value pair - metrics.EmitKey([]string{"questions", "meaning of life"}, 42) +// Emit a Key/Value pair +metrics.EmitKey([]string{"questions", "meaning of life"}, 42) +``` +Here is an example of setting up a signal handler: -Here is an example of setting up an signal handler: +```go +// Setup the inmem sink and signal handler +inm := metrics.NewInmemSink(10*time.Second, time.Minute) +sig := metrics.DefaultInmemSignal(inm) +metrics.NewGlobal(metrics.DefaultConfig("service-name"), inm) - // Setup the inmem sink and signal handler - inm := NewInmemSink(10*time.Second, time.Minute) - sig := DefaultInmemSignal(inm) - metrics.NewGlobal(metrics.DefaultConfig("service-name"), inm) +// Run some code +inm.SetGauge([]string{"foo"}, 42) +inm.EmitKey([]string{"bar"}, 30) - // Run some code - inm.SetGauge([]string{"foo"}, 42) - inm.EmitKey([]string{"bar"}, 30) +inm.IncrCounter([]string{"baz"}, 42) +inm.IncrCounter([]string{"baz"}, 1) +inm.IncrCounter([]string{"baz"}, 80) - inm.IncrCounter([]string{"baz"}, 42) - inm.IncrCounter([]string{"baz"}, 1) - inm.IncrCounter([]string{"baz"}, 80) +inm.AddSample([]string{"method", "wow"}, 42) +inm.AddSample([]string{"method", "wow"}, 100) +inm.AddSample([]string{"method", "wow"}, 22) - inm.AddSample([]string{"method", "wow"}, 42) - inm.AddSample([]string{"method", "wow"}, 100) - inm.AddSample([]string{"method", "wow"}, 22) - - .... +.... 
+``` When a signal comes in, output like the following will be dumped to stderr: [2014-01-28 14:57:33.04 -0800 PST][G] 'foo': 42.000 [2014-01-28 14:57:33.04 -0800 PST][P] 'bar': 30.000 - [2014-01-28 14:57:33.04 -0800 PST][C] 'baz': Count: 3 Min: 1.000 Mean: 41.000 Max: 80.000 Stddev: 39.509 - [2014-01-28 14:57:33.04 -0800 PST][S] 'method.wow': Count: 3 Min: 22.000 Mean: 54.667 Max: 100.000 Stddev: 40.513 - + [2014-01-28 14:57:33.04 -0800 PST][C] 'baz': Count: 3 Min: 1.000 Mean: 41.000 Max: 80.000 Stddev: 39.509 + [2014-01-28 14:57:33.04 -0800 PST][S] 'method.wow': Count: 3 Min: 22.000 Mean: 54.667 Max: 100.000 Stddev: 40.513 \ No newline at end of file diff --git a/vendor/github.com/armon/go-metrics/go.mod b/vendor/github.com/armon/go-metrics/go.mod new file mode 100644 index 0000000000..88e1e98fbf --- /dev/null +++ b/vendor/github.com/armon/go-metrics/go.mod @@ -0,0 +1,16 @@ +module github.com/armon/go-metrics + +go 1.12 + +require ( + github.com/DataDog/datadog-go v2.2.0+incompatible + github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible + github.com/circonus-labs/circonusllhist v0.1.3 // indirect + github.com/hashicorp/go-immutable-radix v1.0.0 + github.com/hashicorp/go-retryablehttp v0.5.3 // indirect + github.com/pascaldekloe/goe v0.1.0 + github.com/pkg/errors v0.8.1 // indirect + github.com/prometheus/client_golang v0.9.2 + github.com/stretchr/testify v1.3.0 // indirect + github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926 // indirect +) diff --git a/vendor/github.com/armon/go-metrics/inmem.go b/vendor/github.com/armon/go-metrics/inmem.go index 0749229bfd..93b0e0ad83 100644 --- a/vendor/github.com/armon/go-metrics/inmem.go +++ b/vendor/github.com/armon/go-metrics/inmem.go @@ -1,8 +1,10 @@ package metrics import ( + "bytes" "fmt" "math" + "net/url" "strings" "sync" "time" @@ -25,6 +27,8 @@ type InmemSink struct { // intervals is a slice of the retained intervals intervals []*IntervalMetrics intervalLock sync.RWMutex + + rateDenom float64 } // IntervalMetrics stores the aggregated metrics @@ -36,7 +40,7 @@ type IntervalMetrics struct { Interval time.Time // Gauges maps the key to the last set value - Gauges map[string]float32 + Gauges map[string]GaugeValue // Points maps the string to the list of emitted values // from EmitKey @@ -44,32 +48,34 @@ type IntervalMetrics struct { // Counters maps the string key to a sum of the counter // values - Counters map[string]*AggregateSample + Counters map[string]SampledValue // Samples maps the key to an AggregateSample, // which has the rolled up view of a sample - Samples map[string]*AggregateSample + Samples map[string]SampledValue } // NewIntervalMetrics creates a new IntervalMetrics for a given interval func NewIntervalMetrics(intv time.Time) *IntervalMetrics { return &IntervalMetrics{ Interval: intv, - Gauges: make(map[string]float32), + Gauges: make(map[string]GaugeValue), Points: make(map[string][]float32), - Counters: make(map[string]*AggregateSample), - Samples: make(map[string]*AggregateSample), + Counters: make(map[string]SampledValue), + Samples: make(map[string]SampledValue), } } // AggregateSample is used to hold aggregate metrics // about a sample type AggregateSample struct { - Count int // The count of emitted pairs - Sum float64 // The sum of values - SumSq float64 // The sum of squared values - Min float64 // Minimum value - Max float64 // Maximum value + Count int // The count of emitted pairs + Rate float64 // The values rate per time unit (usually 1 second) + Sum float64 // The sum of values + SumSq 
float64 `json:"-"` // The sum of squared values + Min float64 // Minimum value + Max float64 // Maximum value + LastUpdated time.Time `json:"-"` // When value was last updated } // Computes a Stddev of the values @@ -91,7 +97,7 @@ func (a *AggregateSample) Mean() float64 { } // Ingest is used to update a sample -func (a *AggregateSample) Ingest(v float64) { +func (a *AggregateSample) Ingest(v float64, rateDenom float64) { a.Count++ a.Sum += v a.SumSq += (v * v) @@ -101,38 +107,64 @@ func (a *AggregateSample) Ingest(v float64) { if v > a.Max || a.Count == 1 { a.Max = v } + a.Rate = float64(a.Sum) / rateDenom + a.LastUpdated = time.Now() } func (a *AggregateSample) String() string { if a.Count == 0 { return "Count: 0" } else if a.Stddev() == 0 { - return fmt.Sprintf("Count: %d Sum: %0.3f", a.Count, a.Sum) + return fmt.Sprintf("Count: %d Sum: %0.3f LastUpdated: %s", a.Count, a.Sum, a.LastUpdated) } else { - return fmt.Sprintf("Count: %d Min: %0.3f Mean: %0.3f Max: %0.3f Stddev: %0.3f Sum: %0.3f", - a.Count, a.Min, a.Mean(), a.Max, a.Stddev(), a.Sum) + return fmt.Sprintf("Count: %d Min: %0.3f Mean: %0.3f Max: %0.3f Stddev: %0.3f Sum: %0.3f LastUpdated: %s", + a.Count, a.Min, a.Mean(), a.Max, a.Stddev(), a.Sum, a.LastUpdated) + } +} + +// NewInmemSinkFromURL creates an InmemSink from a URL. It is used +// (and tested) from NewMetricSinkFromURL. +func NewInmemSinkFromURL(u *url.URL) (MetricSink, error) { + params := u.Query() + + interval, err := time.ParseDuration(params.Get("interval")) + if err != nil { + return nil, fmt.Errorf("Bad 'interval' param: %s", err) } + + retain, err := time.ParseDuration(params.Get("retain")) + if err != nil { + return nil, fmt.Errorf("Bad 'retain' param: %s", err) + } + + return NewInmemSink(interval, retain), nil } // NewInmemSink is used to construct a new in-memory sink. // Uses an aggregation interval and maximum retention period. 
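NewInmemSinkFromURL above derives the aggregation interval and retention period from URL query parameters. A minimal usage sketch; the inmem:// scheme string is illustrative, since only the interval and retain params are consulted:

```go
package main

import (
	"fmt"
	"net/url"

	"github.com/armon/go-metrics"
)

func main() {
	// Both params are parsed with time.ParseDuration.
	u, err := url.Parse("inmem://?interval=10s&retain=1m")
	if err != nil {
		panic(err)
	}
	sink, err := metrics.NewInmemSinkFromURL(u)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%T\n", sink) // *metrics.InmemSink
}
```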
func NewInmemSink(interval, retain time.Duration) *InmemSink { + rateTimeUnit := time.Second i := &InmemSink{ interval: interval, retain: retain, maxIntervals: int(retain / interval), + rateDenom: float64(interval.Nanoseconds()) / float64(rateTimeUnit.Nanoseconds()), } i.intervals = make([]*IntervalMetrics, 0, i.maxIntervals) return i } func (i *InmemSink) SetGauge(key []string, val float32) { - k := i.flattenKey(key) + i.SetGaugeWithLabels(key, val, nil) +} + +func (i *InmemSink) SetGaugeWithLabels(key []string, val float32, labels []Label) { + k, name := i.flattenKeyLabels(key, labels) intv := i.getInterval() intv.Lock() defer intv.Unlock() - intv.Gauges[k] = val + intv.Gauges[k] = GaugeValue{Name: name, Value: val, Labels: labels} } func (i *InmemSink) EmitKey(key []string, val float32) { @@ -146,33 +178,49 @@ func (i *InmemSink) EmitKey(key []string, val float32) { } func (i *InmemSink) IncrCounter(key []string, val float32) { - k := i.flattenKey(key) + i.IncrCounterWithLabels(key, val, nil) +} + +func (i *InmemSink) IncrCounterWithLabels(key []string, val float32, labels []Label) { + k, name := i.flattenKeyLabels(key, labels) intv := i.getInterval() intv.Lock() defer intv.Unlock() - agg := intv.Counters[k] - if agg == nil { - agg = &AggregateSample{} + agg, ok := intv.Counters[k] + if !ok { + agg = SampledValue{ + Name: name, + AggregateSample: &AggregateSample{}, + Labels: labels, + } intv.Counters[k] = agg } - agg.Ingest(float64(val)) + agg.Ingest(float64(val), i.rateDenom) } func (i *InmemSink) AddSample(key []string, val float32) { - k := i.flattenKey(key) + i.AddSampleWithLabels(key, val, nil) +} + +func (i *InmemSink) AddSampleWithLabels(key []string, val float32, labels []Label) { + k, name := i.flattenKeyLabels(key, labels) intv := i.getInterval() intv.Lock() defer intv.Unlock() - agg := intv.Samples[k] - if agg == nil { - agg = &AggregateSample{} + agg, ok := intv.Samples[k] + if !ok { + agg = SampledValue{ + Name: name, + AggregateSample: &AggregateSample{}, + Labels: labels, + } intv.Samples[k] = agg } - agg.Ingest(float64(val)) + agg.Ingest(float64(val), i.rateDenom) } // Data is used to retrieve all the aggregated metrics @@ -184,8 +232,37 @@ func (i *InmemSink) Data() []*IntervalMetrics { i.intervalLock.RLock() defer i.intervalLock.RUnlock() - intervals := make([]*IntervalMetrics, len(i.intervals)) - copy(intervals, i.intervals) + n := len(i.intervals) + intervals := make([]*IntervalMetrics, n) + + copy(intervals[:n-1], i.intervals[:n-1]) + current := i.intervals[n-1] + + // make its own copy for current interval + intervals[n-1] = &IntervalMetrics{} + copyCurrent := intervals[n-1] + current.RLock() + *copyCurrent = *current + + copyCurrent.Gauges = make(map[string]GaugeValue, len(current.Gauges)) + for k, v := range current.Gauges { + copyCurrent.Gauges[k] = v + } + // saved values will be not change, just copy its link + copyCurrent.Points = make(map[string][]float32, len(current.Points)) + for k, v := range current.Points { + copyCurrent.Points[k] = v + } + copyCurrent.Counters = make(map[string]SampledValue, len(current.Counters)) + for k, v := range current.Counters { + copyCurrent.Counters[k] = v.deepCopy() + } + copyCurrent.Samples = make(map[string]SampledValue, len(current.Samples)) + for k, v := range current.Samples { + copyCurrent.Samples[k] = v.deepCopy() + } + current.RUnlock() + return intervals } @@ -234,6 +311,38 @@ func (i *InmemSink) getInterval() *IntervalMetrics { // Flattens the key for formatting, removes spaces func (i *InmemSink) flattenKey(parts 
[]string) string { - joined := strings.Join(parts, ".") - return strings.Replace(joined, " ", "_", -1) + buf := &bytes.Buffer{} + replacer := strings.NewReplacer(" ", "_") + + if len(parts) > 0 { + replacer.WriteString(buf, parts[0]) + } + for _, part := range parts[1:] { + replacer.WriteString(buf, ".") + replacer.WriteString(buf, part) + } + + return buf.String() +} + +// Flattens the key for formatting along with its labels, removes spaces +func (i *InmemSink) flattenKeyLabels(parts []string, labels []Label) (string, string) { + buf := &bytes.Buffer{} + replacer := strings.NewReplacer(" ", "_") + + if len(parts) > 0 { + replacer.WriteString(buf, parts[0]) + } + for _, part := range parts[1:] { + replacer.WriteString(buf, ".") + replacer.WriteString(buf, part) + } + + key := buf.String() + + for _, label := range labels { + replacer.WriteString(buf, fmt.Sprintf(";%s=%s", label.Name, label.Value)) + } + + return buf.String(), key } diff --git a/vendor/github.com/armon/go-metrics/inmem_endpoint.go b/vendor/github.com/armon/go-metrics/inmem_endpoint.go new file mode 100644 index 0000000000..5fac958d94 --- /dev/null +++ b/vendor/github.com/armon/go-metrics/inmem_endpoint.go @@ -0,0 +1,131 @@ +package metrics + +import ( + "fmt" + "net/http" + "sort" + "time" +) + +// MetricsSummary holds a roll-up of metrics info for a given interval +type MetricsSummary struct { + Timestamp string + Gauges []GaugeValue + Points []PointValue + Counters []SampledValue + Samples []SampledValue +} + +type GaugeValue struct { + Name string + Hash string `json:"-"` + Value float32 + + Labels []Label `json:"-"` + DisplayLabels map[string]string `json:"Labels"` +} + +type PointValue struct { + Name string + Points []float32 +} + +type SampledValue struct { + Name string + Hash string `json:"-"` + *AggregateSample + Mean float64 + Stddev float64 + + Labels []Label `json:"-"` + DisplayLabels map[string]string `json:"Labels"` +} + +// deepCopy allocates a new instance of AggregateSample +func (source *SampledValue) deepCopy() SampledValue { + dest := *source + if source.AggregateSample != nil { + dest.AggregateSample = &AggregateSample{} + *dest.AggregateSample = *source.AggregateSample + } + return dest +} + +// DisplayMetrics returns a summary of the metrics from the most recent finished interval. +func (i *InmemSink) DisplayMetrics(resp http.ResponseWriter, req *http.Request) (interface{}, error) { + data := i.Data() + + var interval *IntervalMetrics + n := len(data) + switch { + case n == 0: + return nil, fmt.Errorf("no metric intervals have been initialized yet") + case n == 1: + // Show the current interval if it's all we have + interval = data[0] + default: + // Show the most recent finished interval if we have one + interval = data[n-2] + } + + interval.RLock() + defer interval.RUnlock() + + summary := MetricsSummary{ + Timestamp: interval.Interval.Round(time.Second).UTC().String(), + Gauges: make([]GaugeValue, 0, len(interval.Gauges)), + Points: make([]PointValue, 0, len(interval.Points)), + } + + // Format and sort the output of each metric type, so it gets displayed in a + // deterministic order. 
+ for name, points := range interval.Points { + summary.Points = append(summary.Points, PointValue{name, points}) + } + sort.Slice(summary.Points, func(i, j int) bool { + return summary.Points[i].Name < summary.Points[j].Name + }) + + for hash, value := range interval.Gauges { + value.Hash = hash + value.DisplayLabels = make(map[string]string) + for _, label := range value.Labels { + value.DisplayLabels[label.Name] = label.Value + } + value.Labels = nil + + summary.Gauges = append(summary.Gauges, value) + } + sort.Slice(summary.Gauges, func(i, j int) bool { + return summary.Gauges[i].Hash < summary.Gauges[j].Hash + }) + + summary.Counters = formatSamples(interval.Counters) + summary.Samples = formatSamples(interval.Samples) + + return summary, nil +} + +func formatSamples(source map[string]SampledValue) []SampledValue { + output := make([]SampledValue, 0, len(source)) + for hash, sample := range source { + displayLabels := make(map[string]string) + for _, label := range sample.Labels { + displayLabels[label.Name] = label.Value + } + + output = append(output, SampledValue{ + Name: sample.Name, + Hash: hash, + AggregateSample: sample.AggregateSample, + Mean: sample.AggregateSample.Mean(), + Stddev: sample.AggregateSample.Stddev(), + DisplayLabels: displayLabels, + }) + } + sort.Slice(output, func(i, j int) bool { + return output[i].Hash < output[j].Hash + }) + + return output +} diff --git a/vendor/github.com/armon/go-metrics/inmem_signal.go b/vendor/github.com/armon/go-metrics/inmem_signal.go index 95d08ee10f..0937f4aedf 100644 --- a/vendor/github.com/armon/go-metrics/inmem_signal.go +++ b/vendor/github.com/armon/go-metrics/inmem_signal.go @@ -6,6 +6,7 @@ import ( "io" "os" "os/signal" + "strings" "sync" "syscall" ) @@ -75,22 +76,25 @@ func (i *InmemSignal) dumpStats() { data := i.inm.Data() // Skip the last period which is still being aggregated - for i := 0; i < len(data)-1; i++ { - intv := data[i] + for j := 0; j < len(data)-1; j++ { + intv := data[j] intv.RLock() - for name, val := range intv.Gauges { - fmt.Fprintf(buf, "[%v][G] '%s': %0.3f\n", intv.Interval, name, val) + for _, val := range intv.Gauges { + name := i.flattenLabels(val.Name, val.Labels) + fmt.Fprintf(buf, "[%v][G] '%s': %0.3f\n", intv.Interval, name, val.Value) } for name, vals := range intv.Points { for _, val := range vals { fmt.Fprintf(buf, "[%v][P] '%s': %0.3f\n", intv.Interval, name, val) } } - for name, agg := range intv.Counters { - fmt.Fprintf(buf, "[%v][C] '%s': %s\n", intv.Interval, name, agg) + for _, agg := range intv.Counters { + name := i.flattenLabels(agg.Name, agg.Labels) + fmt.Fprintf(buf, "[%v][C] '%s': %s\n", intv.Interval, name, agg.AggregateSample) } - for name, agg := range intv.Samples { - fmt.Fprintf(buf, "[%v][S] '%s': %s\n", intv.Interval, name, agg) + for _, agg := range intv.Samples { + name := i.flattenLabels(agg.Name, agg.Labels) + fmt.Fprintf(buf, "[%v][S] '%s': %s\n", intv.Interval, name, agg.AggregateSample) } intv.RUnlock() } @@ -98,3 +102,16 @@ func (i *InmemSignal) dumpStats() { // Write out the bytes i.w.Write(buf.Bytes()) } + +// Flattens the key for formatting along with its labels, removes spaces +func (i *InmemSignal) flattenLabels(name string, labels []Label) string { + buf := bytes.NewBufferString(name) + replacer := strings.NewReplacer(" ", "_", ":", "_") + + for _, label := range labels { + replacer.WriteString(buf, ".") + replacer.WriteString(buf, label.Value) + } + + return buf.String() +} diff --git a/vendor/github.com/armon/go-metrics/metrics.go 
b/vendor/github.com/armon/go-metrics/metrics.go old mode 100755 new mode 100644 index b818e4182c..4920d68324 --- a/vendor/github.com/armon/go-metrics/metrics.go +++ b/vendor/github.com/armon/go-metrics/metrics.go @@ -2,20 +2,44 @@ package metrics import ( "runtime" + "strings" "time" + + "github.com/hashicorp/go-immutable-radix" ) +type Label struct { + Name string + Value string +} + func (m *Metrics) SetGauge(key []string, val float32) { - if m.HostName != "" && m.EnableHostname { - key = insert(0, m.HostName, key) + m.SetGaugeWithLabels(key, val, nil) +} + +func (m *Metrics) SetGaugeWithLabels(key []string, val float32, labels []Label) { + if m.HostName != "" { + if m.EnableHostnameLabel { + labels = append(labels, Label{"host", m.HostName}) + } else if m.EnableHostname { + key = insert(0, m.HostName, key) + } } if m.EnableTypePrefix { key = insert(0, "gauge", key) } if m.ServiceName != "" { - key = insert(0, m.ServiceName, key) + if m.EnableServiceLabel { + labels = append(labels, Label{"service", m.ServiceName}) + } else { + key = insert(0, m.ServiceName, key) + } + } + allowed, labelsFiltered := m.allowMetric(key, labels) + if !allowed { + return } - m.sink.SetGauge(key, val) + m.sink.SetGaugeWithLabels(key, val, labelsFiltered) } func (m *Metrics) EmitKey(key []string, val float32) { @@ -25,40 +49,179 @@ func (m *Metrics) EmitKey(key []string, val float32) { if m.ServiceName != "" { key = insert(0, m.ServiceName, key) } + allowed, _ := m.allowMetric(key, nil) + if !allowed { + return + } m.sink.EmitKey(key, val) } func (m *Metrics) IncrCounter(key []string, val float32) { + m.IncrCounterWithLabels(key, val, nil) +} + +func (m *Metrics) IncrCounterWithLabels(key []string, val float32, labels []Label) { + if m.HostName != "" && m.EnableHostnameLabel { + labels = append(labels, Label{"host", m.HostName}) + } if m.EnableTypePrefix { key = insert(0, "counter", key) } if m.ServiceName != "" { - key = insert(0, m.ServiceName, key) + if m.EnableServiceLabel { + labels = append(labels, Label{"service", m.ServiceName}) + } else { + key = insert(0, m.ServiceName, key) + } + } + allowed, labelsFiltered := m.allowMetric(key, labels) + if !allowed { + return } - m.sink.IncrCounter(key, val) + m.sink.IncrCounterWithLabels(key, val, labelsFiltered) } func (m *Metrics) AddSample(key []string, val float32) { + m.AddSampleWithLabels(key, val, nil) +} + +func (m *Metrics) AddSampleWithLabels(key []string, val float32, labels []Label) { + if m.HostName != "" && m.EnableHostnameLabel { + labels = append(labels, Label{"host", m.HostName}) + } if m.EnableTypePrefix { key = insert(0, "sample", key) } if m.ServiceName != "" { - key = insert(0, m.ServiceName, key) + if m.EnableServiceLabel { + labels = append(labels, Label{"service", m.ServiceName}) + } else { + key = insert(0, m.ServiceName, key) + } + } + allowed, labelsFiltered := m.allowMetric(key, labels) + if !allowed { + return } - m.sink.AddSample(key, val) + m.sink.AddSampleWithLabels(key, val, labelsFiltered) } func (m *Metrics) MeasureSince(key []string, start time.Time) { + m.MeasureSinceWithLabels(key, start, nil) +} + +func (m *Metrics) MeasureSinceWithLabels(key []string, start time.Time, labels []Label) { + if m.HostName != "" && m.EnableHostnameLabel { + labels = append(labels, Label{"host", m.HostName}) + } if m.EnableTypePrefix { key = insert(0, "timer", key) } if m.ServiceName != "" { - key = insert(0, m.ServiceName, key) + if m.EnableServiceLabel { + labels = append(labels, Label{"service", m.ServiceName}) + } else { + key = insert(0, 
m.ServiceName, key) + } + } + allowed, labelsFiltered := m.allowMetric(key, labels) + if !allowed { + return } now := time.Now() elapsed := now.Sub(start) msec := float32(elapsed.Nanoseconds()) / float32(m.TimerGranularity) - m.sink.AddSample(key, msec) + m.sink.AddSampleWithLabels(key, msec, labelsFiltered) +} + +// UpdateFilter overwrites the existing filter with the given rules. +func (m *Metrics) UpdateFilter(allow, block []string) { + m.UpdateFilterAndLabels(allow, block, m.AllowedLabels, m.BlockedLabels) +} + +// UpdateFilterAndLabels overwrites the existing filter with the given rules. +func (m *Metrics) UpdateFilterAndLabels(allow, block, allowedLabels, blockedLabels []string) { + m.filterLock.Lock() + defer m.filterLock.Unlock() + + m.AllowedPrefixes = allow + m.BlockedPrefixes = block + + if allowedLabels == nil { + // Having a white list means we take only elements from it + m.allowedLabels = nil + } else { + m.allowedLabels = make(map[string]bool) + for _, v := range allowedLabels { + m.allowedLabels[v] = true + } + } + m.blockedLabels = make(map[string]bool) + for _, v := range blockedLabels { + m.blockedLabels[v] = true + } + m.AllowedLabels = allowedLabels + m.BlockedLabels = blockedLabels + + m.filter = iradix.New() + for _, prefix := range m.AllowedPrefixes { + m.filter, _, _ = m.filter.Insert([]byte(prefix), true) + } + for _, prefix := range m.BlockedPrefixes { + m.filter, _, _ = m.filter.Insert([]byte(prefix), false) + } +} + +// labelIsAllowed returns true if the given label should be included in the metric. +// The caller should hold m.filterLock while calling this method. +func (m *Metrics) labelIsAllowed(label *Label) bool { + labelName := (*label).Name + if m.blockedLabels != nil { + _, ok := m.blockedLabels[labelName] + if ok { + // If present, let's remove this label + return false + } + } + if m.allowedLabels != nil { + _, ok := m.allowedLabels[labelName] + return ok + } + // Allow by default + return true +} + +// filterLabels returns only the allowed labels. +// The caller should hold m.filterLock while calling this method. +func (m *Metrics) filterLabels(labels []Label) []Label { + if labels == nil { + return nil + } + toReturn := []Label{} + for _, label := range labels { + if m.labelIsAllowed(&label) { + toReturn = append(toReturn, label) + } + } + return toReturn +} + +// Returns whether the metric should be allowed based on the configured prefix filters. +// Also returns the applicable labels. +func (m *Metrics) allowMetric(key []string, labels []Label) (bool, []Label) { + m.filterLock.RLock() + defer m.filterLock.RUnlock() + + if m.filter == nil || m.filter.Len() == 0 { + return m.Config.FilterDefault, m.filterLabels(labels) + } + + _, allowed, ok := m.filter.Root().LongestPrefix([]byte(strings.Join(key, "."))) + if !ok { + return m.Config.FilterDefault, m.filterLabels(labels) + } + + return allowed.(bool), m.filterLabels(labels) } // Periodically collects runtime stats to publish diff --git a/vendor/github.com/armon/go-metrics/sink.go b/vendor/github.com/armon/go-metrics/sink.go old mode 100755 new mode 100644 index 0c240c2c47..0b7d6e4be4 --- a/vendor/github.com/armon/go-metrics/sink.go +++ b/vendor/github.com/armon/go-metrics/sink.go @@ -1,35 +1,50 @@ package metrics +import ( + "fmt" + "net/url" +) + // The MetricSink interface is used to transmit metrics information // to an external system type MetricSink interface { // A Gauge should retain the last value it is set to SetGauge(key []string, val float32) + SetGaugeWithLabels(key []string, val float32, labels []Label) // Should emit a
Key/Value pair for each call EmitKey(key []string, val float32) // Counters should accumulate values IncrCounter(key []string, val float32) + IncrCounterWithLabels(key []string, val float32, labels []Label) // Samples are for timing information, where quantiles are used AddSample(key []string, val float32) + AddSampleWithLabels(key []string, val float32, labels []Label) } // BlackholeSink is used to just blackhole messages type BlackholeSink struct{} -func (*BlackholeSink) SetGauge(key []string, val float32) {} -func (*BlackholeSink) EmitKey(key []string, val float32) {} -func (*BlackholeSink) IncrCounter(key []string, val float32) {} -func (*BlackholeSink) AddSample(key []string, val float32) {} +func (*BlackholeSink) SetGauge(key []string, val float32) {} +func (*BlackholeSink) SetGaugeWithLabels(key []string, val float32, labels []Label) {} +func (*BlackholeSink) EmitKey(key []string, val float32) {} +func (*BlackholeSink) IncrCounter(key []string, val float32) {} +func (*BlackholeSink) IncrCounterWithLabels(key []string, val float32, labels []Label) {} +func (*BlackholeSink) AddSample(key []string, val float32) {} +func (*BlackholeSink) AddSampleWithLabels(key []string, val float32, labels []Label) {} // FanoutSink is used to fan out values to multiple sinks type FanoutSink []MetricSink func (fh FanoutSink) SetGauge(key []string, val float32) { + fh.SetGaugeWithLabels(key, val, nil) +} + +func (fh FanoutSink) SetGaugeWithLabels(key []string, val float32, labels []Label) { for _, s := range fh { - s.SetGauge(key, val) + s.SetGaugeWithLabels(key, val, labels) } } @@ -40,13 +55,61 @@ func (fh FanoutSink) EmitKey(key []string, val float32) { } func (fh FanoutSink) IncrCounter(key []string, val float32) { + fh.IncrCounterWithLabels(key, val, nil) +} + +func (fh FanoutSink) IncrCounterWithLabels(key []string, val float32, labels []Label) { for _, s := range fh { - s.IncrCounter(key, val) + s.IncrCounterWithLabels(key, val, labels) } } func (fh FanoutSink) AddSample(key []string, val float32) { + fh.AddSampleWithLabels(key, val, nil) +} + +func (fh FanoutSink) AddSampleWithLabels(key []string, val float32, labels []Label) { for _, s := range fh { - s.AddSample(key, val) + s.AddSampleWithLabels(key, val, labels) + } +} + +// sinkURLFactoryFunc is a generic interface around the *SinkFromURL() function provided +// by each sink type +type sinkURLFactoryFunc func(*url.URL) (MetricSink, error) + +// sinkRegistry supports the generic NewMetricSink function by mapping URL +// schemes to metric sink factory functions +var sinkRegistry = map[string]sinkURLFactoryFunc{ + "statsd": NewStatsdSinkFromURL, + "statsite": NewStatsiteSinkFromURL, + "inmem": NewInmemSinkFromURL, +} + +// NewMetricSinkFromURL allows a generic URL input to configure any of the +// supported sinks. The scheme of the URL identifies the type of the sink, and the + query parameters are used to set options. +// +// "statsd://" - Initializes a StatsdSink. The host and port are passed through +// as the "addr" of the sink +// +// "statsite://" - Initializes a StatsiteSink. The host and port become the +// "addr" of the sink +// +// "inmem://" - Initializes an InmemSink. The host and port are ignored. The +// "interval" and "duration" query parameters must be specified with valid +// durations, see NewInmemSink for details.
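(For illustration only, not part of the vendored change: a minimal sketch of how a caller might use this constructor. The address and metric key below are made up.)

```go
package main

import (
	"log"

	"github.com/armon/go-metrics"
)

func main() {
	// The "statsd" scheme selects NewStatsdSinkFromURL via sinkRegistry;
	// host:port becomes the sink's addr.
	sink, err := metrics.NewMetricSinkFromURL("statsd://127.0.0.1:8125")
	if err != nil {
		log.Fatal(err)
	}
	sink.SetGauge([]string{"example", "queue", "depth"}, 42)
}
```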
+func NewMetricSinkFromURL(urlStr string) (MetricSink, error) { + u, err := url.Parse(urlStr) + if err != nil { + return nil, err + } + + sinkURLFactoryFunc := sinkRegistry[u.Scheme] + if sinkURLFactoryFunc == nil { + return nil, fmt.Errorf( + "cannot create metric sink, unrecognized sink name: %q", u.Scheme) } + + return sinkURLFactoryFunc(u) } diff --git a/vendor/github.com/armon/go-metrics/start.go b/vendor/github.com/armon/go-metrics/start.go old mode 100755 new mode 100644 index 44113f1004..32a28c4837 --- a/vendor/github.com/armon/go-metrics/start.go +++ b/vendor/github.com/armon/go-metrics/start.go @@ -2,34 +2,50 @@ package metrics import ( "os" + "sync" + "sync/atomic" "time" + + "github.com/hashicorp/go-immutable-radix" ) // Config is used to configure metrics settings type Config struct { - ServiceName string // Prefixed with keys to seperate services + ServiceName string // Prefixed with keys to separate services HostName string // Hostname to use. If not provided and EnableHostname, it will be os.Hostname EnableHostname bool // Enable prefixing gauge values with hostname + EnableHostnameLabel bool // Enable adding hostname to labels + EnableServiceLabel bool // Enable adding service to labels EnableRuntimeMetrics bool // Enables profiling of runtime metrics (GC, Goroutines, Memory) EnableTypePrefix bool // Prefixes key with a type ("counter", "gauge", "timer") TimerGranularity time.Duration // Granularity of timers. ProfileInterval time.Duration // Interval to profile runtime metrics + + AllowedPrefixes []string // A list of metric prefixes to allow, with '.' as the separator + BlockedPrefixes []string // A list of metric prefixes to block, with '.' as the separator + AllowedLabels []string // A list of metric labels to allow, with '.' as the separator + BlockedLabels []string // A list of metric labels to block, with '.' 
as the separator + FilterDefault bool // Whether to allow metrics by default } // Metrics represents an instance of a metrics sink that can // be used to emit metrics type Metrics struct { Config - lastNumGC uint32 - sink MetricSink + lastNumGC uint32 + sink MetricSink + filter *iradix.Tree + allowedLabels map[string]bool + blockedLabels map[string]bool + filterLock sync.RWMutex // Lock filters and allowedLabels/blockedLabels access } // Shared global metrics instance -var globalMetrics *Metrics +var globalMetrics atomic.Value // *Metrics func init() { // Initialize to a blackhole sink to avoid errors - globalMetrics = &Metrics{sink: &BlackholeSink{}} + globalMetrics.Store(&Metrics{sink: &BlackholeSink{}}) } // DefaultConfig provides a sane default configuration @@ -42,6 +58,7 @@ func DefaultConfig(serviceName string) *Config { EnableTypePrefix: false, // Disable type prefix TimerGranularity: time.Millisecond, // Timers are in milliseconds ProfileInterval: time.Second, // Poll runtime every second + FilterDefault: true, // Don't filter metrics by default } // Try to get the hostname @@ -55,6 +72,7 @@ func New(conf *Config, sink MetricSink) (*Metrics, error) { met := &Metrics{} met.Config = *conf met.sink = sink + met.UpdateFilterAndLabels(conf.AllowedPrefixes, conf.BlockedPrefixes, conf.AllowedLabels, conf.BlockedLabels) // Start the runtime collector if conf.EnableRuntimeMetrics { @@ -68,28 +86,56 @@ func New(conf *Config, sink MetricSink) (*Metrics, error) { func NewGlobal(conf *Config, sink MetricSink) (*Metrics, error) { metrics, err := New(conf, sink) if err == nil { - globalMetrics = metrics + globalMetrics.Store(metrics) } return metrics, err } // Proxy all the methods to the globalMetrics instance func SetGauge(key []string, val float32) { - globalMetrics.SetGauge(key, val) + globalMetrics.Load().(*Metrics).SetGauge(key, val) +} + +func SetGaugeWithLabels(key []string, val float32, labels []Label) { + globalMetrics.Load().(*Metrics).SetGaugeWithLabels(key, val, labels) } func EmitKey(key []string, val float32) { - globalMetrics.EmitKey(key, val) + globalMetrics.Load().(*Metrics).EmitKey(key, val) } func IncrCounter(key []string, val float32) { - globalMetrics.IncrCounter(key, val) + globalMetrics.Load().(*Metrics).IncrCounter(key, val) +} + +func IncrCounterWithLabels(key []string, val float32, labels []Label) { + globalMetrics.Load().(*Metrics).IncrCounterWithLabels(key, val, labels) } func AddSample(key []string, val float32) { - globalMetrics.AddSample(key, val) + globalMetrics.Load().(*Metrics).AddSample(key, val) +} + +func AddSampleWithLabels(key []string, val float32, labels []Label) { + globalMetrics.Load().(*Metrics).AddSampleWithLabels(key, val, labels) } func MeasureSince(key []string, start time.Time) { - globalMetrics.MeasureSince(key, start) + globalMetrics.Load().(*Metrics).MeasureSince(key, start) +} + +func MeasureSinceWithLabels(key []string, start time.Time, labels []Label) { + globalMetrics.Load().(*Metrics).MeasureSinceWithLabels(key, start, labels) +} + +func UpdateFilter(allow, block []string) { + globalMetrics.Load().(*Metrics).UpdateFilter(allow, block) +} + +// UpdateFilterAndLabels sets the allow/block prefixes of metrics, while allowedLabels +// and blockedLabels - when not nil - allow filtering labels globally in order to +// block/allow them (especially useful when a given label has a large number of +// values). See README.md for more information about usage.
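(For illustration only, not part of the vendored change: a minimal sketch of this filtering API in use. The prefixes and the label name below are made up.)

```go
package main

import "github.com/armon/go-metrics"

func main() {
	// Allow only "consul.api." metrics except the debug subtree, and drop
	// the high-cardinality "request_id" label everywhere. A nil
	// allowedLabels means no label allow-list is applied.
	metrics.UpdateFilterAndLabels(
		[]string{"consul.api."},       // allow prefixes
		[]string{"consul.api.debug."}, // block prefixes
		nil,                           // allowedLabels
		[]string{"request_id"},        // blockedLabels
	)
}
```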
+func UpdateFilterAndLabels(allow, block, allowedLabels, blockedLabels []string) { + globalMetrics.Load().(*Metrics).UpdateFilterAndLabels(allow, block, allowedLabels, blockedLabels) } diff --git a/vendor/github.com/armon/go-metrics/statsd.go b/vendor/github.com/armon/go-metrics/statsd.go index 65a5021a05..1bfffce46e 100644 --- a/vendor/github.com/armon/go-metrics/statsd.go +++ b/vendor/github.com/armon/go-metrics/statsd.go @@ -5,6 +5,7 @@ import ( "fmt" "log" "net" + "net/url" "strings" "time" ) @@ -23,6 +24,12 @@ type StatsdSink struct { metricQueue chan string } +// NewStatsdSinkFromURL creates a StatsdSink from a URL. It is used +// (and tested) from NewMetricSinkFromURL. +func NewStatsdSinkFromURL(u *url.URL) (MetricSink, error) { + return NewStatsdSink(u.Host) +} + // NewStatsdSink is used to create a new StatsdSink func NewStatsdSink(addr string) (*StatsdSink, error) { s := &StatsdSink{ @@ -43,6 +50,11 @@ func (s *StatsdSink) SetGauge(key []string, val float32) { s.pushMetric(fmt.Sprintf("%s:%f|g\n", flatKey, val)) } +func (s *StatsdSink) SetGaugeWithLabels(key []string, val float32, labels []Label) { + flatKey := s.flattenKeyLabels(key, labels) + s.pushMetric(fmt.Sprintf("%s:%f|g\n", flatKey, val)) +} + func (s *StatsdSink) EmitKey(key []string, val float32) { flatKey := s.flattenKey(key) s.pushMetric(fmt.Sprintf("%s:%f|kv\n", flatKey, val)) @@ -53,11 +65,21 @@ func (s *StatsdSink) IncrCounter(key []string, val float32) { s.pushMetric(fmt.Sprintf("%s:%f|c\n", flatKey, val)) } +func (s *StatsdSink) IncrCounterWithLabels(key []string, val float32, labels []Label) { + flatKey := s.flattenKeyLabels(key, labels) + s.pushMetric(fmt.Sprintf("%s:%f|c\n", flatKey, val)) +} + func (s *StatsdSink) AddSample(key []string, val float32) { flatKey := s.flattenKey(key) s.pushMetric(fmt.Sprintf("%s:%f|ms\n", flatKey, val)) } +func (s *StatsdSink) AddSampleWithLabels(key []string, val float32, labels []Label) { + flatKey := s.flattenKeyLabels(key, labels) + s.pushMetric(fmt.Sprintf("%s:%f|ms\n", flatKey, val)) +} + // Flattens the key for formatting, removes spaces func (s *StatsdSink) flattenKey(parts []string) string { joined := strings.Join(parts, ".") @@ -73,6 +95,14 @@ func (s *StatsdSink) flattenKey(parts []string) string { }, joined) } +// Flattens the key along with labels for formatting, removes spaces +func (s *StatsdSink) flattenKeyLabels(parts []string, labels []Label) string { + for _, label := range labels { + parts = append(parts, label.Value) + } + return s.flattenKey(parts) +} + // Does a non-blocking push to the metrics queue func (s *StatsdSink) pushMetric(m string) { select { diff --git a/vendor/github.com/armon/go-metrics/statsite.go b/vendor/github.com/armon/go-metrics/statsite.go old mode 100755 new mode 100644 index 68730139a7..6c0d284d2d --- a/vendor/github.com/armon/go-metrics/statsite.go +++ b/vendor/github.com/armon/go-metrics/statsite.go @@ -5,6 +5,7 @@ import ( "fmt" "log" "net" + "net/url" "strings" "time" ) @@ -16,6 +17,12 @@ const ( flushInterval = 100 * time.Millisecond ) +// NewStatsiteSinkFromURL creates a StatsiteSink from a URL. It is used +// (and tested) from NewMetricSinkFromURL.
+func NewStatsiteSinkFromURL(u *url.URL) (MetricSink, error) { + return NewStatsiteSink(u.Host) +} + // StatsiteSink provides a MetricSink that can be used with a // statsite metrics server type StatsiteSink struct { @@ -43,6 +50,11 @@ func (s *StatsiteSink) SetGauge(key []string, val float32) { s.pushMetric(fmt.Sprintf("%s:%f|g\n", flatKey, val)) } +func (s *StatsiteSink) SetGaugeWithLabels(key []string, val float32, labels []Label) { + flatKey := s.flattenKeyLabels(key, labels) + s.pushMetric(fmt.Sprintf("%s:%f|g\n", flatKey, val)) +} + func (s *StatsiteSink) EmitKey(key []string, val float32) { flatKey := s.flattenKey(key) s.pushMetric(fmt.Sprintf("%s:%f|kv\n", flatKey, val)) @@ -53,11 +65,21 @@ func (s *StatsiteSink) IncrCounter(key []string, val float32) { s.pushMetric(fmt.Sprintf("%s:%f|c\n", flatKey, val)) } +func (s *StatsiteSink) IncrCounterWithLabels(key []string, val float32, labels []Label) { + flatKey := s.flattenKeyLabels(key, labels) + s.pushMetric(fmt.Sprintf("%s:%f|c\n", flatKey, val)) +} + func (s *StatsiteSink) AddSample(key []string, val float32) { flatKey := s.flattenKey(key) s.pushMetric(fmt.Sprintf("%s:%f|ms\n", flatKey, val)) } +func (s *StatsiteSink) AddSampleWithLabels(key []string, val float32, labels []Label) { + flatKey := s.flattenKeyLabels(key, labels) + s.pushMetric(fmt.Sprintf("%s:%f|ms\n", flatKey, val)) +} + // Flattens the key for formatting, removes spaces func (s *StatsiteSink) flattenKey(parts []string) string { joined := strings.Join(parts, ".") @@ -73,6 +95,14 @@ func (s *StatsiteSink) flattenKey(parts []string) string { }, joined) } +// Flattens the key along with labels for formatting, removes spaces +func (s *StatsiteSink) flattenKeyLabels(parts []string, labels []Label) string { + for _, label := range labels { + parts = append(parts, label.Value) + } + return s.flattenKey(parts) +} + // Does a non-blocking push to the metrics queue func (s *StatsiteSink) pushMetric(m string) { select { From 10b9600aa728c8c3a0bcd099ea0d24feaf69b97d Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 26 Aug 2019 13:22:39 +0200 Subject: [PATCH 6/6] vendor hashicorp/go-immutable-radix v1.1.0 Signed-off-by: Sebastiaan van Stijn --- vendor.conf | 1 + .../hashicorp/go-immutable-radix/LICENSE | 363 ++++++++++ .../hashicorp/go-immutable-radix/README.md | 66 ++ .../hashicorp/go-immutable-radix/edges.go | 21 + .../hashicorp/go-immutable-radix/go.mod | 6 + .../hashicorp/go-immutable-radix/iradix.go | 662 ++++++++++++++++++ .../hashicorp/go-immutable-radix/iter.go | 188 +++++ .../hashicorp/go-immutable-radix/node.go | 304 ++++++++ .../hashicorp/go-immutable-radix/raw_iter.go | 78 +++ 9 files changed, 1689 insertions(+) create mode 100644 vendor/github.com/hashicorp/go-immutable-radix/LICENSE create mode 100644 vendor/github.com/hashicorp/go-immutable-radix/README.md create mode 100644 vendor/github.com/hashicorp/go-immutable-radix/edges.go create mode 100644 vendor/github.com/hashicorp/go-immutable-radix/go.mod create mode 100644 vendor/github.com/hashicorp/go-immutable-radix/iradix.go create mode 100644 vendor/github.com/hashicorp/go-immutable-radix/iter.go create mode 100644 vendor/github.com/hashicorp/go-immutable-radix/node.go create mode 100644 vendor/github.com/hashicorp/go-immutable-radix/raw_iter.go diff --git a/vendor.conf b/vendor.conf index 7c69849eb9..d56693a195 100644 --- a/vendor.conf +++ b/vendor.conf @@ -23,6 +23,7 @@ github.com/gorilla/mux 98cb6bf42e086f6af920b965c38c github.com/google/btree 4030bb1f1f0c35b30ca7009e9ebd06849dd45306 # v1.0.0 
github.com/hashicorp/consul 9a9cc9341bb487651a0399e3fc5e1e8a42e62dd9 # v0.5.2 github.com/hashicorp/errwrap 8a6fb523712970c966eefc6b39ed2c5e74880354 # v1.0.0 +github.com/hashicorp/go-immutable-radix 7dd1121b595e4e1bd6dd5caa78e0f5c454740379 # v1.1.0 github.com/hashicorp/go-msgpack ad60660ecf9c5a1eae0ca32182ed72bab5807961 # v0.5.5 github.com/hashicorp/go-multierror 886a7fbe3eb1c874d46f623bfa70af45f425b3d1 # v1.0.0 github.com/hashicorp/memberlist e1138a6a4d8a6eaec6c919aeae5efbe4d69b1ece # v0.1.4 diff --git a/vendor/github.com/hashicorp/go-immutable-radix/LICENSE b/vendor/github.com/hashicorp/go-immutable-radix/LICENSE new file mode 100644 index 0000000000..e87a115e46 --- /dev/null +++ b/vendor/github.com/hashicorp/go-immutable-radix/LICENSE @@ -0,0 +1,363 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. "Contributor" + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. "Contributor Version" + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the terms of + a Secondary License. + +1.6. "Executable Form" + + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + + means a work that combines Covered Software with other material, in a + separate file or files, that is not Covered Software. + +1.8. "License" + + means this document. + +1.9. "Licensable" + + means having the right to grant, to the maximum extent possible, whether + at the time of the initial grant or subsequently, any and all of the + rights conveyed by this License. + +1.10. "Modifications" + + means any of the following: + + a. any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. "Patent Claims" of a Contributor + + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the License, + by the making, using, selling, offering for sale, having made, import, + or transfer of either its Contributions or its Contributor Version. + +1.12. "Secondary License" + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. "Source Code Form" + + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that controls, is + controlled by, or is under common control with You. 
For purposes of this + definition, "control" means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution + become effective for each Contribution on the date the Contributor first + distributes such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under + this License. No additional rights or licenses will be implied from the + distribution or licensing of Covered Software under this License. + Notwithstanding Section 2.1(b) above, no patent license is granted by a + Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of + its Contributions. + + This License does not grant any rights in the trademarks, service marks, + or logos of any Contributor (except as may be necessary to comply with + the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this + License (see Section 10.2) or under the terms of a Secondary License (if + permitted under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its + Contributions are its original creation(s) or it has sufficient rights to + grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under + applicable copyright doctrines of fair use, fair dealing, or other + equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under + the terms of this License. You must inform recipients that the Source + Code Form of the Covered Software is governed by the terms of this + License, and how they can obtain a copy of this License. You may not + attempt to alter or restrict the recipients' rights in the Source Code + Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. 
such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter the + recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for + the Covered Software. If the Larger Work is a combination of Covered + Software with a work governed by one or more Secondary Licenses, and the + Covered Software is not Incompatible With Secondary Licenses, this + License permits You to additionally distribute such Covered Software + under the terms of such Secondary License(s), so that the recipient of + the Larger Work may, at their option, further distribute the Covered + Software under the terms of either this License or such Secondary + License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices + (including copyright notices, patent notices, disclaimers of warranty, or + limitations of liability) contained within the Source Code Form of the + Covered Software, except that You may alter any license notices to the + extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on + behalf of any Contributor. You must make it absolutely clear that any + such warranty, support, indemnity, or liability obligation is offered by + You alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, + judicial order, or regulation then You must: (a) comply with the terms of + this License to the maximum extent possible; and (b) describe the + limitations and the code they affect. Such description must be placed in a + text file included with all distributions of the Covered Software under + this License. Except to the extent prohibited by statute or regulation, + such description must be sufficiently detailed for a recipient of ordinary + skill to be able to understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. 
However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing + basis, if such Contributor fails to notify You of the non-compliance by + some reasonable means prior to 60 days after You have come back into + compliance. Moreover, Your grants from a particular Contributor are + reinstated on an ongoing basis if such Contributor notifies You of the + non-compliance by some reasonable means, this is the first time You have + received notice of non-compliance with this License from such + Contributor, and You become compliant prior to 30 days after Your receipt + of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, + counter-claims, and cross-claims) alleging that a Contributor Version + directly or indirectly infringes any patent, then the rights granted to + You by any and all Contributors for the Covered Software under Section + 2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an "as is" basis, + without warranty of any kind, either expressed, implied, or statutory, + including, without limitation, warranties that the Covered Software is free + of defects, merchantable, fit for a particular purpose or non-infringing. + The entire risk as to the quality and performance of the Covered Software + is with You. Should any Covered Software prove defective in any respect, + You (not any Contributor) assume the cost of any necessary servicing, + repair, or correction. This disclaimer of warranty constitutes an essential + part of this License. No use of any Covered Software is authorized under + this License except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from + such party's negligence to the extent applicable law prohibits such + limitation. Some jurisdictions do not allow the exclusion or limitation of + incidental or consequential damages, so this exclusion and limitation may + not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts + of a jurisdiction where the defendant maintains its principal place of + business and such litigation shall be governed by laws of that + jurisdiction, without reference to its conflict-of-law provisions. Nothing + in this Section shall prevent a party's ability to bring cross-claims or + counter-claims. + +9. 
Miscellaneous + + This License represents the complete agreement concerning the subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. Any law or regulation which provides that + the language of a contract shall be construed against the drafter shall not + be used to construe this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version + of the License under which You originally received the Covered Software, + or under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a + modified version of this License if you rename the license and remove + any references to the name of the license steward (except to note that + such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary + Licenses If You choose to distribute Source Code Form that is + Incompatible With Secondary Licenses under the terms of this version of + the License, the notice described in Exhibit B of this License must be + attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, +then You may include the notice in a location (such as a LICENSE file in a +relevant directory) where a recipient would be likely to look for such a +notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice + + This Source Code Form is "Incompatible + With Secondary Licenses", as defined by + the Mozilla Public License, v. 2.0. + diff --git a/vendor/github.com/hashicorp/go-immutable-radix/README.md b/vendor/github.com/hashicorp/go-immutable-radix/README.md new file mode 100644 index 0000000000..4b6338b5a6 --- /dev/null +++ b/vendor/github.com/hashicorp/go-immutable-radix/README.md @@ -0,0 +1,66 @@ +go-immutable-radix [![Build Status](https://travis-ci.org/hashicorp/go-immutable-radix.png)](https://travis-ci.org/hashicorp/go-immutable-radix) +========= + +Provides the `iradix` package that implements an immutable [radix tree](http://en.wikipedia.org/wiki/Radix_tree). +The package only provides a single `Tree` implementation, optimized for sparse nodes. + +As a radix tree, it provides the following: + * O(k) operations. In many cases, this can be faster than a hash table since + the hash function is an O(k) operation, and hash tables have very poor cache locality. + * Minimum / Maximum value lookups + * Ordered iteration + +A tree supports using a transaction to batch multiple updates (insert, delete) +in a more efficient manner than performing each operation one at a time. + +For a mutable variant, see [go-radix](https://github.com/armon/go-radix). 
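The transaction batching mentioned above looks roughly like this (a minimal sketch; the keys and values are illustrative):

```go
package main

import (
	"fmt"

	iradix "github.com/hashicorp/go-immutable-radix"
)

func main() {
	// Batch several updates in one transaction; readers of the old tree
	// are unaffected until Commit publishes the new immutable root.
	r := iradix.New()
	txn := r.Txn()
	txn.Insert([]byte("alpha"), 1)
	txn.Insert([]byte("beta"), 2)
	txn.Delete([]byte("alpha"))
	r = txn.Commit()

	fmt.Println(r.Len()) // 1
}
```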
+ +Documentation +============= + +The full documentation is available on [Godoc](http://godoc.org/github.com/hashicorp/go-immutable-radix). + +Example +======= + +Below is a simple example of usage: + +```go +// Create a tree +r := iradix.New() +r, _, _ = r.Insert([]byte("foo"), 1) +r, _, _ = r.Insert([]byte("bar"), 2) +r, _, _ = r.Insert([]byte("foobar"), 2) + +// Find the longest prefix match +m, _, _ := r.Root().LongestPrefix([]byte("foozip")) +if string(m) != "foo" { + panic("should be foo") +} +``` + +Here is an example of performing a range scan of the keys. + +```go +// Create a tree +r := iradix.New() +r, _, _ = r.Insert([]byte("001"), 1) +r, _, _ = r.Insert([]byte("002"), 2) +r, _, _ = r.Insert([]byte("005"), 5) +r, _, _ = r.Insert([]byte("010"), 10) +r, _, _ = r.Insert([]byte("100"), 10) + +// Range scan over the keys that sort lexicographically between [003, 050) +it := r.Root().Iterator() +it.SeekLowerBound([]byte("003")) +for key, _, ok := it.Next(); ok; key, _, ok = it.Next() { + if string(key) >= "050" { + break + } + fmt.Println(string(key)) +} +// Output: +// 005 +// 010 +``` + diff --git a/vendor/github.com/hashicorp/go-immutable-radix/edges.go b/vendor/github.com/hashicorp/go-immutable-radix/edges.go new file mode 100644 index 0000000000..a63674775f --- /dev/null +++ b/vendor/github.com/hashicorp/go-immutable-radix/edges.go @@ -0,0 +1,21 @@ +package iradix + +import "sort" + +type edges []edge + +func (e edges) Len() int { + return len(e) +} + +func (e edges) Less(i, j int) bool { + return e[i].label < e[j].label +} + +func (e edges) Swap(i, j int) { + e[i], e[j] = e[j], e[i] +} + +func (e edges) Sort() { + sort.Sort(e) +} diff --git a/vendor/github.com/hashicorp/go-immutable-radix/go.mod b/vendor/github.com/hashicorp/go-immutable-radix/go.mod new file mode 100644 index 0000000000..27e7b7c955 --- /dev/null +++ b/vendor/github.com/hashicorp/go-immutable-radix/go.mod @@ -0,0 +1,6 @@ +module github.com/hashicorp/go-immutable-radix + +require ( + github.com/hashicorp/go-uuid v1.0.0 + github.com/hashicorp/golang-lru v0.5.0 +) diff --git a/vendor/github.com/hashicorp/go-immutable-radix/iradix.go b/vendor/github.com/hashicorp/go-immutable-radix/iradix.go new file mode 100644 index 0000000000..e5e6e57f26 --- /dev/null +++ b/vendor/github.com/hashicorp/go-immutable-radix/iradix.go @@ -0,0 +1,662 @@ +package iradix + +import ( + "bytes" + "strings" + + "github.com/hashicorp/golang-lru/simplelru" +) + +const ( + // defaultModifiedCache is the default size of the modified node + // cache used per transaction. This is used to cache the updates + // to the nodes near the root, while the leaves do not need to be + // cached. This is important for very large transactions to prevent + // the modified cache from growing to be enormous. This is also used + // to set the max size of the mutation notify maps since those should + // also be bounded in a similar way. + defaultModifiedCache = 8192 +) + +// Tree implements an immutable radix tree. This can be treated as a +// Dictionary abstract data type. The main advantage over a standard +// hash map is prefix-based lookups and ordered iteration. The immutability +// means that it is safe to concurrently read from a Tree without any +// coordination.
+type Tree struct { + root *Node + size int +} + +// New returns an empty Tree +func New() *Tree { + t := &Tree{ + root: &Node{ + mutateCh: make(chan struct{}), + }, + } + return t +} + +// Len is used to return the number of elements in the tree +func (t *Tree) Len() int { + return t.size +} + +// Txn is a transaction on the tree. This transaction is applied +// atomically and returns a new tree when committed. A transaction +// is not thread safe, and should only be used by a single goroutine. +type Txn struct { + // root is the modified root for the transaction. + root *Node + + // snap is a snapshot of the root node for use if we have to run the + // slow notify algorithm. + snap *Node + + // size tracks the size of the tree as it is modified during the + // transaction. + size int + + // writable is a cache of writable nodes that have been created during + // the course of the transaction. This allows us to re-use the same + // nodes for further writes and avoid unnecessary copies of nodes that + // have never been exposed outside the transaction. This will only hold + // up to defaultModifiedCache number of entries. + writable *simplelru.LRU + + // trackChannels is used to hold channels that need to be notified to + // signal mutation of the tree. This will only hold up to + // defaultModifiedCache number of entries, after which we will set the + // trackOverflow flag, which will cause us to use a more expensive + // algorithm to perform the notifications. Mutation tracking is only + // performed if trackMutate is true. + trackChannels map[chan struct{}]struct{} + trackOverflow bool + trackMutate bool +} + +// Txn starts a new transaction that can be used to mutate the tree +func (t *Tree) Txn() *Txn { + txn := &Txn{ + root: t.root, + snap: t.root, + size: t.size, + } + return txn +} + +// TrackMutate can be used to toggle if mutations are tracked. If this is enabled +// then notifications will be issued for affected internal nodes and leaves when +// the transaction is committed. +func (t *Txn) TrackMutate(track bool) { + t.trackMutate = track +} + +// trackChannel safely attempts to track the given mutation channel, setting the +// overflow flag if we can no longer track any more. This limits the amount of +// state that will accumulate during a transaction and we have a slower algorithm +// to switch to if we overflow. +func (t *Txn) trackChannel(ch chan struct{}) { + // In overflow, make sure we don't store any more objects. + if t.trackOverflow { + return + } + + // If this would overflow the state we reject it and set the flag (since + // we aren't tracking everything that's required any longer). + if len(t.trackChannels) >= defaultModifiedCache { + // Mark that we are in the overflow state + t.trackOverflow = true + + // Clear the map so that the channels can be garbage collected. It is + // safe to do this since we have already overflowed and will be using + // the slow notify algorithm. + t.trackChannels = nil + return + } + + // Create the map on the fly when we need it. + if t.trackChannels == nil { + t.trackChannels = make(map[chan struct{}]struct{}) + } + + // Otherwise we are good to track it. + t.trackChannels[ch] = struct{}{} +} + +// writeNode returns a node to be modified, if the current node has already been +// modified during the course of the transaction, it is used in-place. Set +// forLeafUpdate to true if you are getting a write node to update the leaf, +// which will set leaf mutation tracking appropriately as well. 
+func (t *Txn) writeNode(n *Node, forLeafUpdate bool) *Node { + // Ensure the writable set exists. + if t.writable == nil { + lru, err := simplelru.NewLRU(defaultModifiedCache, nil) + if err != nil { + panic(err) + } + t.writable = lru + } + + // If this node has already been modified, we can continue to use it + // during this transaction. We know that we don't need to track it for + // a node update since the node is writable, but if this is for a leaf + // update we track it, in case the initial write to this node didn't + // update the leaf. + if _, ok := t.writable.Get(n); ok { + if t.trackMutate && forLeafUpdate && n.leaf != nil { + t.trackChannel(n.leaf.mutateCh) + } + return n + } + + // Mark this node as being mutated. + if t.trackMutate { + t.trackChannel(n.mutateCh) + } + + // Mark its leaf as being mutated, if appropriate. + if t.trackMutate && forLeafUpdate && n.leaf != nil { + t.trackChannel(n.leaf.mutateCh) + } + + // Copy the existing node. If you have set forLeafUpdate it will be + // safe to replace this leaf with another after you get your node for + // writing. You MUST replace it, because the channel associated with + // this leaf will be closed when this transaction is committed. + nc := &Node{ + mutateCh: make(chan struct{}), + leaf: n.leaf, + } + if n.prefix != nil { + nc.prefix = make([]byte, len(n.prefix)) + copy(nc.prefix, n.prefix) + } + if len(n.edges) != 0 { + nc.edges = make([]edge, len(n.edges)) + copy(nc.edges, n.edges) + } + + // Mark this node as writable. + t.writable.Add(nc, nil) + return nc +} + +// Visit all the nodes in the tree under n, and add their mutateChannels to the transaction +// Returns the size of the subtree visited +func (t *Txn) trackChannelsAndCount(n *Node) int { + // Count only leaf nodes + leaves := 0 + if n.leaf != nil { + leaves = 1 + } + // Mark this node as being mutated. + if t.trackMutate { + t.trackChannel(n.mutateCh) + } + + // Mark its leaf as being mutated, if appropriate. + if t.trackMutate && n.leaf != nil { + t.trackChannel(n.leaf.mutateCh) + } + + // Recurse on the children + for _, e := range n.edges { + leaves += t.trackChannelsAndCount(e.node) + } + return leaves +} + +// mergeChild is called to collapse the given node with its child. This is only +// called when the given node is not a leaf and has a single edge. +func (t *Txn) mergeChild(n *Node) { + // Mark the child node as being mutated since we are about to abandon + // it. We don't need to mark the leaf since we are retaining it if it + // is there. + e := n.edges[0] + child := e.node + if t.trackMutate { + t.trackChannel(child.mutateCh) + } + + // Merge the nodes. 
+ n.prefix = concat(n.prefix, child.prefix) + n.leaf = child.leaf + if len(child.edges) != 0 { + n.edges = make([]edge, len(child.edges)) + copy(n.edges, child.edges) + } else { + n.edges = nil + } +} + +// insert does a recursive insertion +func (t *Txn) insert(n *Node, k, search []byte, v interface{}) (*Node, interface{}, bool) { + // Handle key exhaustion + if len(search) == 0 { + var oldVal interface{} + didUpdate := false + if n.isLeaf() { + oldVal = n.leaf.val + didUpdate = true + } + + nc := t.writeNode(n, true) + nc.leaf = &leafNode{ + mutateCh: make(chan struct{}), + key: k, + val: v, + } + return nc, oldVal, didUpdate + } + + // Look for the edge + idx, child := n.getEdge(search[0]) + + // No edge, create one + if child == nil { + e := edge{ + label: search[0], + node: &Node{ + mutateCh: make(chan struct{}), + leaf: &leafNode{ + mutateCh: make(chan struct{}), + key: k, + val: v, + }, + prefix: search, + }, + } + nc := t.writeNode(n, false) + nc.addEdge(e) + return nc, nil, false + } + + // Determine longest prefix of the search key on match + commonPrefix := longestPrefix(search, child.prefix) + if commonPrefix == len(child.prefix) { + search = search[commonPrefix:] + newChild, oldVal, didUpdate := t.insert(child, k, search, v) + if newChild != nil { + nc := t.writeNode(n, false) + nc.edges[idx].node = newChild + return nc, oldVal, didUpdate + } + return nil, oldVal, didUpdate + } + + // Split the node + nc := t.writeNode(n, false) + splitNode := &Node{ + mutateCh: make(chan struct{}), + prefix: search[:commonPrefix], + } + nc.replaceEdge(edge{ + label: search[0], + node: splitNode, + }) + + // Restore the existing child node + modChild := t.writeNode(child, false) + splitNode.addEdge(edge{ + label: modChild.prefix[commonPrefix], + node: modChild, + }) + modChild.prefix = modChild.prefix[commonPrefix:] + + // Create a new leaf node + leaf := &leafNode{ + mutateCh: make(chan struct{}), + key: k, + val: v, + } + + // If the new key is a subset, add it to this node + search = search[commonPrefix:] + if len(search) == 0 { + splitNode.leaf = leaf + return nc, nil, false + } + + // Create a new edge for the node + splitNode.addEdge(edge{ + label: search[0], + node: &Node{ + mutateCh: make(chan struct{}), + leaf: leaf, + prefix: search, + }, + }) + return nc, nil, false +} + +// delete does a recursive deletion +func (t *Txn) delete(parent, n *Node, search []byte) (*Node, *leafNode) { + // Check for key exhaustion + if len(search) == 0 { + if !n.isLeaf() { + return nil, nil + } + // Copy the pointer in case we are in a transaction that already + // modified this node since the node will be reused. Any changes + // made to the node will not affect returning the original leaf + // value. + oldLeaf := n.leaf + + // Remove the leaf node + nc := t.writeNode(n, true) + nc.leaf = nil + + // Check if this node should be merged + if n != t.root && len(nc.edges) == 1 { + t.mergeChild(nc) + } + return nc, oldLeaf + } + + // Look for an edge + label := search[0] + idx, child := n.getEdge(label) + if child == nil || !bytes.HasPrefix(search, child.prefix) { + return nil, nil + } + + // Consume the search prefix + search = search[len(child.prefix):] + newChild, leaf := t.delete(n, child, search) + if newChild == nil { + return nil, nil + } + + // Copy this node. WATCH OUT - it's safe to pass "false" here because we + // will only ADD a leaf via nc.mergeChild() if there isn't one due to + // the !nc.isLeaf() check in the logic just below.
This is pretty subtle,
+	// so be careful if you change any of the logic here.
+	nc := t.writeNode(n, false)
+
+	// Delete the edge if the node has no edges
+	if newChild.leaf == nil && len(newChild.edges) == 0 {
+		nc.delEdge(label)
+		if n != t.root && len(nc.edges) == 1 && !nc.isLeaf() {
+			t.mergeChild(nc)
+		}
+	} else {
+		nc.edges[idx].node = newChild
+	}
+	return nc, leaf
+}
+
+// deletePrefix does a recursive deletion of all nodes under the given prefix
+func (t *Txn) deletePrefix(parent, n *Node, search []byte) (*Node, int) {
+	// Check for key exhaustion
+	if len(search) == 0 {
+		nc := t.writeNode(n, true)
+		if n.isLeaf() {
+			nc.leaf = nil
+		}
+		nc.edges = nil
+		return nc, t.trackChannelsAndCount(n)
+	}
+
+	// Look for an edge
+	label := search[0]
+	idx, child := n.getEdge(label)
+	// We make sure that either the child node's prefix starts with the search
+	// term, or the search term starts with the child node's prefix. We need
+	// both checks so that we can delete prefixes that don't correspond to any
+	// node in the tree.
+	if child == nil || (!bytes.HasPrefix(child.prefix, search) && !bytes.HasPrefix(search, child.prefix)) {
+		return nil, 0
+	}
+
+	// Consume the search prefix
+	if len(child.prefix) > len(search) {
+		search = []byte("")
+	} else {
+		search = search[len(child.prefix):]
+	}
+	newChild, numDeletions := t.deletePrefix(n, child, search)
+	if newChild == nil {
+		return nil, 0
+	}
+	// Copy this node. WATCH OUT - it's safe to pass "false" here because we
+	// will only ADD a leaf via nc.mergeChild() if there isn't one due to
+	// the !nc.isLeaf() check in the logic just below. This is pretty subtle,
+	// so be careful if you change any of the logic here.
+	nc := t.writeNode(n, false)
+
+	// Delete the edge if the node has no edges
+	if newChild.leaf == nil && len(newChild.edges) == 0 {
+		nc.delEdge(label)
+		if n != t.root && len(nc.edges) == 1 && !nc.isLeaf() {
+			t.mergeChild(nc)
+		}
+	} else {
+		nc.edges[idx].node = newChild
+	}
+	return nc, numDeletions
+}
+
+// Insert is used to add or update a given key. The return provides
+// the previous value and a bool indicating if any was set.
+func (t *Txn) Insert(k []byte, v interface{}) (interface{}, bool) {
+	newRoot, oldVal, didUpdate := t.insert(t.root, k, k, v)
+	if newRoot != nil {
+		t.root = newRoot
+	}
+	if !didUpdate {
+		t.size++
+	}
+	return oldVal, didUpdate
+}
+
+// Delete is used to delete a given key. Returns the old value if any,
+// and a bool indicating if the key was set.
+func (t *Txn) Delete(k []byte) (interface{}, bool) {
+	newRoot, leaf := t.delete(nil, t.root, k)
+	if newRoot != nil {
+		t.root = newRoot
+	}
+	if leaf != nil {
+		t.size--
+		return leaf.val, true
+	}
+	return nil, false
+}
+
+// DeletePrefix is used to delete an entire subtree that matches the prefix.
+// This will delete all nodes under that prefix.
+func (t *Txn) DeletePrefix(prefix []byte) bool {
+	newRoot, numDeletions := t.deletePrefix(nil, t.root, prefix)
+	if newRoot != nil {
+		t.root = newRoot
+		t.size = t.size - numDeletions
+		return true
+	}
+	return false
+}
+
+// Root returns the current root of the radix tree within this
+// transaction. The root is not safe across insert and delete operations,
+// but can be used to read the current state during a transaction. 
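+// For example, t.Root().Get(k) observes writes made earlier in the same
+// transaction, before they are committed.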
+func (t *Txn) Root() *Node {
+	return t.root
+}
+
+// Get is used to look up a specific key, returning
+// the value and whether it was found
+func (t *Txn) Get(k []byte) (interface{}, bool) {
+	return t.root.Get(k)
+}
+
+// GetWatch is used to look up a specific key, returning
+// the watch channel, value, and whether it was found
+func (t *Txn) GetWatch(k []byte) (<-chan struct{}, interface{}, bool) {
+	return t.root.GetWatch(k)
+}
+
+// Commit is used to finalize the transaction and return a new tree. If mutation
+// tracking is turned on then notifications will also be issued.
+func (t *Txn) Commit() *Tree {
+	nt := t.CommitOnly()
+	if t.trackMutate {
+		t.Notify()
+	}
+	return nt
+}
+
+// CommitOnly is used to finalize the transaction and return a new tree, but
+// does not issue any notifications until Notify is called.
+func (t *Txn) CommitOnly() *Tree {
+	nt := &Tree{t.root, t.size}
+	t.writable = nil
+	return nt
+}
+
+// slowNotify does a complete comparison of the before and after trees in order
+// to trigger notifications. This doesn't require any additional state but it
+// is very expensive to compute.
+func (t *Txn) slowNotify() {
+	snapIter := t.snap.rawIterator()
+	rootIter := t.root.rawIterator()
+	for snapIter.Front() != nil || rootIter.Front() != nil {
+		// If we've exhausted the nodes in the old snapshot, we know
+		// there's nothing remaining to notify.
+		if snapIter.Front() == nil {
+			return
+		}
+		snapElem := snapIter.Front()
+
+		// If we've exhausted the nodes in the new root, we know we need
+		// to invalidate everything that remains in the old snapshot. We
+		// know from the loop condition there's something in the old
+		// snapshot.
+		if rootIter.Front() == nil {
+			close(snapElem.mutateCh)
+			if snapElem.isLeaf() {
+				close(snapElem.leaf.mutateCh)
+			}
+			snapIter.Next()
+			continue
+		}
+
+		// Do one string compare so we can check the various conditions
+		// below without repeating the compare.
+		cmp := strings.Compare(snapIter.Path(), rootIter.Path())
+
+		// If the snapshot is behind the root, then we must have deleted
+		// this node during the transaction.
+		if cmp < 0 {
+			close(snapElem.mutateCh)
+			if snapElem.isLeaf() {
+				close(snapElem.leaf.mutateCh)
+			}
+			snapIter.Next()
+			continue
+		}
+
+		// If the snapshot is ahead of the root, then we must have added
+		// this node during the transaction.
+		if cmp > 0 {
+			rootIter.Next()
+			continue
+		}
+
+		// If we have the same path, then we need to see if we mutated a
+		// node and possibly the leaf.
+		rootElem := rootIter.Front()
+		if snapElem != rootElem {
+			close(snapElem.mutateCh)
+			if snapElem.leaf != nil && (snapElem.leaf != rootElem.leaf) {
+				close(snapElem.leaf.mutateCh)
+			}
+		}
+		snapIter.Next()
+		rootIter.Next()
+	}
+}
+
+// Notify is used along with TrackMutate to trigger notifications. This must
+// only be done once a transaction is committed via CommitOnly, and it is called
+// automatically by Commit.
+func (t *Txn) Notify() {
+	if !t.trackMutate {
+		return
+	}
+
+	// If we've overflowed the tracking state we can't use it in any way and
+	// need to do a full tree compare.
+	if t.trackOverflow {
+		t.slowNotify()
+	} else {
+		for ch := range t.trackChannels {
+			close(ch)
+		}
+	}
+
+	// Clean up the tracking state so that a re-notify is safe (will trigger
+	// the else clause above which will be a no-op).
+	t.trackChannels = nil
+	t.trackOverflow = false
+}
+
+// Insert is used to add or update a given key. The return provides
+// the new tree, previous value and a bool indicating if any was set.
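+// The receiver is not modified; for example:
+//
+//	t2, old, ok := t.Insert([]byte("foo"), 1)
+//
+// leaves t unchanged and returns the updated tree as t2.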
+func (t *Tree) Insert(k []byte, v interface{}) (*Tree, interface{}, bool) {
+	txn := t.Txn()
+	old, ok := txn.Insert(k, v)
+	return txn.Commit(), old, ok
+}
+
+// Delete is used to delete a given key. Returns the new tree,
+// old value if any, and a bool indicating if the key was set.
+func (t *Tree) Delete(k []byte) (*Tree, interface{}, bool) {
+	txn := t.Txn()
+	old, ok := txn.Delete(k)
+	return txn.Commit(), old, ok
+}
+
+// DeletePrefix is used to delete all nodes starting with a given prefix.
+// Returns the new tree, and a bool indicating if the prefix matched any nodes.
+func (t *Tree) DeletePrefix(k []byte) (*Tree, bool) {
+	txn := t.Txn()
+	ok := txn.DeletePrefix(k)
+	return txn.Commit(), ok
+}
+
+// Root returns the root node of the tree which can be used for richer
+// query operations.
+func (t *Tree) Root() *Node {
+	return t.root
+}
+
+// Get is used to look up a specific key, returning
+// the value and whether it was found
+func (t *Tree) Get(k []byte) (interface{}, bool) {
+	return t.root.Get(k)
+}
+
+// longestPrefix finds the length of the shared prefix
+// of two byte slices
+func longestPrefix(k1, k2 []byte) int {
+	max := len(k1)
+	if l := len(k2); l < max {
+		max = l
+	}
+	var i int
+	for i = 0; i < max; i++ {
+		if k1[i] != k2[i] {
+			break
+		}
+	}
+	return i
+}
+
+// concat concatenates two byte slices, returning a newly allocated copy
+func concat(a, b []byte) []byte {
+	c := make([]byte, len(a)+len(b))
+	copy(c, a)
+	copy(c[len(a):], b)
+	return c
+}
diff --git a/vendor/github.com/hashicorp/go-immutable-radix/iter.go b/vendor/github.com/hashicorp/go-immutable-radix/iter.go
new file mode 100644
index 0000000000..1ecaf831c7
--- /dev/null
+++ b/vendor/github.com/hashicorp/go-immutable-radix/iter.go
@@ -0,0 +1,188 @@
+package iradix
+
+import (
+	"bytes"
+)
+
+// Iterator is used to iterate over a set of nodes
+// in pre-order
+type Iterator struct {
+	node  *Node
+	stack []edges
+}
+
+// SeekPrefixWatch is used to seek the iterator to a given prefix
+// and returns the watch channel of the finest granularity
+func (i *Iterator) SeekPrefixWatch(prefix []byte) (watch <-chan struct{}) {
+	// Wipe the stack
+	i.stack = nil
+	n := i.node
+	watch = n.mutateCh
+	search := prefix
+	for {
+		// Check for key exhaustion
+		if len(search) == 0 {
+			i.node = n
+			return
+		}
+
+		// Look for an edge
+		_, n = n.getEdge(search[0])
+		if n == nil {
+			i.node = nil
+			return
+		}
+
+		// Update to the finest granularity as the search makes progress
+		watch = n.mutateCh
+
+		// Consume the search prefix
+		if bytes.HasPrefix(search, n.prefix) {
+			search = search[len(n.prefix):]
+		} else if bytes.HasPrefix(n.prefix, search) {
+			i.node = n
+			return
+		} else {
+			i.node = nil
+			return
+		}
+	}
+}
+
+// SeekPrefix is used to seek the iterator to a given prefix
+func (i *Iterator) SeekPrefix(prefix []byte) {
+	i.SeekPrefixWatch(prefix)
+}
+
+func (i *Iterator) recurseMin(n *Node) *Node {
+	// Traverse to the minimum child
+	if n.leaf != nil {
+		return n
+	}
+	if len(n.edges) > 0 {
+		// Add all the other edges to the stack (the min node will be added as
+		// we recurse)
+		i.stack = append(i.stack, n.edges[1:])
+		return i.recurseMin(n.edges[0].node)
+	}
+	// Shouldn't be possible
+	return nil
+}
+
+// SeekLowerBound is used to seek the iterator to the smallest key that is
+// greater than or equal to the given key. There is no watch variant, as it's
+// hard to predict from the radix structure which nodes' changes might affect
+// the result.
+func (i *Iterator) SeekLowerBound(key []byte) {
+	// Wipe the stack. 
Unlike Prefix iteration, we need to build the stack as we
+	// go, because we need only a subset of the edges of many nodes in the path
+	// to the leaf with the lower bound.
+	i.stack = []edges{}
+	n := i.node
+	search := key
+
+	found := func(n *Node) {
+		i.node = n
+		i.stack = append(i.stack, edges{edge{node: n}})
+	}
+
+	for {
+		// Compare current prefix with the search key's same-length prefix.
+		var prefixCmp int
+		if len(n.prefix) < len(search) {
+			prefixCmp = bytes.Compare(n.prefix, search[0:len(n.prefix)])
+		} else {
+			prefixCmp = bytes.Compare(n.prefix, search)
+		}
+
+		if prefixCmp > 0 {
+			// Prefix is larger, which means the lower bound is greater than
+			// the search key, and from now on we need to follow the minimum
+			// path to the smallest leaf under this subtree.
+			n = i.recurseMin(n)
+			if n != nil {
+				found(n)
+			}
+			return
+		}
+
+		if prefixCmp < 0 {
+			// Prefix is smaller than the search prefix, which means there is
+			// no lower bound.
+			i.node = nil
+			return
+		}
+
+		// Prefix is equal, we are still heading for an exact match. If this is a
+		// leaf we're done.
+		if n.leaf != nil {
+			if bytes.Compare(n.leaf.key, key) < 0 {
+				i.node = nil
+				return
+			}
+			found(n)
+			return
+		}
+
+		// Consume the search prefix
+		if len(n.prefix) > len(search) {
+			search = []byte{}
+		} else {
+			search = search[len(n.prefix):]
+		}
+
+		// Otherwise, take the lower bound next edge.
+		idx, lbNode := n.getLowerBoundEdge(search[0])
+		if lbNode == nil {
+			i.node = nil
+			return
+		}
+
+		// Create stack edges for all the strictly higher edges in this node.
+		if idx+1 < len(n.edges) {
+			i.stack = append(i.stack, n.edges[idx+1:])
+		}
+
+		i.node = lbNode
+		// Recurse
+		n = lbNode
+	}
+}
+
+// Next returns the next node in order
+func (i *Iterator) Next() ([]byte, interface{}, bool) {
+	// Initialize our stack if needed
+	if i.stack == nil && i.node != nil {
+		i.stack = []edges{
+			edges{
+				edge{node: i.node},
+			},
+		}
+	}
+
+	for len(i.stack) > 0 {
+		// Inspect the last element of the stack
+		n := len(i.stack)
+		last := i.stack[n-1]
+		elem := last[0].node
+
+		// Update the stack
+		if len(last) > 1 {
+			i.stack[n-1] = last[1:]
+		} else {
+			i.stack = i.stack[:n-1]
+		}
+
+		// Push the edges onto the frontier
+		if len(elem.edges) > 0 {
+			i.stack = append(i.stack, elem.edges)
+		}
+
+		// Return the leaf values if any
+		if elem.leaf != nil {
+			return elem.leaf.key, elem.leaf.val, true
+		}
+	}
+	return nil, nil, false
+}
diff --git a/vendor/github.com/hashicorp/go-immutable-radix/node.go b/vendor/github.com/hashicorp/go-immutable-radix/node.go
new file mode 100644
index 0000000000..3ab904edce
--- /dev/null
+++ b/vendor/github.com/hashicorp/go-immutable-radix/node.go
@@ -0,0 +1,304 @@
+package iradix
+
+import (
+	"bytes"
+	"sort"
+)
+
+// WalkFn is used when walking the tree. Takes a
+// key and value, returning whether iteration should
+// be terminated.
+type WalkFn func(k []byte, v interface{}) bool
+
+// leafNode is used to represent a value
+type leafNode struct {
+	mutateCh chan struct{}
+	key      []byte
+	val      interface{}
+}
+
+// edge is used to represent an edge node
+type edge struct {
+	label byte
+	node  *Node
+}
+
+// Node is an immutable node in the radix tree
+type Node struct {
+	// mutateCh is closed if this node is modified
+	mutateCh chan struct{}
+
+	// leaf is used to store a possible leaf
+	leaf *leafNode
+
+	// prefix is the common prefix we ignore
+	prefix []byte
+
+	// Edges should be stored in-order for iteration. 
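+	// getEdge and addEdge rely on this ordering (binary search by label).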
+ // We avoid a fully materialized slice to save memory, + // since in most cases we expect to be sparse + edges edges +} + +func (n *Node) isLeaf() bool { + return n.leaf != nil +} + +func (n *Node) addEdge(e edge) { + num := len(n.edges) + idx := sort.Search(num, func(i int) bool { + return n.edges[i].label >= e.label + }) + n.edges = append(n.edges, e) + if idx != num { + copy(n.edges[idx+1:], n.edges[idx:num]) + n.edges[idx] = e + } +} + +func (n *Node) replaceEdge(e edge) { + num := len(n.edges) + idx := sort.Search(num, func(i int) bool { + return n.edges[i].label >= e.label + }) + if idx < num && n.edges[idx].label == e.label { + n.edges[idx].node = e.node + return + } + panic("replacing missing edge") +} + +func (n *Node) getEdge(label byte) (int, *Node) { + num := len(n.edges) + idx := sort.Search(num, func(i int) bool { + return n.edges[i].label >= label + }) + if idx < num && n.edges[idx].label == label { + return idx, n.edges[idx].node + } + return -1, nil +} + +func (n *Node) getLowerBoundEdge(label byte) (int, *Node) { + num := len(n.edges) + idx := sort.Search(num, func(i int) bool { + return n.edges[i].label >= label + }) + // we want lower bound behavior so return even if it's not an exact match + if idx < num { + return idx, n.edges[idx].node + } + return -1, nil +} + +func (n *Node) delEdge(label byte) { + num := len(n.edges) + idx := sort.Search(num, func(i int) bool { + return n.edges[i].label >= label + }) + if idx < num && n.edges[idx].label == label { + copy(n.edges[idx:], n.edges[idx+1:]) + n.edges[len(n.edges)-1] = edge{} + n.edges = n.edges[:len(n.edges)-1] + } +} + +func (n *Node) GetWatch(k []byte) (<-chan struct{}, interface{}, bool) { + search := k + watch := n.mutateCh + for { + // Check for key exhaustion + if len(search) == 0 { + if n.isLeaf() { + return n.leaf.mutateCh, n.leaf.val, true + } + break + } + + // Look for an edge + _, n = n.getEdge(search[0]) + if n == nil { + break + } + + // Update to the finest granularity as the search makes progress + watch = n.mutateCh + + // Consume the search prefix + if bytes.HasPrefix(search, n.prefix) { + search = search[len(n.prefix):] + } else { + break + } + } + return watch, nil, false +} + +func (n *Node) Get(k []byte) (interface{}, bool) { + _, val, ok := n.GetWatch(k) + return val, ok +} + +// LongestPrefix is like Get, but instead of an +// exact match, it will return the longest prefix match. 
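+// For example, with entries under "foo" and "foobar", looking up
+// "foobaz" returns the entry stored at "foo".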
+func (n *Node) LongestPrefix(k []byte) ([]byte, interface{}, bool) {
+	var last *leafNode
+	search := k
+	for {
+		// Look for a leaf node
+		if n.isLeaf() {
+			last = n.leaf
+		}
+
+		// Check for key exhaustion
+		if len(search) == 0 {
+			break
+		}
+
+		// Look for an edge
+		_, n = n.getEdge(search[0])
+		if n == nil {
+			break
+		}
+
+		// Consume the search prefix
+		if bytes.HasPrefix(search, n.prefix) {
+			search = search[len(n.prefix):]
+		} else {
+			break
+		}
+	}
+	if last != nil {
+		return last.key, last.val, true
+	}
+	return nil, nil, false
+}
+
+// Minimum is used to return the minimum value in the tree
+func (n *Node) Minimum() ([]byte, interface{}, bool) {
+	for {
+		if n.isLeaf() {
+			return n.leaf.key, n.leaf.val, true
+		}
+		if len(n.edges) > 0 {
+			n = n.edges[0].node
+		} else {
+			break
+		}
+	}
+	return nil, nil, false
+}
+
+// Maximum is used to return the maximum value in the tree
+func (n *Node) Maximum() ([]byte, interface{}, bool) {
+	for {
+		if num := len(n.edges); num > 0 {
+			n = n.edges[num-1].node
+			continue
+		}
+		if n.isLeaf() {
+			return n.leaf.key, n.leaf.val, true
+		} else {
+			break
+		}
+	}
+	return nil, nil, false
+}
+
+// Iterator is used to return an iterator at
+// the given node to walk the tree
+func (n *Node) Iterator() *Iterator {
+	return &Iterator{node: n}
+}
+
+// rawIterator is used to return a raw iterator at the given node to walk the
+// tree.
+func (n *Node) rawIterator() *rawIterator {
+	iter := &rawIterator{node: n}
+	iter.Next()
+	return iter
+}
+
+// Walk is used to walk the tree
+func (n *Node) Walk(fn WalkFn) {
+	recursiveWalk(n, fn)
+}
+
+// WalkPrefix is used to walk the tree under a prefix
+func (n *Node) WalkPrefix(prefix []byte, fn WalkFn) {
+	search := prefix
+	for {
+		// Check for key exhaustion
+		if len(search) == 0 {
+			recursiveWalk(n, fn)
+			return
+		}
+
+		// Look for an edge
+		_, n = n.getEdge(search[0])
+		if n == nil {
+			break
+		}
+
+		// Consume the search prefix
+		if bytes.HasPrefix(search, n.prefix) {
+			search = search[len(n.prefix):]
+		} else if bytes.HasPrefix(n.prefix, search) {
+			// Child may be under our search prefix
+			recursiveWalk(n, fn)
+			return
+		} else {
+			break
+		}
+	}
+}
+
+// WalkPath is used to walk the tree, but only visiting nodes
+// from the root down to a given leaf. Where WalkPrefix walks
+// all the entries *under* the given prefix, this walks the
+// entries *above* the given prefix.
+func (n *Node) WalkPath(path []byte, fn WalkFn) {
+	search := path
+	for {
+		// Visit the leaf values if any
+		if n.leaf != nil && fn(n.leaf.key, n.leaf.val) {
+			return
+		}
+
+		// Check for key exhaustion
+		if len(search) == 0 {
+			return
+		}
+
+		// Look for an edge
+		_, n = n.getEdge(search[0])
+		if n == nil {
+			return
+		}
+
+		// Consume the search prefix
+		if bytes.HasPrefix(search, n.prefix) {
+			search = search[len(n.prefix):]
+		} else {
+			break
+		}
+	}
+}
+
+// recursiveWalk is used to do a pre-order walk of a node
+// recursively. 
Returns true if the walk should be aborted +func recursiveWalk(n *Node, fn WalkFn) bool { + // Visit the leaf values if any + if n.leaf != nil && fn(n.leaf.key, n.leaf.val) { + return true + } + + // Recurse on the children + for _, e := range n.edges { + if recursiveWalk(e.node, fn) { + return true + } + } + return false +} diff --git a/vendor/github.com/hashicorp/go-immutable-radix/raw_iter.go b/vendor/github.com/hashicorp/go-immutable-radix/raw_iter.go new file mode 100644 index 0000000000..04814c1323 --- /dev/null +++ b/vendor/github.com/hashicorp/go-immutable-radix/raw_iter.go @@ -0,0 +1,78 @@ +package iradix + +// rawIterator visits each of the nodes in the tree, even the ones that are not +// leaves. It keeps track of the effective path (what a leaf at a given node +// would be called), which is useful for comparing trees. +type rawIterator struct { + // node is the starting node in the tree for the iterator. + node *Node + + // stack keeps track of edges in the frontier. + stack []rawStackEntry + + // pos is the current position of the iterator. + pos *Node + + // path is the effective path of the current iterator position, + // regardless of whether the current node is a leaf. + path string +} + +// rawStackEntry is used to keep track of the cumulative common path as well as +// its associated edges in the frontier. +type rawStackEntry struct { + path string + edges edges +} + +// Front returns the current node that has been iterated to. +func (i *rawIterator) Front() *Node { + return i.pos +} + +// Path returns the effective path of the current node, even if it's not actually +// a leaf. +func (i *rawIterator) Path() string { + return i.path +} + +// Next advances the iterator to the next node. +func (i *rawIterator) Next() { + // Initialize our stack if needed. + if i.stack == nil && i.node != nil { + i.stack = []rawStackEntry{ + rawStackEntry{ + edges: edges{ + edge{node: i.node}, + }, + }, + } + } + + for len(i.stack) > 0 { + // Inspect the last element of the stack. + n := len(i.stack) + last := i.stack[n-1] + elem := last.edges[0].node + + // Update the stack. + if len(last.edges) > 1 { + i.stack[n-1].edges = last.edges[1:] + } else { + i.stack = i.stack[:n-1] + } + + // Push the edges onto the frontier. + if len(elem.edges) > 0 { + path := last.path + string(elem.prefix) + i.stack = append(i.stack, rawStackEntry{path, elem.edges}) + } + + i.pos = elem + i.path = last.path + string(elem.prefix) + return + } + + i.pos = nil + i.path = "" +}
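Note for reviewers unfamiliar with go-immutable-radix: the sketch below shows
how the vendored API above fits together. It is illustrative only and not part
of the vendored files; it assumes the package-level constructor iradix.New
(defined in iradix.go, not shown in this hunk) and otherwise uses only
functions visible in the diffs above.

    package main

    import (
    	"fmt"

    	iradix "github.com/hashicorp/go-immutable-radix"
    )

    func main() {
    	// Every write returns a new immutable tree; old roots remain valid
    	// snapshots for concurrent readers.
    	r := iradix.New()
    	r, _, _ = r.Insert([]byte("foo"), 1)
    	r, _, _ = r.Insert([]byte("foobar"), 2)

    	// Longest-prefix match: "foobaz" falls back to the "foo" entry.
    	k, v, _ := r.Root().LongestPrefix([]byte("foobaz"))
    	fmt.Println(string(k), v) // foo 1

    	// Batch several writes in one transaction so copied nodes are shared.
    	txn := r.Txn()
    	txn.Insert([]byte("zip"), 3)
    	txn.Delete([]byte("foo"))
    	r = txn.Commit()

    	// Iterate the surviving keys in order: foobar, then zip.
    	it := r.Root().Iterator()
    	for key, val, ok := it.Next(); ok; key, val, ok = it.Next() {
    		fmt.Println(string(key), val)
    	}
    }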