-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Distributed Query/Clustering Fixes #2353
Changes from all commits
9268ddf
f5a8227
90e3059
406a951
25a43a8
38628e5
2975a9d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -642,30 +642,18 @@ func (cmd *RunCommand) openServer(joinURLs []url.URL) *influxdb.Server { | |
// Give brokers time to elect a leader if entire cluster is being restarted. | ||
time.Sleep(1 * time.Second) | ||
|
||
if s.ID() == 0 && s.Index() == 0 { | ||
if len(joinURLs) > 0 { | ||
joinServer(s, *cmd.node.ClusterURL(), joinURLs) | ||
return s | ||
} | ||
|
||
if err := s.Initialize(*cmd.node.ClusterURL()); err != nil { | ||
log.Fatalf("server initialization error(0): %s", err) | ||
} | ||
|
||
u := cmd.node.ClusterURL() | ||
log.Printf("initialized data node: %s\n", u.String()) | ||
return s | ||
if s.ID() == 0 { | ||
joinOrInitializeServer(s, *cmd.node.ClusterURL(), joinURLs) | ||
} else { | ||
log.Printf("data node already member of cluster. Using existing state and ignoring join URLs") | ||
} | ||
|
||
return s | ||
} | ||
|
||
// joins a server to an existing cluster. | ||
func joinServer(s *influxdb.Server, u url.URL, joinURLs []url.URL) { | ||
// TODO: Use separate broker and data join urls. | ||
|
||
// joinOrInitializeServer joins a new server to an existing cluster or initializes it as the first | ||
// member of the cluster | ||
func joinOrInitializeServer(s *influxdb.Server, u url.URL, joinURLs []url.URL) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that I see it I don't fully understand the comments in this function. "Create data node on an existing data node." -- what does this mean? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment is incorrect, will fix. |
||
// Create data node on an existing data node. | ||
for _, joinURL := range joinURLs { | ||
if err := s.Join(&u, &joinURL); err == influxdb.ErrDataNodeNotFound { | ||
|
@@ -676,12 +664,22 @@ func joinServer(s *influxdb.Server, u url.URL, joinURLs []url.URL) { | |
log.Printf("initialized data node: %s\n", (&u).String()) | ||
return | ||
} else if err != nil { | ||
// does not return so that the next joinURL can be tried | ||
log.Printf("join: failed to connect data node: %s: %s", (&u).String(), err) | ||
} else { | ||
log.Printf("join: connected data node to %s", u) | ||
return | ||
} | ||
} | ||
|
||
if len(joinURLs) == 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not strictly part of this, but are we missing a return at line 666 above? If so, can you add it to this PR? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We did introduce that in this PR, and yes, it should have a return. Will fix. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. Perhaps it's something about the race testing framework? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No. Should not return here because this is looping over each joinURL. If one fails, it will try the next one. If they all fail, L681 will run and fatal out. |
||
if err := s.Initialize(u); err != nil { | ||
log.Fatalf("server initialization error(2): %s", err) | ||
} | ||
log.Printf("initialized data node: %s\n", (&u).String()) | ||
return | ||
} | ||
|
||
log.Fatalf("join: failed to connect data node to any specified server") | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1015,8 +1015,38 @@ func (s *Server) applyDropDatabase(m *messaging.Message) (err error) { | |
// Remove from metastore. | ||
err = s.meta.mustUpdate(m.Index, func(tx *metatx) error { return tx.dropDatabase(c.Name) }) | ||
|
||
db := s.databases[c.Name] | ||
for _, rp := range db.policies { | ||
for _, sg := range rp.shardGroups { | ||
for _, sh := range sg.Shards { | ||
|
||
// if we have this shard locally, close and remove it | ||
if sh.store != nil { | ||
// close topic readers/heartbeaters/etc. connections | ||
err := s.client.CloseConn(sh.ID) | ||
if err != nil { | ||
panic(err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These shouldn't ever happen so I used panic so we can get a line number and more context if they did. Not sure how best to handle this though because we've dropped the database from the metastore but we're failing to clean up resources. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cool. Panic works for me. On Tuesday, April 21, 2015, Jason Wilder notifications@github.com wrote:
|
||
} | ||
|
||
err = sh.close() | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
err = os.Remove(s.shardPath(sh.ID)) | ||
if err != nil { | ||
panic(err) | ||
} | ||
} | ||
|
||
delete(s.shards, sh.ID) | ||
} | ||
} | ||
} | ||
|
||
// Delete the database entry. | ||
delete(s.databases, c.Name) | ||
|
||
return | ||
} | ||
|
||
|
@@ -3157,6 +3187,7 @@ func (s *Server) StartLocalMapper(rm *RemoteMapper) (*LocalMapper, error) { | |
selectFields: rm.SelectFields, | ||
selectTags: rm.SelectTags, | ||
interval: rm.Interval, | ||
tmin: rm.TMin, | ||
tmax: rm.TMax, | ||
limit: limit, | ||
} | ||
|
@@ -3517,6 +3548,7 @@ type MessagingClient interface { | |
|
||
// Conn returns an open, streaming connection to a topic. | ||
Conn(topicID uint64) MessagingConn | ||
CloseConn(topicID uint64) error | ||
} | ||
|
||
type messagingClient struct { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So why wasn't this condition correct? Why was also requiring `Index()` to be 0 wrong?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From what I understand, after you open your own meta store, you will read your ID from that; if your ID isn't 0, your index certainly shouldn't be either, as you have been part of a cluster from before.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I understand this issue correctly, the bug was that sometimes `s.ID()` was 0 but `s.Index()` was not, hence the bug?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. Index was non-zero.