mute consul debug messages #567

Merged
merged 7 commits into from
Dec 11, 2015
Changes from 4 commits
4 changes: 2 additions & 2 deletions client/alloc_runner_test.go
@@ -31,7 +31,7 @@ func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
conf.AllocDir = os.TempDir()
upd := &MockAllocStateUpdater{}
alloc := mock.Alloc()
consulClient, _ := NewConsulService(logger, "127.0.0.1:8500", "", "", false, false)
consulClient, _ := NewConsulService(logger, "127.0.0.1:8500", "", "", false, false, &structs.Node{})
if !restarts {
alloc.Job.Type = structs.JobTypeBatch
*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
@@ -142,7 +142,7 @@ func TestAllocRunner_SaveRestoreState(t *testing.T) {
}

// Create a new alloc runner
consulClient, err := NewConsulService(ar.logger, "127.0.0.1:8500", "", "", false, false)
consulClient, err := NewConsulService(ar.logger, "127.0.0.1:8500", "", "", false, false, &structs.Node{})
ar2 := NewAllocRunner(ar.logger, ar.config, upd.Update,
&structs.Allocation{ID: ar.alloc.ID}, consulClient)
err = ar2.RestoreState()
2 changes: 1 addition & 1 deletion client/client.go
@@ -157,7 +157,7 @@ func (c *Client) setupConsulService() error {
auth := c.config.Read("consul.auth")
enableSSL := c.config.ReadBoolDefault("consul.ssl", false)
verifySSL := c.config.ReadBoolDefault("consul.verifyssl", true)
if consulService, err = NewConsulService(c.logger, addr, token, auth, enableSSL, verifySSL); err != nil {
if consulService, err = NewConsulService(c.logger, addr, token, auth, enableSSL, verifySSL, c.config.Node); err != nil {
return err
}
c.consulService = consulService
26 changes: 17 additions & 9 deletions client/consul.go
@@ -72,6 +72,7 @@ type ConsulService struct {
client consulApi
logger *log.Logger
shutdownCh chan struct{}
node *structs.Node

trackedTasks map[string]*trackedTask
serviceStates map[string]string
@@ -80,7 +81,7 @@

// A factory method to create new consul service
func NewConsulService(logger *log.Logger, consulAddr string, token string,
auth string, enableSSL bool, verifySSL bool) (*ConsulService, error) {
auth string, enableSSL bool, verifySSL bool, node *structs.Node) (*ConsulService, error) {
var err error
var c *consul.Client
cfg := consul.DefaultConfig()
@@ -122,6 +123,7 @@ func NewConsulService(logger *log.Logger, consulAddr string, token string,
consulService := ConsulService{
client: &consulApiClient{client: c},
logger: logger,
node: node,
trackedTasks: make(map[string]*trackedTask),
serviceStates: make(map[string]string),
shutdownCh: make(chan struct{}),
Contributor Author:

@dadgar registerService already has a debug message in it.

@@ -161,7 +163,7 @@ func (c *ConsulService) Deregister(task *structs.Task, allocID string) error {
}
c.logger.Printf("[INFO] consul: deregistering service %v with consul", service.Name)
Contributor:

Do we want to mask this output as well? If not, I think I misunderstood why the others are being skipped.

Contributor Author:

@cbednarski This is printed only once, not continuously, so I think it's fine; it tells the user that we have removed the service and its corresponding checks. If the Consul agent is not available at that time, the service and checks will be removed once the agent comes back up, but we won't print the message repeatedly as we keep retrying during the sync loop.

Contributor:

You should move this after the if so that there is only one debug message: either a successful deregister or an error deregistering. The same applies to registering.
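For illustration, a minimal sketch of that shape against the loop in this diff (the success-path wording is my own, not from the PR):

if err := c.deregisterService(service.Id); err != nil {
	c.printLogMessage("[DEBUG] consul: error in deregistering service %v from consul: %v", service.Name, err)
	mErr.Errors = append(mErr.Errors, err)
} else {
	// Single message, emitted only after the deregister call actually went through.
	c.printLogMessage("[INFO] consul: deregistered service %v from consul", service.Name)
}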

if err := c.deregisterService(service.Id); err != nil {
c.logger.Printf("[DEBUG] consul: error in deregistering service %v from consul", service.Name)
c.printLogMessage("[DEBUG] consul: error in deregistering service %v from consul", service.Name)
mErr.Errors = append(mErr.Errors, err)
}
}
@@ -207,14 +209,14 @@ func (c *ConsulService) performSync() {
// Add new services which Consul agent isn't aware of
knownServices[service.Id] = struct{}{}
if _, ok := consulServices[service.Id]; !ok {
c.logger.Printf("[INFO] consul: registering service %s with consul.", service.Name)
c.printLogMessage("[INFO] consul: registering service %s with consul.", service.Name)
c.registerService(service, trackedTask.task, trackedTask.allocID)
continue
}

// If a service has changed, re-register it with Consul agent
if service.Hash() != c.serviceStates[service.Id] {
c.logger.Printf("[INFO] consul: reregistering service %s with consul.", service.Name)
c.printLogMessage("[INFO] consul: reregistering service %s with consul.", service.Name)
c.registerService(service, trackedTask.task, trackedTask.allocID)
continue
}
@@ -242,7 +244,7 @@ func (c *ConsulService) performSync() {
for _, consulService := range consulServices {
if _, ok := knownServices[consulService.ID]; !ok {
delete(c.serviceStates, consulService.ID)
c.logger.Printf("[INFO] consul: deregistering service %v with consul", consulService.Service)
c.printLogMessage("[INFO] consul: deregistering service %v with consul", consulService.Service)
c.deregisterService(consulService.ID)
}
}
@@ -273,13 +275,13 @@ func (c *ConsulService) registerService(service *structs.Service, task *structs.
}

if err := c.client.ServiceRegister(asr); err != nil {
c.logger.Printf("[DEBUG] consul: error while registering service %v with consul: %v", service.Name, err)
c.printLogMessage("[DEBUG] consul: error while registering service %v with consul: %v", service.Name, err)
mErr.Errors = append(mErr.Errors, err)
}
for _, check := range service.Checks {
cr := c.makeCheck(service, check, host, port)
if err := c.registerCheck(cr); err != nil {
c.logger.Printf("[DEBUG] consul: error while registerting check %v with consul: %v", check.Name, err)
c.printLogMessage("[DEBUG] consul: error while registerting check %v with consul: %v", check.Name, err)
mErr.Errors = append(mErr.Errors, err)
}

@@ -289,13 +291,13 @@

// registerCheck registers a check with Consul
func (c *ConsulService) registerCheck(check *consul.AgentCheckRegistration) error {
c.logger.Printf("[INFO] consul: registering Check with ID: %v for service: %v", check.ID, check.ServiceID)
c.printLogMessage("[INFO] consul: registering check with ID: %v for service: %v", check.ID, check.ServiceID)
return c.client.CheckRegister(check)
}

// deregisterCheck de-registers a check with a specific ID from Consul
func (c *ConsulService) deregisterCheck(checkID string) error {
c.logger.Printf("[INFO] consul: removing check with ID: %v", checkID)
c.printLogMessage("[INFO] consul: removing check with ID: %v", checkID)
return c.client.CheckDeregister(checkID)
}

@@ -336,3 +338,9 @@ func (c *ConsulService) makeCheck(service *structs.Service, check *structs.Servi
}
return cr
}

func (c *ConsulService) printLogMessage(message string, v ...interface{}) {
if _, ok := c.node.Attributes["consul.version"]; ok {
Contributor:

It seems weird that we're suppressing log messages when Consul is not present but still running through all of the register/deregister logic. I feel like we should have short-circuited or no-op'd much earlier. Can we skip the consul logic completely if Consul is not running?

Contributor Author:

@cbednarski If a user has defined a service block in their Task, then even if the Consul agent is not running we start tracking the service and keep trying to sync it with Consul. The current logic allows the Consul agent to be unavailable; when it comes back up again, we sync all the service and check definitions.

Contributor:

I still don't follow. There are two scenarios:

  1. Consul failed temporarily. Suppose Consul is initially missing but will show up: Nomad starts first, then Consul starts 20 seconds later. The second fingerprinter pass will enable the Consul functionality. From here on, any failure to sync a service with Consul is likely an operational problem, so we should show the logs.
  2. Consul is not running and never will be. In this case the fingerprinter detects Consul is not there, and in fact Consul never shows up. Why would we ever call NewConsulService at all, or why would it not immediately no-op? Showing an info log saying we're registering a service when we will in fact never register it is spurious, and the entire feature seems extraneous.

If we have to keep the service tracked permanently when the task starts, just in case Consul shows up some day, it seems like we should still be able to do this check much earlier and skip all of this additional complexity around filtering the logs and retrying something that's never going to succeed.

For example, in performSync, if we can't connect to Consul we can just skip the entire run, log an ERR message that we can't connect, and move on.
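For illustration, a rough sketch of that shortcut at the top of performSync (this assumes the wrapped client exposes the Services() lookup the sync already depends on; the exact message is illustrative):

// If the agent cannot even be queried, skip this sync pass entirely
// instead of attempting every register/deregister call.
consulServices, err := c.client.Services()
if err != nil {
	c.logger.Printf("[ERR] consul: unable to contact consul agent, skipping sync: %v", err)
	return
}
// ... proceed with the usual diff against consulServices ...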

Contributor:

I think if we just eat all the logs it solves this problem? Do you agree, @cbednarski? It then just becomes us building the internal state so that if Consul comes back we can do the diff and register/deregister the necessary services, and we'd never tell the user we are taking an action against Consul when we aren't.

Contributor:

The problem with suppressing the logs is that Nomad is actually doing a lot of work that suddenly becomes invisible, and we probably shouldn't be doing that work in the first place. Hiding the logs is a smell.

Member:

I agree with @cbednarski on this one. I think we should skip the entire register loop if Consul was fingerprinted as missing. It doesn't make much sense otherwise to loop over every task, service, and check, and attempt a REST call we know will fail anyway.
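A rough sketch of that gate, reusing the consul.version attribute this PR already consults in printLogMessage (placement and exact check are assumptions):

// Skip the whole sync pass when the fingerprinter has not detected a Consul agent.
if _, ok := c.node.Attributes["consul.version"]; !ok {
	return
}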

Contributor Author:

@ryanuber We discussed this more internally. The fingerprinter currently updates every 15s; we would continue to do precise registration and de-registration whenever a new service comes up or goes away, but we would not sync with Consul every 5s if we can't get the list of services and checks.

c.logger.Printf(message, v...)
}
}
2 changes: 1 addition & 1 deletion client/consul_test.go
@@ -46,7 +46,7 @@ func (a *mockConsulApiClient) Checks() (map[string]*consul.AgentCheck, error) {

func newConsulService() *ConsulService {
logger := log.New(os.Stdout, "logger: ", log.Lshortfile)
c, _ := NewConsulService(logger, "", "", "", false, false)
c, _ := NewConsulService(logger, "", "", "", false, false, &structs.Node{})
c.client = &mockConsulApiClient{}
return c
}
25 changes: 22 additions & 3 deletions client/fingerprint/consul.go
@@ -12,15 +12,21 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
)

const (
consulAvailable = "consulavailable"
consulUnavailable = "consulunavailable"
Member:

Can we just change these to available and unavailable? That way you could just compare against the last state and log a message saying something like "consul status changed to %s".
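Presumably something along these lines, as a sketch (err here is the error from the agent query in Fingerprint; the log wording is illustrative):

const (
	consulAvailable   = "available"
	consulUnavailable = "unavailable"
)

// Derive the new state once, then log only when it differs from the previous pass.
newState := consulUnavailable
if err == nil {
	newState = consulAvailable
}
if newState != f.lastState {
	f.logger.Printf("[INFO] fingerprint.consul: consul agent status changed to %s", newState)
}
f.lastState = newState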

)

// ConsulFingerprint is used to fingerprint the architecture
type ConsulFingerprint struct {
logger *log.Logger
client *consul.Client
logger *log.Logger
client *consul.Client
lastState string
}

// NewConsulFingerprint is used to create an OS fingerprint
func NewConsulFingerprint(logger *log.Logger) Fingerprint {
f := &ConsulFingerprint{logger: logger}
f := &ConsulFingerprint{logger: logger, lastState: consulUnavailable}
Contributor:

return &ConsulFingerprint{
	logger:    logger,
	lastState: consulUnavailable,
}

return f
}

@@ -55,6 +61,13 @@ func (f *ConsulFingerprint) Fingerprint(config *client.Config, node *structs.Nod
if err != nil {
// Clear any attributes set by a previous fingerprint.
f.clearConsulAttributes(node)

// Print a message indicating that the Consul Agent is not available
// anymore
if f.lastState == consulAvailable {
f.logger.Printf("[INFO] fingerprint.consul: consul agent is unavailable")
}
f.lastState = consulUnavailable
return false, nil
}

@@ -68,6 +81,12 @@ func (f *ConsulFingerprint) Fingerprint(config *client.Config, node *structs.Nod
node.Attributes["consul.datacenter"],
node.Attributes["consul.name"])

// If the Consul Agent was previously unavailable print a message to
// indicate the Agent is available now
if f.lastState == consulUnavailable {
f.logger.Printf("[INFO] fingerprt.consul: consul agent is available")
Member:

fingerprint.consul.

}
f.lastState = consulAvailable
return true, nil
}

4 changes: 2 additions & 2 deletions client/task_runner_test.go
@@ -32,7 +32,7 @@ func testTaskRunner(restarts bool) (*MockTaskStateUpdater, *TaskRunner) {
upd := &MockTaskStateUpdater{}
alloc := mock.Alloc()
task := alloc.Job.TaskGroups[0].Tasks[0]
consulClient, _ := NewConsulService(logger, "127.0.0.1:8500", "", "", false, false)
consulClient, _ := NewConsulService(logger, "127.0.0.1:8500", "", "", false, false, &structs.Node{})
// Initialize the port listing. This should be done by the offer process but
// we have a mock so that doesn't happen.
task.Resources.Networks[0].ReservedPorts = []structs.Port{{"", 80}}
@@ -164,7 +164,7 @@ func TestTaskRunner_SaveRestoreState(t *testing.T) {
}

// Create a new task runner
consulClient, _ := NewConsulService(tr.logger, "127.0.0.1:8500", "", "", false, false)
consulClient, _ := NewConsulService(tr.logger, "127.0.0.1:8500", "", "", false, false, &structs.Node{})
tr2 := NewTaskRunner(tr.logger, tr.config, upd.Update,
tr.ctx, tr.allocID, &structs.Task{Name: tr.task.Name}, tr.state, tr.restartTracker,
consulClient)