Skip to content

Commit

Permalink
Import flow failure related log messages should be more specific abou…
Browse files Browse the repository at this point in the history
…t what went wrong

tendrl-bug-id: Tendrl#1080
bugzilla: 1688630

Signed-off-by: GowthamShanmugasundaram <gshanmug@redhat.com>
  • Loading branch information
GowthamShanmugam committed Mar 27, 2019
1 parent 7fcbe3b commit 5dfe801
Show file tree
Hide file tree
Showing 9 changed files with 266 additions and 47 deletions.
6 changes: 3 additions & 3 deletions tendrl/commons/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def run(self):
_job_sync_interval = 5
NS.node_context = NS.node_context.load()
NS.tendrl_context = NS.tendrl_context.load()
if "tendrl/monitor" not in NS.node_context.tags:
if "tendrl/monitor" not in NS.node_context.tags and \
"tendrl/integration/monitoring" not in NS.node_context.tags:
if NS.tendrl_context.integration_id is None or \
NS.node_context.fqdn is None:
time.sleep(_job_sync_interval)
Expand Down Expand Up @@ -144,7 +145,6 @@ def process_job(jid):

_now_plus_10_epoch = (_now_plus_10 -
_epoch_start).total_seconds()
time.sleep(7)
job = job.load()
if job.status == "new":
# To avoid server and storage node do save same time
Expand Down Expand Up @@ -209,7 +209,7 @@ def process_job(jid):
obj_name, flow_name)
else:
runnable_flow = current_ns.ns.get_flow(flow_name)

time.sleep(3)
job = job.load()
lock_info = dict(node_id=NS.node_context.node_id,
fqdn=NS.node_context.fqdn,
Expand Down
46 changes: 35 additions & 11 deletions tendrl/commons/objects/cluster/atoms/import_cluster/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def run(self):
# create same flow for each node in node list except
# $this
payload = {"tags": ["tendrl/node_%s" % node],
"node_id": node,
"run": "tendrl.flows.ImportCluster",
"status": "new",
"parameters": new_params,
Expand Down Expand Up @@ -115,7 +116,8 @@ def run(self):
logger.log(
"error",
NS.publisher_id,
{"message": "Failed to detect underlying cluster version"},
{"message": "Failed to detect underlying cluster "
"version. Error: %s" % err},
job_id=self.parameters['job_id'],
flow_id=self.parameters['flow_id']
)
Expand Down Expand Up @@ -220,15 +222,6 @@ def run(self):
job_id=self.parameters['job_id']
).load()
if loop_count >= wait_count:
logger.log(
"error",
NS.publisher_id,
{"message": "Import jobs on cluster(%s) not yet "
"complete on all nodes(%s). Timing out." %
(_cluster.short_name, str(node_list))},
job_id=self.parameters['job_id'],
flow_id=self.parameters['flow_id']
)
# Marking child jobs as failed which did not complete
# as the parent job has timed out. This has to be done
# explicitly because these jobs will still be processed
Expand All @@ -239,10 +232,41 @@ def run(self):
job_id=child_job_id
).load()
if child_job.status not in ["finished", "failed"]:
if child_job.status in ["new", ""]:
node_id = child_job.payload.get(
"node_id", ""
)
node_obj = NS.tendrl.objects.NodeContext(
node_id = node_id
).load()
logger.log(
"error",
NS.publisher_id,
{"message": "Import child job %s is "
"not yet picked by %s, Either node is"
" down or tendrl-node-agent service "
"is not running" % (
child_job.job_id,
node_obj.fqdn
)},
job_id=self.parameters['job_id'],
flow_id=self.parameters['flow_id']
)
child_job.status = "failed"
child_job.save()

logger.log(
"error",
NS.publisher_id,
{"message": "Import jobs on cluster(%s) not yet "
"complete on all nodes(%s). Timing out." %
(_cluster.short_name, str(node_list))},
job_id=self.parameters['job_id'],
flow_id=self.parameters['flow_id']
)

return False
time.sleep(10)
time.sleep(5)
completed = True
for child_job_id in parent_job.children:
child_job = NS.tendrl.objects.Job(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,31 @@ def run(self):
wait_count = 24
while True:
child_job_failed = False
job = Job(job_id=_job_id).load()
if loop_count >= wait_count:
if job.status in ["new", ""]:
msg = ("Child job %s for setting up cluster alias not yet "
"picked up by the server, Service "
"tendrl-monitoring-integration may be down. "
"Timing out. (%s)" % (
job.job_id, integration_id
))
else:
msg = ("Child job %s for setting up cluster alias not yet"
"compeleted by the server. Timing out. (%s)" % (
job.job_id, integration_id))
logger.log(
"error",
NS.publisher_id,
{
"message": "Setting up cluster alias"
"not yet complete. Timing out. (%s)" %
integration_id
"message": msg
},
job_id=self.parameters['job_id'],
flow_id=self.parameters['flow_id'],
)
return False
time.sleep(5)
finished = True
job = Job(job_id=_job_id).load()
if job.status not in ["finished", "failed"]:
finished = False
elif job.status == "failed":
Expand Down
14 changes: 14 additions & 0 deletions tendrl/commons/objects/definition/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ namespace.tendrl:
optional:
- Cluster.short_name
pre_run:
- tendrl.objects.Node.atoms.CheckServiceStatus
- tendrl.objects.Cluster.atoms.CheckClusterNodesUp
- tendrl.objects.Node.atoms.IsNodeTendrlManaged
- tendrl.objects.Cluster.atoms.ValidImportClusterParams
Expand All @@ -83,6 +84,7 @@ namespace.tendrl:
UnmanageCluster:
tags:
- "tendrl/monitor"
- "tendrl/integration/monitoring"
atoms:
- tendrl.objects.Cluster.atoms.SetClusterUnmanaged
- tendrl.objects.Cluster.atoms.StopMonitoringServices
Expand All @@ -97,6 +99,7 @@ namespace.tendrl:
optional:
- Cluster.delete_telemetry_data
pre_run:
- tendrl.objects.Node.atoms.CheckServiceStatus
- tendrl.objects.Cluster.atoms.CheckClusterNodesUp
post_run:
- tendrl.objects.Cluster.atoms.IsClusterImportReady
Expand Down Expand Up @@ -378,12 +381,16 @@ namespace.tendrl:
type: Create
uuid: eda0b13a-7362-48d5-b5ca-4b6d6533a5ab
attrs:
node_id:
type: String
running:
type: String
exists:
type: String
service:
type: String
error:
type: List
enabled: true
list: nodes/$NodeContext.node_id/Services
help: "Service"
Expand Down Expand Up @@ -938,6 +945,13 @@ namespace.tendrl:
run: tendrl.objects.Node.atoms.Cmd
type: Create
uuid: dc8fff3a-34d9-4786-9282-55eff6abb6c3
CheckServiceStatus:
help: Check all necessary services are running to import a cluster
enabled: true
run: tendrl.objects.Node.atoms.CheckServiceStatus
type: check
uuid: aec1d8b6-0689-4b14-a7cb-d085a0e1d10c
version: 1
attrs:
node_id:
type: String
Expand Down
142 changes: 142 additions & 0 deletions tendrl/commons/objects/node/atoms/check_service_status/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import etcd

from tendrl.commons import objects
from tendrl.commons.objects import AtomExecutionFailedError
from tendrl.commons.utils import etcd_utils
from tendrl.commons.utils import log_utils as logger


# Names of the services checked by CheckServiceStatus, keyed by node role:
# "server" is used for nodes tagged as the monitoring/server node and
# "storage_node" for every other node.
TENDRL_SERVICES = dict(
    server=[
        "tendrl-node-agent",
        "tendrl-monitoring-integration",
        "tendrl-api",
    ],
    storage_node=[
        "tendrl-node-agent",
        "glusterd",
    ],
)


class CheckServiceStatus(objects.BaseAtom):
    """Atom that checks the services a flow depends on are running.

    On a node tagged ``tendrl/monitor`` or
    ``tendrl/integration/monitoring`` the services listed under
    ``TENDRL_SERVICES["server"]`` are checked. On any other node the
    ``TENDRL_SERVICES["storage_node"]`` services are checked and, when the
    job targets more than one node, the recorded state of
    tendrl-monitoring-integration on the server is checked as well.

    Every problem found is logged as an error against the job and flow
    given in ``self.parameters``.
    """

    # Message templates shared by all checks so the wording stays identical.
    _STATUS_UNKNOWN_MSG = ("Unable to find status of the service %s "
                           "on %s. Error: %s")
    _NOT_RUNNING_MSG = ("Service %s is not running on %s, Please "
                        "start it manually or check the log file to "
                        "figure out the exact problem")
    _MONITORING_SERVICE = "tendrl-monitoring-integration"

    def __init__(self, *args, **kwargs):
        super(CheckServiceStatus, self).__init__(*args, **kwargs)

    def run(self):
        """Run the service checks for the current node.

        Returns:
            True when every checked service is running, False otherwise.

        Raises:
            KeyError: on a non-server node when ``self.parameters`` has no
                ``'Node[]'`` entry.
        """
        node_context = NS.tendrl.objects.NodeContext().load()
        tags = list(node_context.tags)
        if "tendrl/monitor" in tags or \
                "tendrl/integration/monitoring" in tags:
            # check necessary service status in server
            return self._check_local_services(
                TENDRL_SERVICES["server"], on_server=True
            )

        # check necessary service status in storage nodes
        service_status = self._check_local_services(
            TENDRL_SERVICES["storage_node"], on_server=False
        )
        # NOTE(review): the remote check is kept on the storage-node path
        # only; flows that run this atom on the server do not appear to
        # carry a 'Node[]' parameter -- confirm against the flow definitions.
        node_list = self.parameters['Node[]']
        if service_status and len(node_list) > 1:
            # check monitoring integration is running on server
            service_status = self._check_monitoring_integration()
        return service_status

    def _check_local_services(self, service_names, on_server):
        """Check each named service; log and return False if any is down."""
        all_running = True
        for service_name in service_names:
            service = get_service_status(service_name)
            if service.running:
                continue
            if on_server:
                unknown_host, down_host = "server-node", "a server-node"
            else:
                # Looked up only on failure, as the message needs it.
                unknown_host = down_host = NS.node_context.fqdn
            self._log_error(
                self._failure_message(
                    service_name, service.error, unknown_host, down_host
                )
            )
            all_running = False
        return all_running

    def _check_monitoring_integration(self):
        """Check the stored state of tendrl-monitoring-integration."""
        try:
            node_arr = etcd_utils.read(
                "/indexes/tags/tendrl/integration/monitoring"
            ).value
            # NOTE(review): eval() executes whatever string is stored under
            # this etcd key. If the value is always a plain list literal,
            # ast.literal_eval or json.loads would be a safer parser --
            # confirm the stored format before switching.
            node_id = eval(node_arr)[0]
            service = NS.tendrl.objects.Service(
                service=self._MONITORING_SERVICE,
                node_id=node_id
            ).load()
        except (etcd.EtcdKeyNotFound, IndexError):
            # No node is indexed under the monitoring-integration tag.
            self._log_error(
                "Service tendrl-monitoring-integration is "
                "not running in a server, Please start it "
                "manually or check the log message to "
                "figure out the exact problem"
            )
            return False

        if service.running:
            return True
        # Normalise to a real list so the message renders like a list;
        # ``or []`` tolerates an unset error attribute.
        errors = list(service.error or [])
        self._log_error(
            self._failure_message(
                self._MONITORING_SERVICE, errors,
                "server-node", "a server-node"
            )
        )
        return False

    def _failure_message(self, service_name, error, unknown_host, down_host):
        """Build the log text for a service that is not reported running."""
        if error:
            return self._STATUS_UNKNOWN_MSG % (
                service_name, unknown_host, error
            )
        return self._NOT_RUNNING_MSG % (service_name, down_host)

    def _log_error(self, msg):
        """Log *msg* as an error against the current job and flow."""
        logger.log(
            "error",
            NS.get("publisher_id", None),
            {
                "message": msg
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )


def get_service_status(service_name):
    """Return the ``Service`` object for *service_name*.

    The object is constructed with only the service name and returned
    without calling ``load()``.
    NOTE(review): presumably the ``Service`` constructor itself determines
    the local service state -- confirm against the ``Service`` class.
    """
    return NS.tendrl.objects.Service(service=service_name)
Loading

0 comments on commit 5dfe801

Please sign in to comment.