Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

W-16726901: Add delete-invalid-parant-leaf-node.py Python script #1014

Merged
merged 2 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion carbonj.service/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ COPY ${DEPENDENCY}/entrypoint.sh ${DEPENDENCY}/onOutOfMemoryError.sh ${DEPENDENC
${DEPENDENCY}/deletemetrics.py ${DEPENDENCY}/disklog.sh ${DEPENDENCY}/fdlog.sh ${DEPENDENCY}/iolog.sh \
${DEPENDENCY}/reportGcMetrics.sh ${DEPENDENCY}/reportRocksDbMetrics.sh ${DEPENDENCY}/requestlog-stats.pl \
${DEPENDENCY}/whisper.py ${DEPENDENCY}/cj-load.py ${DEPENDENCY}/check-invalid-namespaces.py \
${DEPENDENCY}/delete-invalid-namespaces.py /app/bin/
${DEPENDENCY}/delete-invalid-namespaces.py ${DEPENDENCY}/delete-invalid-parent-leaf-node.py /app/bin/
RUN chmod ugo+x /app/bin/*
# add java options file
# default configs
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
#
# Copyright (c) 2018, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
#


import os
import platform
import logging
import subprocess
import time
import requests
import argparse
import socket


logger = logging.getLogger()


def setup_logging():
logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
console = logging.StreamHandler()
formatter = logging.Formatter('[%(levelname)s] %(asctime)s - %(filename)s:%(lineno)s - %(message)s')
console.setFormatter(formatter)
logger.addHandler(console)


def parse_args():
parser = argparse.ArgumentParser(description='Delete invalid parent leaf nodes')
parser.add_argument('--dry-run', dest='dry_run', help='Whether to dryrun', action='store_true')
return parser.parse_args()


setup_logging()


INVALID_PARENT_LEAF_NODE_ERROR_FILE = '/data/invalid_parent_leaf_nodes_error'
PROCESSED_INVALID_PARENT_LEAF_NODE_FILE = '/data/processed_invalid_parent_leaf_nodes'
NOT_PROCESSED_INVALID_PARENT_LEAF_NODE_FILE = '/data/not_processed_invalid_parent_leaf_nodes'


def run_command(command):
try:
subprocess.run(command, shell=True, check=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
logger.error(f"Error: {e}")


def run_command_with_output(command, retry=1):
count = 1
while count <= retry:
count += 1
try:
line = subprocess.check_output(command, text=True, stderr=subprocess.STDOUT)
return line.strip()
except subprocess.CalledProcessError as e:
logger.error(f"Error: {e}")
time.sleep(1)
return ''


def check_id_in_db(db_name, key):
command = ['/root/ldb', 'scan', f"--db=/data/carbonj-data/{db_name}", f"--from={key}", '--max_keys=1', '--hex', '--no_value']
logger.info('Running command: %s', command)
line = run_command_with_output(command, 3)
if line == '':
return True
if len(line) < len(key):
return False
real_key = line[:len(key)]
if real_key != key:
logger.info('Mismatched real key %s', real_key)
return False
return True


class InvalidParentLeafNodeCleaner:
def __init__(self, args):
self.dry_run = args.dry_run

def check_aws_client(self):
if os.path.exists("/usr/local/bin/aws"):
return
logger.info("Download AWS client ...")
command = [f'curl https://awscli.amazonaws.com/awscli-exe-linux-{platform.machine()}.zip -o ./awscliv2.zip']
run_command(command)
logger.info("Install AWS client ...")
command = ['unzip -q ./awscliv2.zip && ./aws/install']
run_command(command)
command = ['rm -rf ./aws ./awscliv2.zip']
run_command(command)

def check_ldb_tool(self):
if os.path.exists("/root/ldb"):
return
self.check_aws_client()
logger.info("Download ldb tool ...")
command = ['aws s3 cp s3://carbonj-tools-bucket/ldb /root/ldb && chmod +x /root/ldb']
run_command(command)

def process_invalid_parent_leaf_nodes(self, id, name, shard, replica):
logger.info(f"Processing invalid parent leaf node {id} with name {name}")
hex_id = f'0x{int(id):016X}'
for db_name in ['30m2y', '5m7d', '60s24h']:
if check_id_in_db(db_name, hex_id):
return False
if self.dry_run:
url = f"http://carbonj-{shard}-{replica}.carbonj-{shard}.carbonj.svc.cluster.local:2001/_dw/rest/carbonj/metrics/deleteAPI/{name}?exclude=&delete=false"
else:
url = f"http://carbonj-{shard}-{replica}.carbonj-{shard}.carbonj.svc.cluster.local:2001/_dw/rest/carbonj/metrics/deleteAPI/{name}?exclude=&delete=true"
logger.info(f"Deleting invalid parent leaf node {name} with url {url}")
response = requests.delete(url)
if response.status_code == 200:
logger.info(response.json())
return True
else:
logger.error(f"Failed to delete namespace {name} with HTTP Status code {response.status_code}")
return False


def dump_invalid_parent_leaf_nodes(self):
self.check_ldb_tool()
command = [f'grep ERROR /app/log/carbonj.log* | grep "Cannot create metric with name" > {INVALID_PARENT_LEAF_NODE_ERROR_FILE}']
run_command(command)
processed_invalid_parent_leaf_nodes = []
if os.path.exists(PROCESSED_INVALID_PARENT_LEAF_NODE_FILE):
with open(PROCESSED_INVALID_PARENT_LEAF_NODE_FILE, 'r') as f:
while True:
line = f.readline()
if not line:
break
space = line.find(' ')
processed_invalid_parent_leaf_nodes.append(line[:space])

processing_invalid_parent_leaf_nodes = {}
with open(INVALID_PARENT_LEAF_NODE_ERROR_FILE, 'r') as f:
while True:
line = f.readline()
if not line:
break
# /app/log/carbonj.log.2024-10-13.log:2024-10-13 23:59:49,456 ERROR c.d.c.s.d.i.MetricIndexImpl [TimeSeriesStore.SerialTaskPool 0]
# Cannot create metric with name [pod186.ecom.bgfq.bgfq_prd.blade3-6.bgfq_prd.healthmgr.threadCount.storefront]
# because [pod186.ecom.bgfq.bgfq_prd.blade3-6.bgfq_prd.healthmgr.threadCount] is already a leaf with ID [7353210610]
left_bracket = line.rindex('[')
right_bracket = line.rindex(']')
id = line[left_bracket+1:right_bracket]
if id in processed_invalid_parent_leaf_nodes or id in processing_invalid_parent_leaf_nodes:
continue
left_bracket = line.rindex('[', 0, left_bracket)
right_bracket = line.rindex(']', 0, right_bracket)
name = line[left_bracket+1:right_bracket]
processing_invalid_parent_leaf_nodes[id] = name

hostname = socket.gethostname()
parts = hostname.split('-')

with open(PROCESSED_INVALID_PARENT_LEAF_NODE_FILE, 'a') as f, open(NOT_PROCESSED_INVALID_PARENT_LEAF_NODE_FILE, 'a') as nf:
for id in processing_invalid_parent_leaf_nodes:
if self.process_invalid_parent_leaf_nodes(id, processing_invalid_parent_leaf_nodes[id], parts[1], parts[2]):
f.write(f'{id} {processing_invalid_parent_leaf_nodes[id]}\n')
else:
nf.write(f'{id} {processing_invalid_parent_leaf_nodes[id]}\n')


class Main:
def __init__(self):
return

def execute(self):
args = parse_args()
cleaner = InvalidParentLeafNodeCleaner(args)
cleaner.dump_invalid_parent_leaf_nodes()


if __name__ == "__main__":
Main().execute()
Loading